diff --git "a/checkpoint-48000/trainer_state.json" "b/checkpoint-48000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-48000/trainer_state.json" @@ -0,0 +1,336033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.48, + "eval_steps": 500, + "global_step": 48000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1e-05, + "grad_norm": 1.631775788945859, + "learning_rate": 3e-06, + "loss": 10.8658, + "step": 1 + }, + { + "epoch": 2e-05, + "grad_norm": 1.6173222705173065, + "learning_rate": 6e-06, + "loss": 10.8645, + "step": 2 + }, + { + "epoch": 3e-05, + "grad_norm": 1.6387507243802044, + "learning_rate": 9e-06, + "loss": 10.8646, + "step": 3 + }, + { + "epoch": 4e-05, + "grad_norm": 1.5975767011448438, + "learning_rate": 1.2e-05, + "loss": 10.8638, + "step": 4 + }, + { + "epoch": 5e-05, + "grad_norm": 1.6454022013221787, + "learning_rate": 1.5e-05, + "loss": 10.8605, + "step": 5 + }, + { + "epoch": 6e-05, + "grad_norm": 1.6407987097684302, + "learning_rate": 1.8e-05, + "loss": 10.8581, + "step": 6 + }, + { + "epoch": 7e-05, + "grad_norm": 1.609586764602888, + "learning_rate": 2.1000000000000002e-05, + "loss": 10.8444, + "step": 7 + }, + { + "epoch": 8e-05, + "grad_norm": 1.4683048428970586, + "learning_rate": 2.4e-05, + "loss": 10.8173, + "step": 8 + }, + { + "epoch": 9e-05, + "grad_norm": 1.3933728304700357, + "learning_rate": 2.7e-05, + "loss": 10.8102, + "step": 9 + }, + { + "epoch": 0.0001, + "grad_norm": 1.3326098319804733, + "learning_rate": 3e-05, + "loss": 10.7958, + "step": 10 + }, + { + "epoch": 0.00011, + "grad_norm": 1.217237966643813, + "learning_rate": 3.2999999999999996e-05, + "loss": 10.779, + "step": 11 + }, + { + "epoch": 0.00012, + "grad_norm": 1.1764691634458027, + "learning_rate": 3.6e-05, + "loss": 10.7677, + "step": 12 + }, + { + "epoch": 0.00013, + "grad_norm": 1.1304717895604097, + "learning_rate": 3.9e-05, + "loss": 10.7486, + "step": 13 + }, + { + "epoch": 0.00014, + "grad_norm": 1.115888694899127, + "learning_rate": 4.2000000000000004e-05, + "loss": 10.7378, + "step": 14 + }, + { + "epoch": 0.00015, + "grad_norm": 1.1065220670447153, + "learning_rate": 4.4999999999999996e-05, + "loss": 10.7288, + "step": 15 + }, + { + "epoch": 0.00016, + "grad_norm": 1.075226769828573, + "learning_rate": 4.8e-05, + "loss": 10.7115, + "step": 16 + }, + { + "epoch": 0.00017, + "grad_norm": 1.0461893681391048, + "learning_rate": 5.1000000000000006e-05, + "loss": 10.6956, + "step": 17 + }, + { + "epoch": 0.00018, + "grad_norm": 1.0230892582928048, + "learning_rate": 5.4e-05, + "loss": 10.6795, + "step": 18 + }, + { + "epoch": 0.00019, + "grad_norm": 0.9869910790408369, + "learning_rate": 5.7e-05, + "loss": 10.6637, + "step": 19 + }, + { + "epoch": 0.0002, + "grad_norm": 0.9729421058544264, + "learning_rate": 6e-05, + "loss": 10.6515, + "step": 20 + }, + { + "epoch": 0.00021, + "grad_norm": 0.943988636941647, + "learning_rate": 6.3e-05, + "loss": 10.6378, + "step": 21 + }, + { + "epoch": 0.00022, + "grad_norm": 0.9278112602221015, + "learning_rate": 6.599999999999999e-05, + "loss": 10.6233, + "step": 22 + }, + { + "epoch": 0.00023, + "grad_norm": 0.920877936489298, + "learning_rate": 6.9e-05, + "loss": 10.6091, + "step": 23 + }, + { + "epoch": 0.00024, + "grad_norm": 0.9178015901962371, + "learning_rate": 7.2e-05, + "loss": 10.5981, + "step": 24 + }, + { + "epoch": 0.00025, + "grad_norm": 0.9193853191136445, + "learning_rate": 7.500000000000001e-05, + "loss": 10.5835, + "step": 25 + }, + { + "epoch": 0.00026, + "grad_norm": 0.9191743831022944, + "learning_rate": 7.8e-05, + "loss": 10.5705, + "step": 26 + }, + { + "epoch": 0.00027, + "grad_norm": 0.9136913401152261, + "learning_rate": 8.1e-05, + "loss": 10.5585, + "step": 27 + }, + { + "epoch": 0.00028, + "grad_norm": 0.9119853724531574, + "learning_rate": 8.400000000000001e-05, + "loss": 10.5456, + "step": 28 + }, + { + "epoch": 0.00029, + "grad_norm": 0.9130829908837624, + "learning_rate": 8.7e-05, + "loss": 10.5312, + "step": 29 + }, + { + "epoch": 0.0003, + "grad_norm": 0.9186726006674357, + "learning_rate": 8.999999999999999e-05, + "loss": 10.5159, + "step": 30 + }, + { + "epoch": 0.00031, + "grad_norm": 0.9155120967133267, + "learning_rate": 9.3e-05, + "loss": 10.5012, + "step": 31 + }, + { + "epoch": 0.00032, + "grad_norm": 0.9096055134642034, + "learning_rate": 9.6e-05, + "loss": 10.4871, + "step": 32 + }, + { + "epoch": 0.00033, + "grad_norm": 0.91013596598753, + "learning_rate": 9.900000000000001e-05, + "loss": 10.4706, + "step": 33 + }, + { + "epoch": 0.00034, + "grad_norm": 0.9103576711685224, + "learning_rate": 0.00010200000000000001, + "loss": 10.4543, + "step": 34 + }, + { + "epoch": 0.00035, + "grad_norm": 0.9164612613814794, + "learning_rate": 0.00010500000000000002, + "loss": 10.4377, + "step": 35 + }, + { + "epoch": 0.00036, + "grad_norm": 0.9129652960092816, + "learning_rate": 0.000108, + "loss": 10.4202, + "step": 36 + }, + { + "epoch": 0.00037, + "grad_norm": 0.9029365420083331, + "learning_rate": 0.000111, + "loss": 10.4036, + "step": 37 + }, + { + "epoch": 0.00038, + "grad_norm": 0.9075151789728013, + "learning_rate": 0.000114, + "loss": 10.3847, + "step": 38 + }, + { + "epoch": 0.00039, + "grad_norm": 0.9102185085970206, + "learning_rate": 0.000117, + "loss": 10.3654, + "step": 39 + }, + { + "epoch": 0.0004, + "grad_norm": 0.9144897951886523, + "learning_rate": 0.00012, + "loss": 10.3432, + "step": 40 + }, + { + "epoch": 0.00041, + "grad_norm": 0.9044769084629607, + "learning_rate": 0.000123, + "loss": 10.3253, + "step": 41 + }, + { + "epoch": 0.00042, + "grad_norm": 0.9101249031053068, + "learning_rate": 0.000126, + "loss": 10.3047, + "step": 42 + }, + { + "epoch": 0.00043, + "grad_norm": 0.9147428510616606, + "learning_rate": 0.000129, + "loss": 10.282, + "step": 43 + }, + { + "epoch": 0.00044, + "grad_norm": 0.9138907921510238, + "learning_rate": 0.00013199999999999998, + "loss": 10.2606, + "step": 44 + }, + { + "epoch": 0.00045, + "grad_norm": 0.9165726299867081, + "learning_rate": 0.000135, + "loss": 10.2377, + "step": 45 + }, + { + "epoch": 0.00046, + "grad_norm": 0.906013196308877, + "learning_rate": 0.000138, + "loss": 10.216, + "step": 46 + }, + { + "epoch": 0.00047, + "grad_norm": 0.9116233570839986, + "learning_rate": 0.000141, + "loss": 10.1904, + "step": 47 + }, + { + "epoch": 0.00048, + "grad_norm": 0.9086131145608887, + "learning_rate": 0.000144, + "loss": 10.1674, + "step": 48 + }, + { + "epoch": 0.00049, + "grad_norm": 0.9094213201037699, + "learning_rate": 0.000147, + "loss": 10.1435, + "step": 49 + }, + { + "epoch": 0.0005, + "grad_norm": 0.9128293203892458, + "learning_rate": 0.00015000000000000001, + "loss": 10.1173, + "step": 50 + }, + { + "epoch": 0.00051, + "grad_norm": 0.9140261923858894, + "learning_rate": 0.000153, + "loss": 10.0916, + "step": 51 + }, + { + "epoch": 0.00052, + "grad_norm": 0.9121253786029146, + "learning_rate": 0.000156, + "loss": 10.0657, + "step": 52 + }, + { + "epoch": 0.00053, + "grad_norm": 0.9034950978450355, + "learning_rate": 0.000159, + "loss": 10.0413, + "step": 53 + }, + { + "epoch": 0.00054, + "grad_norm": 0.9241206201798855, + "learning_rate": 0.000162, + "loss": 10.0092, + "step": 54 + }, + { + "epoch": 0.00055, + "grad_norm": 0.9149154841379399, + "learning_rate": 0.000165, + "loss": 9.9852, + "step": 55 + }, + { + "epoch": 0.00056, + "grad_norm": 0.9070429364957124, + "learning_rate": 0.00016800000000000002, + "loss": 9.9561, + "step": 56 + }, + { + "epoch": 0.00057, + "grad_norm": 0.9122859536360214, + "learning_rate": 0.000171, + "loss": 9.9285, + "step": 57 + }, + { + "epoch": 0.00058, + "grad_norm": 0.9101615769667563, + "learning_rate": 0.000174, + "loss": 9.9038, + "step": 58 + }, + { + "epoch": 0.00059, + "grad_norm": 0.91462118557662, + "learning_rate": 0.000177, + "loss": 9.872, + "step": 59 + }, + { + "epoch": 0.0006, + "grad_norm": 0.902238261354272, + "learning_rate": 0.00017999999999999998, + "loss": 9.846, + "step": 60 + }, + { + "epoch": 0.00061, + "grad_norm": 0.9010263217287604, + "learning_rate": 0.000183, + "loss": 9.8177, + "step": 61 + }, + { + "epoch": 0.00062, + "grad_norm": 0.9073977759483284, + "learning_rate": 0.000186, + "loss": 9.7857, + "step": 62 + }, + { + "epoch": 0.00063, + "grad_norm": 0.9002035349554564, + "learning_rate": 0.000189, + "loss": 9.7585, + "step": 63 + }, + { + "epoch": 0.00064, + "grad_norm": 0.8964472690991813, + "learning_rate": 0.000192, + "loss": 9.7283, + "step": 64 + }, + { + "epoch": 0.00065, + "grad_norm": 0.9038015098943822, + "learning_rate": 0.00019500000000000002, + "loss": 9.696, + "step": 65 + }, + { + "epoch": 0.00066, + "grad_norm": 0.8969444374465311, + "learning_rate": 0.00019800000000000002, + "loss": 9.6719, + "step": 66 + }, + { + "epoch": 0.00067, + "grad_norm": 0.898249772566312, + "learning_rate": 0.000201, + "loss": 9.642, + "step": 67 + }, + { + "epoch": 0.00068, + "grad_norm": 0.9048900549589218, + "learning_rate": 0.00020400000000000003, + "loss": 9.6091, + "step": 68 + }, + { + "epoch": 0.00069, + "grad_norm": 0.8901553719298861, + "learning_rate": 0.00020700000000000002, + "loss": 9.5816, + "step": 69 + }, + { + "epoch": 0.0007, + "grad_norm": 0.903386863411625, + "learning_rate": 0.00021000000000000004, + "loss": 9.5447, + "step": 70 + }, + { + "epoch": 0.00071, + "grad_norm": 0.8933303377898588, + "learning_rate": 0.00021299999999999997, + "loss": 9.518, + "step": 71 + }, + { + "epoch": 0.00072, + "grad_norm": 0.8969213907091691, + "learning_rate": 0.000216, + "loss": 9.4906, + "step": 72 + }, + { + "epoch": 0.00073, + "grad_norm": 0.8960518018191052, + "learning_rate": 0.00021899999999999998, + "loss": 9.4566, + "step": 73 + }, + { + "epoch": 0.00074, + "grad_norm": 0.8964935065687102, + "learning_rate": 0.000222, + "loss": 9.4297, + "step": 74 + }, + { + "epoch": 0.00075, + "grad_norm": 0.8917043311520961, + "learning_rate": 0.000225, + "loss": 9.3947, + "step": 75 + }, + { + "epoch": 0.00076, + "grad_norm": 0.8997723193205578, + "learning_rate": 0.000228, + "loss": 9.3676, + "step": 76 + }, + { + "epoch": 0.00077, + "grad_norm": 0.8937482133829812, + "learning_rate": 0.000231, + "loss": 9.3393, + "step": 77 + }, + { + "epoch": 0.00078, + "grad_norm": 0.886737139439046, + "learning_rate": 0.000234, + "loss": 9.3091, + "step": 78 + }, + { + "epoch": 0.00079, + "grad_norm": 0.8895258541637481, + "learning_rate": 0.00023700000000000001, + "loss": 9.2733, + "step": 79 + }, + { + "epoch": 0.0008, + "grad_norm": 0.8909958989870267, + "learning_rate": 0.00024, + "loss": 9.2384, + "step": 80 + }, + { + "epoch": 0.00081, + "grad_norm": 0.8966003836256963, + "learning_rate": 0.00024300000000000002, + "loss": 9.2045, + "step": 81 + }, + { + "epoch": 0.00082, + "grad_norm": 0.9055894629552318, + "learning_rate": 0.000246, + "loss": 9.1791, + "step": 82 + }, + { + "epoch": 0.00083, + "grad_norm": 0.8961362604582432, + "learning_rate": 0.00024900000000000004, + "loss": 9.1431, + "step": 83 + }, + { + "epoch": 0.00084, + "grad_norm": 0.8980147614185676, + "learning_rate": 0.000252, + "loss": 9.1116, + "step": 84 + }, + { + "epoch": 0.00085, + "grad_norm": 0.8928314575907911, + "learning_rate": 0.000255, + "loss": 9.0884, + "step": 85 + }, + { + "epoch": 0.00086, + "grad_norm": 0.8950046881177521, + "learning_rate": 0.000258, + "loss": 9.0518, + "step": 86 + }, + { + "epoch": 0.00087, + "grad_norm": 0.8877891352831231, + "learning_rate": 0.000261, + "loss": 9.0297, + "step": 87 + }, + { + "epoch": 0.00088, + "grad_norm": 0.8929677780146621, + "learning_rate": 0.00026399999999999997, + "loss": 8.992, + "step": 88 + }, + { + "epoch": 0.00089, + "grad_norm": 0.8871031960638883, + "learning_rate": 0.000267, + "loss": 8.9663, + "step": 89 + }, + { + "epoch": 0.0009, + "grad_norm": 0.8863614322091727, + "learning_rate": 0.00027, + "loss": 8.9387, + "step": 90 + }, + { + "epoch": 0.00091, + "grad_norm": 0.8806256133072948, + "learning_rate": 0.000273, + "loss": 8.9083, + "step": 91 + }, + { + "epoch": 0.00092, + "grad_norm": 0.8826316050497074, + "learning_rate": 0.000276, + "loss": 8.8757, + "step": 92 + }, + { + "epoch": 0.00093, + "grad_norm": 0.8833603577684496, + "learning_rate": 0.000279, + "loss": 8.8461, + "step": 93 + }, + { + "epoch": 0.00094, + "grad_norm": 0.8819538724809766, + "learning_rate": 0.000282, + "loss": 8.82, + "step": 94 + }, + { + "epoch": 0.00095, + "grad_norm": 0.8776473753829749, + "learning_rate": 0.000285, + "loss": 8.7909, + "step": 95 + }, + { + "epoch": 0.00096, + "grad_norm": 0.8854898687433331, + "learning_rate": 0.000288, + "loss": 8.7608, + "step": 96 + }, + { + "epoch": 0.00097, + "grad_norm": 0.8763526561707659, + "learning_rate": 0.000291, + "loss": 8.7376, + "step": 97 + }, + { + "epoch": 0.00098, + "grad_norm": 0.8773720509513535, + "learning_rate": 0.000294, + "loss": 8.7001, + "step": 98 + }, + { + "epoch": 0.00099, + "grad_norm": 0.8782537783818637, + "learning_rate": 0.000297, + "loss": 8.6785, + "step": 99 + }, + { + "epoch": 0.001, + "grad_norm": 0.876084031734807, + "learning_rate": 0.00030000000000000003, + "loss": 8.6509, + "step": 100 + }, + { + "epoch": 0.00101, + "grad_norm": 0.8775511931302766, + "learning_rate": 0.00030300000000000005, + "loss": 8.6162, + "step": 101 + }, + { + "epoch": 0.00102, + "grad_norm": 0.8666072056658197, + "learning_rate": 0.000306, + "loss": 8.5962, + "step": 102 + }, + { + "epoch": 0.00103, + "grad_norm": 0.8733315070806934, + "learning_rate": 0.000309, + "loss": 8.5716, + "step": 103 + }, + { + "epoch": 0.00104, + "grad_norm": 0.8664419648151436, + "learning_rate": 0.000312, + "loss": 8.5503, + "step": 104 + }, + { + "epoch": 0.00105, + "grad_norm": 0.8699404252946213, + "learning_rate": 0.000315, + "loss": 8.5232, + "step": 105 + }, + { + "epoch": 0.00106, + "grad_norm": 0.8630101619311507, + "learning_rate": 0.000318, + "loss": 8.4946, + "step": 106 + }, + { + "epoch": 0.00107, + "grad_norm": 0.8533006978361278, + "learning_rate": 0.000321, + "loss": 8.4694, + "step": 107 + }, + { + "epoch": 0.00108, + "grad_norm": 0.8571194919918376, + "learning_rate": 0.000324, + "loss": 8.4408, + "step": 108 + }, + { + "epoch": 0.00109, + "grad_norm": 0.8496626885878062, + "learning_rate": 0.000327, + "loss": 8.4213, + "step": 109 + }, + { + "epoch": 0.0011, + "grad_norm": 0.8617458268479945, + "learning_rate": 0.00033, + "loss": 8.3989, + "step": 110 + }, + { + "epoch": 0.00111, + "grad_norm": 0.874730580725934, + "learning_rate": 0.000333, + "loss": 8.3705, + "step": 111 + }, + { + "epoch": 0.00112, + "grad_norm": 0.9211403811176949, + "learning_rate": 0.00033600000000000004, + "loss": 8.351, + "step": 112 + }, + { + "epoch": 0.00113, + "grad_norm": 0.9451163730301329, + "learning_rate": 0.000339, + "loss": 8.3126, + "step": 113 + }, + { + "epoch": 0.00114, + "grad_norm": 0.8518853453666535, + "learning_rate": 0.000342, + "loss": 8.3025, + "step": 114 + }, + { + "epoch": 0.00115, + "grad_norm": 0.8499246309464553, + "learning_rate": 0.00034500000000000004, + "loss": 8.2758, + "step": 115 + }, + { + "epoch": 0.00116, + "grad_norm": 0.8769128820472754, + "learning_rate": 0.000348, + "loss": 8.2536, + "step": 116 + }, + { + "epoch": 0.00117, + "grad_norm": 0.829578266212784, + "learning_rate": 0.000351, + "loss": 8.2211, + "step": 117 + }, + { + "epoch": 0.00118, + "grad_norm": 0.8587574762862499, + "learning_rate": 0.000354, + "loss": 8.2068, + "step": 118 + }, + { + "epoch": 0.00119, + "grad_norm": 0.8383808879241313, + "learning_rate": 0.000357, + "loss": 8.1942, + "step": 119 + }, + { + "epoch": 0.0012, + "grad_norm": 0.8155263912622424, + "learning_rate": 0.00035999999999999997, + "loss": 8.1675, + "step": 120 + }, + { + "epoch": 0.00121, + "grad_norm": 0.8344307084821188, + "learning_rate": 0.000363, + "loss": 8.1409, + "step": 121 + }, + { + "epoch": 0.00122, + "grad_norm": 0.8097993043330719, + "learning_rate": 0.000366, + "loss": 8.1244, + "step": 122 + }, + { + "epoch": 0.00123, + "grad_norm": 0.8029969793277704, + "learning_rate": 0.000369, + "loss": 8.102, + "step": 123 + }, + { + "epoch": 0.00124, + "grad_norm": 0.7829455528112805, + "learning_rate": 0.000372, + "loss": 8.0811, + "step": 124 + }, + { + "epoch": 0.00125, + "grad_norm": 0.8110394816024603, + "learning_rate": 0.000375, + "loss": 8.0581, + "step": 125 + }, + { + "epoch": 0.00126, + "grad_norm": 0.8039928825408066, + "learning_rate": 0.000378, + "loss": 8.0463, + "step": 126 + }, + { + "epoch": 0.00127, + "grad_norm": 0.8369020039958236, + "learning_rate": 0.000381, + "loss": 8.0243, + "step": 127 + }, + { + "epoch": 0.00128, + "grad_norm": 0.9124681746054819, + "learning_rate": 0.000384, + "loss": 8.0062, + "step": 128 + }, + { + "epoch": 0.00129, + "grad_norm": 0.9497790585671452, + "learning_rate": 0.00038700000000000003, + "loss": 7.997, + "step": 129 + }, + { + "epoch": 0.0013, + "grad_norm": 0.895034767303024, + "learning_rate": 0.00039000000000000005, + "loss": 7.9709, + "step": 130 + }, + { + "epoch": 0.00131, + "grad_norm": 0.7344919418773682, + "learning_rate": 0.000393, + "loss": 7.9348, + "step": 131 + }, + { + "epoch": 0.00132, + "grad_norm": 0.8078203789615136, + "learning_rate": 0.00039600000000000003, + "loss": 7.9285, + "step": 132 + }, + { + "epoch": 0.00133, + "grad_norm": 0.8012273324997825, + "learning_rate": 0.00039900000000000005, + "loss": 7.907, + "step": 133 + }, + { + "epoch": 0.00134, + "grad_norm": 0.7227755159346104, + "learning_rate": 0.000402, + "loss": 7.8939, + "step": 134 + }, + { + "epoch": 0.00135, + "grad_norm": 0.7235157354092677, + "learning_rate": 0.00040500000000000003, + "loss": 7.8661, + "step": 135 + }, + { + "epoch": 0.00136, + "grad_norm": 0.7822895950244824, + "learning_rate": 0.00040800000000000005, + "loss": 7.852, + "step": 136 + }, + { + "epoch": 0.00137, + "grad_norm": 0.7608062709985561, + "learning_rate": 0.000411, + "loss": 7.8315, + "step": 137 + }, + { + "epoch": 0.00138, + "grad_norm": 0.7380360973204948, + "learning_rate": 0.00041400000000000003, + "loss": 7.8088, + "step": 138 + }, + { + "epoch": 0.00139, + "grad_norm": 0.684851228306475, + "learning_rate": 0.00041700000000000005, + "loss": 7.7952, + "step": 139 + }, + { + "epoch": 0.0014, + "grad_norm": 0.693462889960616, + "learning_rate": 0.00042000000000000007, + "loss": 7.7794, + "step": 140 + }, + { + "epoch": 0.00141, + "grad_norm": 0.7360428489985282, + "learning_rate": 0.000423, + "loss": 7.7672, + "step": 141 + }, + { + "epoch": 0.00142, + "grad_norm": 0.8088792714181905, + "learning_rate": 0.00042599999999999995, + "loss": 7.7572, + "step": 142 + }, + { + "epoch": 0.00143, + "grad_norm": 0.8184868537412088, + "learning_rate": 0.00042899999999999997, + "loss": 7.7297, + "step": 143 + }, + { + "epoch": 0.00144, + "grad_norm": 0.7328054873489724, + "learning_rate": 0.000432, + "loss": 7.7108, + "step": 144 + }, + { + "epoch": 0.00145, + "grad_norm": 0.742383079085953, + "learning_rate": 0.000435, + "loss": 7.7067, + "step": 145 + }, + { + "epoch": 0.00146, + "grad_norm": 0.8017743593965694, + "learning_rate": 0.00043799999999999997, + "loss": 7.6737, + "step": 146 + }, + { + "epoch": 0.00147, + "grad_norm": 0.7202665590443219, + "learning_rate": 0.000441, + "loss": 7.6628, + "step": 147 + }, + { + "epoch": 0.00148, + "grad_norm": 0.5746342281678257, + "learning_rate": 0.000444, + "loss": 7.6477, + "step": 148 + }, + { + "epoch": 0.00149, + "grad_norm": 0.5929410187247641, + "learning_rate": 0.00044699999999999997, + "loss": 7.6424, + "step": 149 + }, + { + "epoch": 0.0015, + "grad_norm": 0.7151318764270516, + "learning_rate": 0.00045, + "loss": 7.6133, + "step": 150 + }, + { + "epoch": 0.00151, + "grad_norm": 0.6796514647913489, + "learning_rate": 0.000453, + "loss": 7.5947, + "step": 151 + }, + { + "epoch": 0.00152, + "grad_norm": 0.543758313944587, + "learning_rate": 0.000456, + "loss": 7.5852, + "step": 152 + }, + { + "epoch": 0.00153, + "grad_norm": 0.7299692440583161, + "learning_rate": 0.000459, + "loss": 7.5768, + "step": 153 + }, + { + "epoch": 0.00154, + "grad_norm": 0.8631376359970574, + "learning_rate": 0.000462, + "loss": 7.5644, + "step": 154 + }, + { + "epoch": 0.00155, + "grad_norm": 0.9628027146132815, + "learning_rate": 0.000465, + "loss": 7.546, + "step": 155 + }, + { + "epoch": 0.00156, + "grad_norm": 1.6485250632015214, + "learning_rate": 0.000468, + "loss": 7.5336, + "step": 156 + }, + { + "epoch": 0.00157, + "grad_norm": 0.9248857473926935, + "learning_rate": 0.000471, + "loss": 7.5216, + "step": 157 + }, + { + "epoch": 0.00158, + "grad_norm": 0.7465186792591986, + "learning_rate": 0.00047400000000000003, + "loss": 7.4981, + "step": 158 + }, + { + "epoch": 0.00159, + "grad_norm": 0.5902384572619932, + "learning_rate": 0.000477, + "loss": 7.4827, + "step": 159 + }, + { + "epoch": 0.0016, + "grad_norm": 0.799266704152554, + "learning_rate": 0.00048, + "loss": 7.4675, + "step": 160 + }, + { + "epoch": 0.00161, + "grad_norm": 0.7827143911710401, + "learning_rate": 0.00048300000000000003, + "loss": 7.4466, + "step": 161 + }, + { + "epoch": 0.00162, + "grad_norm": 0.9218921339316959, + "learning_rate": 0.00048600000000000005, + "loss": 7.4513, + "step": 162 + }, + { + "epoch": 0.00163, + "grad_norm": 0.7287660540574216, + "learning_rate": 0.0004890000000000001, + "loss": 7.4357, + "step": 163 + }, + { + "epoch": 0.00164, + "grad_norm": 0.5250441215321361, + "learning_rate": 0.000492, + "loss": 7.4302, + "step": 164 + }, + { + "epoch": 0.00165, + "grad_norm": 0.741808200483857, + "learning_rate": 0.000495, + "loss": 7.4077, + "step": 165 + }, + { + "epoch": 0.00166, + "grad_norm": 0.6759515654254841, + "learning_rate": 0.0004980000000000001, + "loss": 7.3847, + "step": 166 + }, + { + "epoch": 0.00167, + "grad_norm": 0.5877266102377413, + "learning_rate": 0.000501, + "loss": 7.373, + "step": 167 + }, + { + "epoch": 0.00168, + "grad_norm": 0.571053515725499, + "learning_rate": 0.000504, + "loss": 7.3498, + "step": 168 + }, + { + "epoch": 0.00169, + "grad_norm": 0.6044099115482432, + "learning_rate": 0.0005070000000000001, + "loss": 7.3473, + "step": 169 + }, + { + "epoch": 0.0017, + "grad_norm": 0.46081682030640647, + "learning_rate": 0.00051, + "loss": 7.3262, + "step": 170 + }, + { + "epoch": 0.00171, + "grad_norm": 0.6775750897944629, + "learning_rate": 0.000513, + "loss": 7.3116, + "step": 171 + }, + { + "epoch": 0.00172, + "grad_norm": 0.57210862548929, + "learning_rate": 0.000516, + "loss": 7.3117, + "step": 172 + }, + { + "epoch": 0.00173, + "grad_norm": 0.5300582190464731, + "learning_rate": 0.0005189999999999999, + "loss": 7.2934, + "step": 173 + }, + { + "epoch": 0.00174, + "grad_norm": 0.7575900839335431, + "learning_rate": 0.000522, + "loss": 7.3114, + "step": 174 + }, + { + "epoch": 0.00175, + "grad_norm": 0.7613961222663432, + "learning_rate": 0.000525, + "loss": 7.2747, + "step": 175 + }, + { + "epoch": 0.00176, + "grad_norm": 0.6765759081090318, + "learning_rate": 0.0005279999999999999, + "loss": 7.2505, + "step": 176 + }, + { + "epoch": 0.00177, + "grad_norm": 0.8933831251648804, + "learning_rate": 0.000531, + "loss": 7.2576, + "step": 177 + }, + { + "epoch": 0.00178, + "grad_norm": 0.7076429855739662, + "learning_rate": 0.000534, + "loss": 7.2471, + "step": 178 + }, + { + "epoch": 0.00179, + "grad_norm": 0.49163795680938555, + "learning_rate": 0.000537, + "loss": 7.2319, + "step": 179 + }, + { + "epoch": 0.0018, + "grad_norm": 0.6284573091622804, + "learning_rate": 0.00054, + "loss": 7.2064, + "step": 180 + }, + { + "epoch": 0.00181, + "grad_norm": 0.5715580305485367, + "learning_rate": 0.000543, + "loss": 7.1941, + "step": 181 + }, + { + "epoch": 0.00182, + "grad_norm": 0.4282299340658738, + "learning_rate": 0.000546, + "loss": 7.1864, + "step": 182 + }, + { + "epoch": 0.00183, + "grad_norm": 0.5948913741099119, + "learning_rate": 0.000549, + "loss": 7.1781, + "step": 183 + }, + { + "epoch": 0.00184, + "grad_norm": 0.4755161712268706, + "learning_rate": 0.000552, + "loss": 7.1637, + "step": 184 + }, + { + "epoch": 0.00185, + "grad_norm": 0.46412280386502286, + "learning_rate": 0.000555, + "loss": 7.14, + "step": 185 + }, + { + "epoch": 0.00186, + "grad_norm": 0.526893652843914, + "learning_rate": 0.000558, + "loss": 7.1495, + "step": 186 + }, + { + "epoch": 0.00187, + "grad_norm": 0.46987960377579885, + "learning_rate": 0.000561, + "loss": 7.1236, + "step": 187 + }, + { + "epoch": 0.00188, + "grad_norm": 0.41993376006980737, + "learning_rate": 0.000564, + "loss": 7.1004, + "step": 188 + }, + { + "epoch": 0.00189, + "grad_norm": 0.5433001024887105, + "learning_rate": 0.000567, + "loss": 7.0986, + "step": 189 + }, + { + "epoch": 0.0019, + "grad_norm": 0.42284758794841465, + "learning_rate": 0.00057, + "loss": 7.0796, + "step": 190 + }, + { + "epoch": 0.00191, + "grad_norm": 0.47158596225286625, + "learning_rate": 0.000573, + "loss": 7.0701, + "step": 191 + }, + { + "epoch": 0.00192, + "grad_norm": 0.4590137866807191, + "learning_rate": 0.000576, + "loss": 7.0611, + "step": 192 + }, + { + "epoch": 0.00193, + "grad_norm": 0.43849327433792495, + "learning_rate": 0.000579, + "loss": 7.0612, + "step": 193 + }, + { + "epoch": 0.00194, + "grad_norm": 0.4326532382468588, + "learning_rate": 0.000582, + "loss": 7.0527, + "step": 194 + }, + { + "epoch": 0.00195, + "grad_norm": 0.508356146068285, + "learning_rate": 0.000585, + "loss": 7.0348, + "step": 195 + }, + { + "epoch": 0.00196, + "grad_norm": 0.534972126993015, + "learning_rate": 0.000588, + "loss": 7.0227, + "step": 196 + }, + { + "epoch": 0.00197, + "grad_norm": 0.693386583628206, + "learning_rate": 0.000591, + "loss": 7.0198, + "step": 197 + }, + { + "epoch": 0.00198, + "grad_norm": 1.4796610947894584, + "learning_rate": 0.000594, + "loss": 7.0311, + "step": 198 + }, + { + "epoch": 0.00199, + "grad_norm": 0.8596271717690976, + "learning_rate": 0.0005970000000000001, + "loss": 7.0019, + "step": 199 + }, + { + "epoch": 0.002, + "grad_norm": 0.5642900417004818, + "learning_rate": 0.0006000000000000001, + "loss": 6.991, + "step": 200 + }, + { + "epoch": 0.00201, + "grad_norm": 0.8692961874823039, + "learning_rate": 0.000603, + "loss": 6.9753, + "step": 201 + }, + { + "epoch": 0.00202, + "grad_norm": 0.6679575424248658, + "learning_rate": 0.0006060000000000001, + "loss": 6.9579, + "step": 202 + }, + { + "epoch": 0.00203, + "grad_norm": 0.8457647455673973, + "learning_rate": 0.0006090000000000001, + "loss": 6.9683, + "step": 203 + }, + { + "epoch": 0.00204, + "grad_norm": 0.9450482633193271, + "learning_rate": 0.000612, + "loss": 6.9556, + "step": 204 + }, + { + "epoch": 0.00205, + "grad_norm": 1.3090292735766231, + "learning_rate": 0.000615, + "loss": 6.9523, + "step": 205 + }, + { + "epoch": 0.00206, + "grad_norm": 0.6638095483572409, + "learning_rate": 0.000618, + "loss": 6.9303, + "step": 206 + }, + { + "epoch": 0.00207, + "grad_norm": 0.4840297186697793, + "learning_rate": 0.000621, + "loss": 6.9174, + "step": 207 + }, + { + "epoch": 0.00208, + "grad_norm": 0.7834386214905589, + "learning_rate": 0.000624, + "loss": 6.9192, + "step": 208 + }, + { + "epoch": 0.00209, + "grad_norm": 0.69675327719343, + "learning_rate": 0.000627, + "loss": 6.9018, + "step": 209 + }, + { + "epoch": 0.0021, + "grad_norm": 0.5517351762495105, + "learning_rate": 0.00063, + "loss": 6.8834, + "step": 210 + }, + { + "epoch": 0.00211, + "grad_norm": 0.6866030941755482, + "learning_rate": 0.000633, + "loss": 6.8831, + "step": 211 + }, + { + "epoch": 0.00212, + "grad_norm": 0.4815850149383259, + "learning_rate": 0.000636, + "loss": 6.8711, + "step": 212 + }, + { + "epoch": 0.00213, + "grad_norm": 0.5445114180165818, + "learning_rate": 0.000639, + "loss": 6.8513, + "step": 213 + }, + { + "epoch": 0.00214, + "grad_norm": 0.5893761744890885, + "learning_rate": 0.000642, + "loss": 6.8511, + "step": 214 + }, + { + "epoch": 0.00215, + "grad_norm": 0.3989068076407255, + "learning_rate": 0.000645, + "loss": 6.8428, + "step": 215 + }, + { + "epoch": 0.00216, + "grad_norm": 0.4674719209271809, + "learning_rate": 0.000648, + "loss": 6.8374, + "step": 216 + }, + { + "epoch": 0.00217, + "grad_norm": 0.556887374004828, + "learning_rate": 0.000651, + "loss": 6.8123, + "step": 217 + }, + { + "epoch": 0.00218, + "grad_norm": 0.5348764881519483, + "learning_rate": 0.000654, + "loss": 6.815, + "step": 218 + }, + { + "epoch": 0.00219, + "grad_norm": 0.6261358280484484, + "learning_rate": 0.000657, + "loss": 6.7906, + "step": 219 + }, + { + "epoch": 0.0022, + "grad_norm": 0.5490386646627615, + "learning_rate": 0.00066, + "loss": 6.8058, + "step": 220 + }, + { + "epoch": 0.00221, + "grad_norm": 0.5297655781082383, + "learning_rate": 0.0006630000000000001, + "loss": 6.7838, + "step": 221 + }, + { + "epoch": 0.00222, + "grad_norm": 0.53116953133404, + "learning_rate": 0.000666, + "loss": 6.7711, + "step": 222 + }, + { + "epoch": 0.00223, + "grad_norm": 0.46985855580572156, + "learning_rate": 0.000669, + "loss": 6.7662, + "step": 223 + }, + { + "epoch": 0.00224, + "grad_norm": 0.48892819667849163, + "learning_rate": 0.0006720000000000001, + "loss": 6.7615, + "step": 224 + }, + { + "epoch": 0.00225, + "grad_norm": 0.5426443115029689, + "learning_rate": 0.000675, + "loss": 6.7505, + "step": 225 + }, + { + "epoch": 0.00226, + "grad_norm": 0.47341745143430014, + "learning_rate": 0.000678, + "loss": 6.7457, + "step": 226 + }, + { + "epoch": 0.00227, + "grad_norm": 0.47753897999990824, + "learning_rate": 0.0006810000000000001, + "loss": 6.7186, + "step": 227 + }, + { + "epoch": 0.00228, + "grad_norm": 0.43835516232945165, + "learning_rate": 0.000684, + "loss": 6.721, + "step": 228 + }, + { + "epoch": 0.00229, + "grad_norm": 0.3666587821660354, + "learning_rate": 0.000687, + "loss": 6.7162, + "step": 229 + }, + { + "epoch": 0.0023, + "grad_norm": 0.5954344273025705, + "learning_rate": 0.0006900000000000001, + "loss": 6.6971, + "step": 230 + }, + { + "epoch": 0.00231, + "grad_norm": 0.8324250780860898, + "learning_rate": 0.000693, + "loss": 6.6912, + "step": 231 + }, + { + "epoch": 0.00232, + "grad_norm": 1.1082992895496584, + "learning_rate": 0.000696, + "loss": 6.7117, + "step": 232 + }, + { + "epoch": 0.00233, + "grad_norm": 0.8989391942429391, + "learning_rate": 0.0006990000000000001, + "loss": 6.6931, + "step": 233 + }, + { + "epoch": 0.00234, + "grad_norm": 0.8501087453831264, + "learning_rate": 0.000702, + "loss": 6.6816, + "step": 234 + }, + { + "epoch": 0.00235, + "grad_norm": 0.9709457331919181, + "learning_rate": 0.000705, + "loss": 6.6715, + "step": 235 + }, + { + "epoch": 0.00236, + "grad_norm": 0.8996312948341649, + "learning_rate": 0.000708, + "loss": 6.6542, + "step": 236 + }, + { + "epoch": 0.00237, + "grad_norm": 0.7941572817187773, + "learning_rate": 0.0007109999999999999, + "loss": 6.6634, + "step": 237 + }, + { + "epoch": 0.00238, + "grad_norm": 0.649710293154646, + "learning_rate": 0.000714, + "loss": 6.6483, + "step": 238 + }, + { + "epoch": 0.00239, + "grad_norm": 0.7175873388046764, + "learning_rate": 0.000717, + "loss": 6.6317, + "step": 239 + }, + { + "epoch": 0.0024, + "grad_norm": 1.0726439429102004, + "learning_rate": 0.0007199999999999999, + "loss": 6.6535, + "step": 240 + }, + { + "epoch": 0.00241, + "grad_norm": 1.1551390926973517, + "learning_rate": 0.000723, + "loss": 6.6508, + "step": 241 + }, + { + "epoch": 0.00242, + "grad_norm": 0.8245355038796127, + "learning_rate": 0.000726, + "loss": 6.615, + "step": 242 + }, + { + "epoch": 0.00243, + "grad_norm": 0.7119399485811939, + "learning_rate": 0.000729, + "loss": 6.6026, + "step": 243 + }, + { + "epoch": 0.00244, + "grad_norm": 0.6396700306701443, + "learning_rate": 0.000732, + "loss": 6.6042, + "step": 244 + }, + { + "epoch": 0.00245, + "grad_norm": 0.668492143187707, + "learning_rate": 0.000735, + "loss": 6.5953, + "step": 245 + }, + { + "epoch": 0.00246, + "grad_norm": 0.6209038847600604, + "learning_rate": 0.000738, + "loss": 6.5759, + "step": 246 + }, + { + "epoch": 0.00247, + "grad_norm": 0.49470830317055475, + "learning_rate": 0.000741, + "loss": 6.5677, + "step": 247 + }, + { + "epoch": 0.00248, + "grad_norm": 0.5745764116827149, + "learning_rate": 0.000744, + "loss": 6.5775, + "step": 248 + }, + { + "epoch": 0.00249, + "grad_norm": 0.5319509172858093, + "learning_rate": 0.000747, + "loss": 6.5558, + "step": 249 + }, + { + "epoch": 0.0025, + "grad_norm": 0.482084360804442, + "learning_rate": 0.00075, + "loss": 6.556, + "step": 250 + }, + { + "epoch": 0.00251, + "grad_norm": 0.46516739275647623, + "learning_rate": 0.000753, + "loss": 6.521, + "step": 251 + }, + { + "epoch": 0.00252, + "grad_norm": 0.4629119115625355, + "learning_rate": 0.000756, + "loss": 6.531, + "step": 252 + }, + { + "epoch": 0.00253, + "grad_norm": 0.37719629506333596, + "learning_rate": 0.000759, + "loss": 6.519, + "step": 253 + }, + { + "epoch": 0.00254, + "grad_norm": 0.44323602664762185, + "learning_rate": 0.000762, + "loss": 6.5149, + "step": 254 + }, + { + "epoch": 0.00255, + "grad_norm": 0.38153047495099895, + "learning_rate": 0.0007650000000000001, + "loss": 6.5129, + "step": 255 + }, + { + "epoch": 0.00256, + "grad_norm": 0.5270908471121704, + "learning_rate": 0.000768, + "loss": 6.4934, + "step": 256 + }, + { + "epoch": 0.00257, + "grad_norm": 0.6201344591076082, + "learning_rate": 0.000771, + "loss": 6.4997, + "step": 257 + }, + { + "epoch": 0.00258, + "grad_norm": 0.6391276132887356, + "learning_rate": 0.0007740000000000001, + "loss": 6.477, + "step": 258 + }, + { + "epoch": 0.00259, + "grad_norm": 0.6374758421191778, + "learning_rate": 0.000777, + "loss": 6.4787, + "step": 259 + }, + { + "epoch": 0.0026, + "grad_norm": 0.5597091224464362, + "learning_rate": 0.0007800000000000001, + "loss": 6.4607, + "step": 260 + }, + { + "epoch": 0.00261, + "grad_norm": 0.587169694241395, + "learning_rate": 0.0007830000000000001, + "loss": 6.4722, + "step": 261 + }, + { + "epoch": 0.00262, + "grad_norm": 0.6112267949829847, + "learning_rate": 0.000786, + "loss": 6.4511, + "step": 262 + }, + { + "epoch": 0.00263, + "grad_norm": 0.5933922824160996, + "learning_rate": 0.0007890000000000001, + "loss": 6.4574, + "step": 263 + }, + { + "epoch": 0.00264, + "grad_norm": 0.6560299493456899, + "learning_rate": 0.0007920000000000001, + "loss": 6.4408, + "step": 264 + }, + { + "epoch": 0.00265, + "grad_norm": 0.9913628812090025, + "learning_rate": 0.000795, + "loss": 6.4422, + "step": 265 + }, + { + "epoch": 0.00266, + "grad_norm": 1.340981155098937, + "learning_rate": 0.0007980000000000001, + "loss": 6.4533, + "step": 266 + }, + { + "epoch": 0.00267, + "grad_norm": 0.8266116512325479, + "learning_rate": 0.0008010000000000001, + "loss": 6.4319, + "step": 267 + }, + { + "epoch": 0.00268, + "grad_norm": 0.9896228951890642, + "learning_rate": 0.000804, + "loss": 6.4378, + "step": 268 + }, + { + "epoch": 0.00269, + "grad_norm": 1.2352739008881923, + "learning_rate": 0.0008070000000000001, + "loss": 6.4279, + "step": 269 + }, + { + "epoch": 0.0027, + "grad_norm": 1.1652427209458782, + "learning_rate": 0.0008100000000000001, + "loss": 6.4326, + "step": 270 + }, + { + "epoch": 0.00271, + "grad_norm": 1.0407181933539849, + "learning_rate": 0.000813, + "loss": 6.4319, + "step": 271 + }, + { + "epoch": 0.00272, + "grad_norm": 0.8880696455452757, + "learning_rate": 0.0008160000000000001, + "loss": 6.4138, + "step": 272 + }, + { + "epoch": 0.00273, + "grad_norm": 0.8477724135782442, + "learning_rate": 0.0008190000000000001, + "loss": 6.404, + "step": 273 + }, + { + "epoch": 0.00274, + "grad_norm": 0.7818547901656048, + "learning_rate": 0.000822, + "loss": 6.383, + "step": 274 + }, + { + "epoch": 0.00275, + "grad_norm": 0.6915610404761925, + "learning_rate": 0.0008250000000000001, + "loss": 6.3888, + "step": 275 + }, + { + "epoch": 0.00276, + "grad_norm": 0.6369714396426732, + "learning_rate": 0.0008280000000000001, + "loss": 6.3775, + "step": 276 + }, + { + "epoch": 0.00277, + "grad_norm": 0.6792843462530734, + "learning_rate": 0.0008310000000000001, + "loss": 6.3726, + "step": 277 + }, + { + "epoch": 0.00278, + "grad_norm": 0.6716653191335978, + "learning_rate": 0.0008340000000000001, + "loss": 6.3561, + "step": 278 + }, + { + "epoch": 0.00279, + "grad_norm": 0.6104270551210891, + "learning_rate": 0.0008370000000000001, + "loss": 6.3562, + "step": 279 + }, + { + "epoch": 0.0028, + "grad_norm": 0.5327216367370322, + "learning_rate": 0.0008400000000000001, + "loss": 6.3379, + "step": 280 + }, + { + "epoch": 0.00281, + "grad_norm": 0.4495801132850456, + "learning_rate": 0.0008430000000000001, + "loss": 6.3253, + "step": 281 + }, + { + "epoch": 0.00282, + "grad_norm": 0.4185635012011635, + "learning_rate": 0.000846, + "loss": 6.3254, + "step": 282 + }, + { + "epoch": 0.00283, + "grad_norm": 0.41306707794715253, + "learning_rate": 0.0008489999999999999, + "loss": 6.3154, + "step": 283 + }, + { + "epoch": 0.00284, + "grad_norm": 0.447351018324713, + "learning_rate": 0.0008519999999999999, + "loss": 6.3075, + "step": 284 + }, + { + "epoch": 0.00285, + "grad_norm": 0.4656314656211844, + "learning_rate": 0.000855, + "loss": 6.3102, + "step": 285 + }, + { + "epoch": 0.00286, + "grad_norm": 0.5287748664566101, + "learning_rate": 0.0008579999999999999, + "loss": 6.2881, + "step": 286 + }, + { + "epoch": 0.00287, + "grad_norm": 0.60454227039484, + "learning_rate": 0.000861, + "loss": 6.2937, + "step": 287 + }, + { + "epoch": 0.00288, + "grad_norm": 0.6409086244349441, + "learning_rate": 0.000864, + "loss": 6.2743, + "step": 288 + }, + { + "epoch": 0.00289, + "grad_norm": 0.7540915605033448, + "learning_rate": 0.0008669999999999999, + "loss": 6.2909, + "step": 289 + }, + { + "epoch": 0.0029, + "grad_norm": 0.9532532853232767, + "learning_rate": 0.00087, + "loss": 6.2777, + "step": 290 + }, + { + "epoch": 0.00291, + "grad_norm": 0.9297633606905631, + "learning_rate": 0.000873, + "loss": 6.278, + "step": 291 + }, + { + "epoch": 0.00292, + "grad_norm": 0.6544361038243887, + "learning_rate": 0.0008759999999999999, + "loss": 6.2635, + "step": 292 + }, + { + "epoch": 0.00293, + "grad_norm": 0.9586546136156446, + "learning_rate": 0.000879, + "loss": 6.2582, + "step": 293 + }, + { + "epoch": 0.00294, + "grad_norm": 0.8674924960686783, + "learning_rate": 0.000882, + "loss": 6.2684, + "step": 294 + }, + { + "epoch": 0.00295, + "grad_norm": 0.8596325280201164, + "learning_rate": 0.0008849999999999999, + "loss": 6.2363, + "step": 295 + }, + { + "epoch": 0.00296, + "grad_norm": 0.9927641151458286, + "learning_rate": 0.000888, + "loss": 6.2541, + "step": 296 + }, + { + "epoch": 0.00297, + "grad_norm": 1.342485766358639, + "learning_rate": 0.000891, + "loss": 6.2408, + "step": 297 + }, + { + "epoch": 0.00298, + "grad_norm": 1.1878316085061287, + "learning_rate": 0.0008939999999999999, + "loss": 6.2537, + "step": 298 + }, + { + "epoch": 0.00299, + "grad_norm": 0.9496422749623654, + "learning_rate": 0.000897, + "loss": 6.2242, + "step": 299 + }, + { + "epoch": 0.003, + "grad_norm": 1.122193003605518, + "learning_rate": 0.0009, + "loss": 6.2361, + "step": 300 + }, + { + "epoch": 0.00301, + "grad_norm": 1.129970594986655, + "learning_rate": 0.0009029999999999999, + "loss": 6.2273, + "step": 301 + }, + { + "epoch": 0.00302, + "grad_norm": 1.0740447263196922, + "learning_rate": 0.000906, + "loss": 6.2071, + "step": 302 + }, + { + "epoch": 0.00303, + "grad_norm": 1.1900410452977912, + "learning_rate": 0.000909, + "loss": 6.2313, + "step": 303 + }, + { + "epoch": 0.00304, + "grad_norm": 0.804691464481299, + "learning_rate": 0.000912, + "loss": 6.2111, + "step": 304 + }, + { + "epoch": 0.00305, + "grad_norm": 0.7167209084416579, + "learning_rate": 0.000915, + "loss": 6.2106, + "step": 305 + }, + { + "epoch": 0.00306, + "grad_norm": 0.5686498260282739, + "learning_rate": 0.000918, + "loss": 6.1897, + "step": 306 + }, + { + "epoch": 0.00307, + "grad_norm": 0.5740516870647188, + "learning_rate": 0.000921, + "loss": 6.1847, + "step": 307 + }, + { + "epoch": 0.00308, + "grad_norm": 0.5214022662741855, + "learning_rate": 0.000924, + "loss": 6.1668, + "step": 308 + }, + { + "epoch": 0.00309, + "grad_norm": 0.489157506496739, + "learning_rate": 0.000927, + "loss": 6.1798, + "step": 309 + }, + { + "epoch": 0.0031, + "grad_norm": 0.4872945232166538, + "learning_rate": 0.00093, + "loss": 6.1622, + "step": 310 + }, + { + "epoch": 0.00311, + "grad_norm": 0.4909949625440354, + "learning_rate": 0.000933, + "loss": 6.1533, + "step": 311 + }, + { + "epoch": 0.00312, + "grad_norm": 0.4186129744309998, + "learning_rate": 0.000936, + "loss": 6.1314, + "step": 312 + }, + { + "epoch": 0.00313, + "grad_norm": 0.36050967020968366, + "learning_rate": 0.0009390000000000001, + "loss": 6.1442, + "step": 313 + }, + { + "epoch": 0.00314, + "grad_norm": 0.3818285660239077, + "learning_rate": 0.000942, + "loss": 6.1495, + "step": 314 + }, + { + "epoch": 0.00315, + "grad_norm": 0.42967169925093956, + "learning_rate": 0.000945, + "loss": 6.126, + "step": 315 + }, + { + "epoch": 0.00316, + "grad_norm": 0.46511434454587514, + "learning_rate": 0.0009480000000000001, + "loss": 6.1004, + "step": 316 + }, + { + "epoch": 0.00317, + "grad_norm": 0.5237888199450732, + "learning_rate": 0.000951, + "loss": 6.1201, + "step": 317 + }, + { + "epoch": 0.00318, + "grad_norm": 0.6610672935792641, + "learning_rate": 0.000954, + "loss": 6.1161, + "step": 318 + }, + { + "epoch": 0.00319, + "grad_norm": 0.9099491192879063, + "learning_rate": 0.0009570000000000001, + "loss": 6.1122, + "step": 319 + }, + { + "epoch": 0.0032, + "grad_norm": 1.0329714723925014, + "learning_rate": 0.00096, + "loss": 6.1198, + "step": 320 + }, + { + "epoch": 0.00321, + "grad_norm": 0.9944509511152075, + "learning_rate": 0.000963, + "loss": 6.1077, + "step": 321 + }, + { + "epoch": 0.00322, + "grad_norm": 1.41191394849347, + "learning_rate": 0.0009660000000000001, + "loss": 6.1101, + "step": 322 + }, + { + "epoch": 0.00323, + "grad_norm": 0.9172707652477707, + "learning_rate": 0.000969, + "loss": 6.097, + "step": 323 + }, + { + "epoch": 0.00324, + "grad_norm": 1.1261423941310122, + "learning_rate": 0.0009720000000000001, + "loss": 6.1132, + "step": 324 + }, + { + "epoch": 0.00325, + "grad_norm": 0.6623590740718236, + "learning_rate": 0.0009750000000000001, + "loss": 6.0626, + "step": 325 + }, + { + "epoch": 0.00326, + "grad_norm": 0.7364273563271467, + "learning_rate": 0.0009780000000000001, + "loss": 6.0809, + "step": 326 + }, + { + "epoch": 0.00327, + "grad_norm": 0.8106867198528734, + "learning_rate": 0.000981, + "loss": 6.0853, + "step": 327 + }, + { + "epoch": 0.00328, + "grad_norm": 0.9008187294951384, + "learning_rate": 0.000984, + "loss": 6.0637, + "step": 328 + }, + { + "epoch": 0.00329, + "grad_norm": 1.0311811602663732, + "learning_rate": 0.000987, + "loss": 6.0736, + "step": 329 + }, + { + "epoch": 0.0033, + "grad_norm": 0.7413155368855245, + "learning_rate": 0.00099, + "loss": 6.0572, + "step": 330 + }, + { + "epoch": 0.00331, + "grad_norm": 0.6745658849207387, + "learning_rate": 0.0009930000000000002, + "loss": 6.0599, + "step": 331 + }, + { + "epoch": 0.00332, + "grad_norm": 0.5913240343902441, + "learning_rate": 0.0009960000000000001, + "loss": 6.041, + "step": 332 + }, + { + "epoch": 0.00333, + "grad_norm": 0.5668749800176679, + "learning_rate": 0.000999, + "loss": 6.025, + "step": 333 + }, + { + "epoch": 0.00334, + "grad_norm": 0.5007608052342689, + "learning_rate": 0.001002, + "loss": 6.0336, + "step": 334 + }, + { + "epoch": 0.00335, + "grad_norm": 0.3983861566645405, + "learning_rate": 0.001005, + "loss": 6.0284, + "step": 335 + }, + { + "epoch": 0.00336, + "grad_norm": 0.4274260388302738, + "learning_rate": 0.001008, + "loss": 6.0181, + "step": 336 + }, + { + "epoch": 0.00337, + "grad_norm": 0.5335498119421307, + "learning_rate": 0.0010110000000000002, + "loss": 6.0004, + "step": 337 + }, + { + "epoch": 0.00338, + "grad_norm": 0.4831054617031532, + "learning_rate": 0.0010140000000000001, + "loss": 6.0126, + "step": 338 + }, + { + "epoch": 0.00339, + "grad_norm": 0.5437142240239534, + "learning_rate": 0.0010170000000000001, + "loss": 6.0197, + "step": 339 + }, + { + "epoch": 0.0034, + "grad_norm": 0.5265472037464025, + "learning_rate": 0.00102, + "loss": 5.9884, + "step": 340 + }, + { + "epoch": 0.00341, + "grad_norm": 0.4912689865797111, + "learning_rate": 0.001023, + "loss": 5.9692, + "step": 341 + }, + { + "epoch": 0.00342, + "grad_norm": 0.42997171211054086, + "learning_rate": 0.001026, + "loss": 5.974, + "step": 342 + }, + { + "epoch": 0.00343, + "grad_norm": 0.5197303002983154, + "learning_rate": 0.0010290000000000002, + "loss": 5.9763, + "step": 343 + }, + { + "epoch": 0.00344, + "grad_norm": 0.8117900330313431, + "learning_rate": 0.001032, + "loss": 5.9747, + "step": 344 + }, + { + "epoch": 0.00345, + "grad_norm": 1.1753813945983669, + "learning_rate": 0.001035, + "loss": 5.9788, + "step": 345 + }, + { + "epoch": 0.00346, + "grad_norm": 0.8135676081857764, + "learning_rate": 0.0010379999999999999, + "loss": 5.9748, + "step": 346 + }, + { + "epoch": 0.00347, + "grad_norm": 1.0182912247404574, + "learning_rate": 0.001041, + "loss": 5.9557, + "step": 347 + }, + { + "epoch": 0.00348, + "grad_norm": 1.1407576555241683, + "learning_rate": 0.001044, + "loss": 5.978, + "step": 348 + }, + { + "epoch": 0.00349, + "grad_norm": 0.7853068136249622, + "learning_rate": 0.001047, + "loss": 5.9412, + "step": 349 + }, + { + "epoch": 0.0035, + "grad_norm": 1.0427704318540805, + "learning_rate": 0.00105, + "loss": 5.9779, + "step": 350 + }, + { + "epoch": 0.00351, + "grad_norm": 0.8821399606009466, + "learning_rate": 0.001053, + "loss": 5.9701, + "step": 351 + }, + { + "epoch": 0.00352, + "grad_norm": 0.9582157894617032, + "learning_rate": 0.0010559999999999999, + "loss": 5.955, + "step": 352 + }, + { + "epoch": 0.00353, + "grad_norm": 1.0526665256553966, + "learning_rate": 0.001059, + "loss": 5.958, + "step": 353 + }, + { + "epoch": 0.00354, + "grad_norm": 1.045275747166985, + "learning_rate": 0.001062, + "loss": 5.9353, + "step": 354 + }, + { + "epoch": 0.00355, + "grad_norm": 1.1505195376317356, + "learning_rate": 0.001065, + "loss": 5.9542, + "step": 355 + }, + { + "epoch": 0.00356, + "grad_norm": 1.0355197503433216, + "learning_rate": 0.001068, + "loss": 5.9425, + "step": 356 + }, + { + "epoch": 0.00357, + "grad_norm": 1.127934795973434, + "learning_rate": 0.001071, + "loss": 5.9396, + "step": 357 + }, + { + "epoch": 0.00358, + "grad_norm": 0.7430425972675007, + "learning_rate": 0.001074, + "loss": 5.9201, + "step": 358 + }, + { + "epoch": 0.00359, + "grad_norm": 0.6597065121039739, + "learning_rate": 0.001077, + "loss": 5.9099, + "step": 359 + }, + { + "epoch": 0.0036, + "grad_norm": 0.6034653307534226, + "learning_rate": 0.00108, + "loss": 5.9081, + "step": 360 + }, + { + "epoch": 0.00361, + "grad_norm": 0.5960018023982208, + "learning_rate": 0.001083, + "loss": 5.9047, + "step": 361 + }, + { + "epoch": 0.00362, + "grad_norm": 0.4563634882449727, + "learning_rate": 0.001086, + "loss": 5.884, + "step": 362 + }, + { + "epoch": 0.00363, + "grad_norm": 0.49274399902142996, + "learning_rate": 0.001089, + "loss": 5.8759, + "step": 363 + }, + { + "epoch": 0.00364, + "grad_norm": 0.4937234603270663, + "learning_rate": 0.001092, + "loss": 5.8901, + "step": 364 + }, + { + "epoch": 0.00365, + "grad_norm": 0.5102012627619638, + "learning_rate": 0.001095, + "loss": 5.888, + "step": 365 + }, + { + "epoch": 0.00366, + "grad_norm": 0.4676595798467989, + "learning_rate": 0.001098, + "loss": 5.862, + "step": 366 + }, + { + "epoch": 0.00367, + "grad_norm": 0.49526135096535867, + "learning_rate": 0.001101, + "loss": 5.8667, + "step": 367 + }, + { + "epoch": 0.00368, + "grad_norm": 0.47887378181150303, + "learning_rate": 0.001104, + "loss": 5.8643, + "step": 368 + }, + { + "epoch": 0.00369, + "grad_norm": 0.48887117156741833, + "learning_rate": 0.001107, + "loss": 5.8686, + "step": 369 + }, + { + "epoch": 0.0037, + "grad_norm": 0.4473709149836047, + "learning_rate": 0.00111, + "loss": 5.8472, + "step": 370 + }, + { + "epoch": 0.00371, + "grad_norm": 0.38589559577094035, + "learning_rate": 0.001113, + "loss": 5.8158, + "step": 371 + }, + { + "epoch": 0.00372, + "grad_norm": 0.3912315505838062, + "learning_rate": 0.001116, + "loss": 5.8379, + "step": 372 + }, + { + "epoch": 0.00373, + "grad_norm": 0.38616823047071297, + "learning_rate": 0.001119, + "loss": 5.8267, + "step": 373 + }, + { + "epoch": 0.00374, + "grad_norm": 0.45854090440574513, + "learning_rate": 0.001122, + "loss": 5.8316, + "step": 374 + }, + { + "epoch": 0.00375, + "grad_norm": 0.5169440196993219, + "learning_rate": 0.0011250000000000001, + "loss": 5.8332, + "step": 375 + }, + { + "epoch": 0.00376, + "grad_norm": 0.5067806568705457, + "learning_rate": 0.001128, + "loss": 5.8287, + "step": 376 + }, + { + "epoch": 0.00377, + "grad_norm": 0.48558945502532774, + "learning_rate": 0.001131, + "loss": 5.8236, + "step": 377 + }, + { + "epoch": 0.00378, + "grad_norm": 0.47384141098896654, + "learning_rate": 0.001134, + "loss": 5.8187, + "step": 378 + }, + { + "epoch": 0.00379, + "grad_norm": 0.5705731390544022, + "learning_rate": 0.001137, + "loss": 5.8065, + "step": 379 + }, + { + "epoch": 0.0038, + "grad_norm": 0.8415616570321116, + "learning_rate": 0.00114, + "loss": 5.8323, + "step": 380 + }, + { + "epoch": 0.00381, + "grad_norm": 1.152388235651458, + "learning_rate": 0.0011430000000000001, + "loss": 5.8155, + "step": 381 + }, + { + "epoch": 0.00382, + "grad_norm": 0.7784536663385624, + "learning_rate": 0.001146, + "loss": 5.7896, + "step": 382 + }, + { + "epoch": 0.00383, + "grad_norm": 1.2096458575940454, + "learning_rate": 0.001149, + "loss": 5.8132, + "step": 383 + }, + { + "epoch": 0.00384, + "grad_norm": 1.2032626959449177, + "learning_rate": 0.001152, + "loss": 5.8295, + "step": 384 + }, + { + "epoch": 0.00385, + "grad_norm": 1.2258405640081835, + "learning_rate": 0.001155, + "loss": 5.8193, + "step": 385 + }, + { + "epoch": 0.00386, + "grad_norm": 1.060557976067675, + "learning_rate": 0.001158, + "loss": 5.8182, + "step": 386 + }, + { + "epoch": 0.00387, + "grad_norm": 1.6852101829047932, + "learning_rate": 0.0011610000000000001, + "loss": 5.8306, + "step": 387 + }, + { + "epoch": 0.00388, + "grad_norm": 0.7125426173667109, + "learning_rate": 0.001164, + "loss": 5.7875, + "step": 388 + }, + { + "epoch": 0.00389, + "grad_norm": 0.9333298966305301, + "learning_rate": 0.001167, + "loss": 5.8092, + "step": 389 + }, + { + "epoch": 0.0039, + "grad_norm": 0.7871116791575423, + "learning_rate": 0.00117, + "loss": 5.7842, + "step": 390 + }, + { + "epoch": 0.00391, + "grad_norm": 0.9033950769229127, + "learning_rate": 0.001173, + "loss": 5.7945, + "step": 391 + }, + { + "epoch": 0.00392, + "grad_norm": 1.0985861295177402, + "learning_rate": 0.001176, + "loss": 5.8091, + "step": 392 + }, + { + "epoch": 0.00393, + "grad_norm": 0.9893983760666882, + "learning_rate": 0.0011790000000000001, + "loss": 5.787, + "step": 393 + }, + { + "epoch": 0.00394, + "grad_norm": 1.0087630537900902, + "learning_rate": 0.001182, + "loss": 5.7718, + "step": 394 + }, + { + "epoch": 0.00395, + "grad_norm": 0.9357634093540522, + "learning_rate": 0.001185, + "loss": 5.7577, + "step": 395 + }, + { + "epoch": 0.00396, + "grad_norm": 0.8613606742928634, + "learning_rate": 0.001188, + "loss": 5.7674, + "step": 396 + }, + { + "epoch": 0.00397, + "grad_norm": 0.9393680367248612, + "learning_rate": 0.001191, + "loss": 5.7666, + "step": 397 + }, + { + "epoch": 0.00398, + "grad_norm": 0.8380984764873387, + "learning_rate": 0.0011940000000000002, + "loss": 5.7669, + "step": 398 + }, + { + "epoch": 0.00399, + "grad_norm": 0.7495495962771003, + "learning_rate": 0.0011970000000000001, + "loss": 5.7689, + "step": 399 + }, + { + "epoch": 0.004, + "grad_norm": 0.6237821646680863, + "learning_rate": 0.0012000000000000001, + "loss": 5.751, + "step": 400 + }, + { + "epoch": 0.00401, + "grad_norm": 0.6042562364668606, + "learning_rate": 0.001203, + "loss": 5.7286, + "step": 401 + }, + { + "epoch": 0.00402, + "grad_norm": 0.6800421237430357, + "learning_rate": 0.001206, + "loss": 5.7387, + "step": 402 + }, + { + "epoch": 0.00403, + "grad_norm": 0.5349967773183291, + "learning_rate": 0.001209, + "loss": 5.7296, + "step": 403 + }, + { + "epoch": 0.00404, + "grad_norm": 0.4491885962138907, + "learning_rate": 0.0012120000000000002, + "loss": 5.7204, + "step": 404 + }, + { + "epoch": 0.00405, + "grad_norm": 0.4231295613571548, + "learning_rate": 0.0012150000000000002, + "loss": 5.7133, + "step": 405 + }, + { + "epoch": 0.00406, + "grad_norm": 0.4344671315280792, + "learning_rate": 0.0012180000000000001, + "loss": 5.7007, + "step": 406 + }, + { + "epoch": 0.00407, + "grad_norm": 0.3917681066216858, + "learning_rate": 0.0012209999999999999, + "loss": 5.6947, + "step": 407 + }, + { + "epoch": 0.00408, + "grad_norm": 0.3526446903520604, + "learning_rate": 0.001224, + "loss": 5.7113, + "step": 408 + }, + { + "epoch": 0.00409, + "grad_norm": 0.3601774169994176, + "learning_rate": 0.001227, + "loss": 5.689, + "step": 409 + }, + { + "epoch": 0.0041, + "grad_norm": 0.3999316895065895, + "learning_rate": 0.00123, + "loss": 5.6821, + "step": 410 + }, + { + "epoch": 0.00411, + "grad_norm": 0.5215468120681382, + "learning_rate": 0.001233, + "loss": 5.6975, + "step": 411 + }, + { + "epoch": 0.00412, + "grad_norm": 0.8731356141140694, + "learning_rate": 0.001236, + "loss": 5.697, + "step": 412 + }, + { + "epoch": 0.00413, + "grad_norm": 1.2920248463477522, + "learning_rate": 0.0012389999999999999, + "loss": 5.7158, + "step": 413 + }, + { + "epoch": 0.00414, + "grad_norm": 0.7474803494460109, + "learning_rate": 0.001242, + "loss": 5.6771, + "step": 414 + }, + { + "epoch": 0.00415, + "grad_norm": 0.9736431117993121, + "learning_rate": 0.001245, + "loss": 5.6888, + "step": 415 + }, + { + "epoch": 0.00416, + "grad_norm": 0.8653333577780613, + "learning_rate": 0.001248, + "loss": 5.6728, + "step": 416 + }, + { + "epoch": 0.00417, + "grad_norm": 0.6891363999339204, + "learning_rate": 0.001251, + "loss": 5.695, + "step": 417 + }, + { + "epoch": 0.00418, + "grad_norm": 0.7955125411502495, + "learning_rate": 0.001254, + "loss": 5.6734, + "step": 418 + }, + { + "epoch": 0.00419, + "grad_norm": 0.8034523576562718, + "learning_rate": 0.0012569999999999999, + "loss": 5.6601, + "step": 419 + }, + { + "epoch": 0.0042, + "grad_norm": 0.7731586474207807, + "learning_rate": 0.00126, + "loss": 5.6748, + "step": 420 + }, + { + "epoch": 0.00421, + "grad_norm": 0.747486262420627, + "learning_rate": 0.001263, + "loss": 5.6666, + "step": 421 + }, + { + "epoch": 0.00422, + "grad_norm": 0.7917981329409665, + "learning_rate": 0.001266, + "loss": 5.6544, + "step": 422 + }, + { + "epoch": 0.00423, + "grad_norm": 1.0889555078416353, + "learning_rate": 0.001269, + "loss": 5.6655, + "step": 423 + }, + { + "epoch": 0.00424, + "grad_norm": 0.9654337501414605, + "learning_rate": 0.001272, + "loss": 5.6614, + "step": 424 + }, + { + "epoch": 0.00425, + "grad_norm": 0.9055610792467201, + "learning_rate": 0.001275, + "loss": 5.6624, + "step": 425 + }, + { + "epoch": 0.00426, + "grad_norm": 0.8212981627676188, + "learning_rate": 0.001278, + "loss": 5.6637, + "step": 426 + }, + { + "epoch": 0.00427, + "grad_norm": 0.7602647353125763, + "learning_rate": 0.001281, + "loss": 5.6467, + "step": 427 + }, + { + "epoch": 0.00428, + "grad_norm": 0.677777266675102, + "learning_rate": 0.001284, + "loss": 5.6204, + "step": 428 + }, + { + "epoch": 0.00429, + "grad_norm": 0.5947091658499406, + "learning_rate": 0.001287, + "loss": 5.6311, + "step": 429 + }, + { + "epoch": 0.0043, + "grad_norm": 0.6377204770277832, + "learning_rate": 0.00129, + "loss": 5.6309, + "step": 430 + }, + { + "epoch": 0.00431, + "grad_norm": 0.6897719280155576, + "learning_rate": 0.001293, + "loss": 5.6193, + "step": 431 + }, + { + "epoch": 0.00432, + "grad_norm": 0.6884919593361081, + "learning_rate": 0.001296, + "loss": 5.6258, + "step": 432 + }, + { + "epoch": 0.00433, + "grad_norm": 0.6913913571918432, + "learning_rate": 0.001299, + "loss": 5.6177, + "step": 433 + }, + { + "epoch": 0.00434, + "grad_norm": 0.7261280979587743, + "learning_rate": 0.001302, + "loss": 5.6176, + "step": 434 + }, + { + "epoch": 0.00435, + "grad_norm": 0.8547702731757605, + "learning_rate": 0.001305, + "loss": 5.6162, + "step": 435 + }, + { + "epoch": 0.00436, + "grad_norm": 0.9457491419795808, + "learning_rate": 0.001308, + "loss": 5.5986, + "step": 436 + }, + { + "epoch": 0.00437, + "grad_norm": 0.9092672289397813, + "learning_rate": 0.001311, + "loss": 5.6144, + "step": 437 + }, + { + "epoch": 0.00438, + "grad_norm": 0.9049337850080227, + "learning_rate": 0.001314, + "loss": 5.6026, + "step": 438 + }, + { + "epoch": 0.00439, + "grad_norm": 0.7237349559204094, + "learning_rate": 0.001317, + "loss": 5.622, + "step": 439 + }, + { + "epoch": 0.0044, + "grad_norm": 0.8693791239531735, + "learning_rate": 0.00132, + "loss": 5.6008, + "step": 440 + }, + { + "epoch": 0.00441, + "grad_norm": 0.8508838859779835, + "learning_rate": 0.001323, + "loss": 5.5985, + "step": 441 + }, + { + "epoch": 0.00442, + "grad_norm": 0.6987140204651114, + "learning_rate": 0.0013260000000000001, + "loss": 5.587, + "step": 442 + }, + { + "epoch": 0.00443, + "grad_norm": 0.5719525863328404, + "learning_rate": 0.001329, + "loss": 5.5843, + "step": 443 + }, + { + "epoch": 0.00444, + "grad_norm": 0.5407139364493208, + "learning_rate": 0.001332, + "loss": 5.5841, + "step": 444 + }, + { + "epoch": 0.00445, + "grad_norm": 0.5170533332401992, + "learning_rate": 0.001335, + "loss": 5.5667, + "step": 445 + }, + { + "epoch": 0.00446, + "grad_norm": 0.43806698904849195, + "learning_rate": 0.001338, + "loss": 5.5666, + "step": 446 + }, + { + "epoch": 0.00447, + "grad_norm": 0.49048920433285326, + "learning_rate": 0.001341, + "loss": 5.5671, + "step": 447 + }, + { + "epoch": 0.00448, + "grad_norm": 0.46215050883864656, + "learning_rate": 0.0013440000000000001, + "loss": 5.5475, + "step": 448 + }, + { + "epoch": 0.00449, + "grad_norm": 0.5259389714982564, + "learning_rate": 0.001347, + "loss": 5.5523, + "step": 449 + }, + { + "epoch": 0.0045, + "grad_norm": 0.6261840891481112, + "learning_rate": 0.00135, + "loss": 5.5542, + "step": 450 + }, + { + "epoch": 0.00451, + "grad_norm": 0.659112753346069, + "learning_rate": 0.001353, + "loss": 5.5431, + "step": 451 + }, + { + "epoch": 0.00452, + "grad_norm": 0.6800042712218282, + "learning_rate": 0.001356, + "loss": 5.5311, + "step": 452 + }, + { + "epoch": 0.00453, + "grad_norm": 0.5745322110996829, + "learning_rate": 0.001359, + "loss": 5.5188, + "step": 453 + }, + { + "epoch": 0.00454, + "grad_norm": 0.5230006416449293, + "learning_rate": 0.0013620000000000001, + "loss": 5.5319, + "step": 454 + }, + { + "epoch": 0.00455, + "grad_norm": 0.5858648520183006, + "learning_rate": 0.0013650000000000001, + "loss": 5.5314, + "step": 455 + }, + { + "epoch": 0.00456, + "grad_norm": 0.5800568870191161, + "learning_rate": 0.001368, + "loss": 5.5269, + "step": 456 + }, + { + "epoch": 0.00457, + "grad_norm": 0.5535894536098482, + "learning_rate": 0.001371, + "loss": 5.5162, + "step": 457 + }, + { + "epoch": 0.00458, + "grad_norm": 0.56095957103827, + "learning_rate": 0.001374, + "loss": 5.5224, + "step": 458 + }, + { + "epoch": 0.00459, + "grad_norm": 0.7492551931077938, + "learning_rate": 0.0013770000000000002, + "loss": 5.517, + "step": 459 + }, + { + "epoch": 0.0046, + "grad_norm": 0.8251083608050601, + "learning_rate": 0.0013800000000000002, + "loss": 5.5084, + "step": 460 + }, + { + "epoch": 0.00461, + "grad_norm": 0.7810512714683711, + "learning_rate": 0.0013830000000000001, + "loss": 5.5038, + "step": 461 + }, + { + "epoch": 0.00462, + "grad_norm": 0.8065032793416945, + "learning_rate": 0.001386, + "loss": 5.5174, + "step": 462 + }, + { + "epoch": 0.00463, + "grad_norm": 1.0894770209329594, + "learning_rate": 0.001389, + "loss": 5.5013, + "step": 463 + }, + { + "epoch": 0.00464, + "grad_norm": 1.3225439160647088, + "learning_rate": 0.001392, + "loss": 5.5348, + "step": 464 + }, + { + "epoch": 0.00465, + "grad_norm": 1.0604196603169807, + "learning_rate": 0.0013950000000000002, + "loss": 5.4973, + "step": 465 + }, + { + "epoch": 0.00466, + "grad_norm": 1.279638390325454, + "learning_rate": 0.0013980000000000002, + "loss": 5.5365, + "step": 466 + }, + { + "epoch": 0.00467, + "grad_norm": 0.8376473861337255, + "learning_rate": 0.0014010000000000001, + "loss": 5.5034, + "step": 467 + }, + { + "epoch": 0.00468, + "grad_norm": 0.8787509291075707, + "learning_rate": 0.001404, + "loss": 5.4981, + "step": 468 + }, + { + "epoch": 0.00469, + "grad_norm": 0.8315105482947757, + "learning_rate": 0.001407, + "loss": 5.4995, + "step": 469 + }, + { + "epoch": 0.0047, + "grad_norm": 0.900233910053011, + "learning_rate": 0.00141, + "loss": 5.5098, + "step": 470 + }, + { + "epoch": 0.00471, + "grad_norm": 1.1782268624389831, + "learning_rate": 0.001413, + "loss": 5.5031, + "step": 471 + }, + { + "epoch": 0.00472, + "grad_norm": 0.8433457613569132, + "learning_rate": 0.001416, + "loss": 5.4989, + "step": 472 + }, + { + "epoch": 0.00473, + "grad_norm": 0.8984284318795871, + "learning_rate": 0.001419, + "loss": 5.5107, + "step": 473 + }, + { + "epoch": 0.00474, + "grad_norm": 0.8057807296189134, + "learning_rate": 0.0014219999999999999, + "loss": 5.4892, + "step": 474 + }, + { + "epoch": 0.00475, + "grad_norm": 0.8485388443307728, + "learning_rate": 0.001425, + "loss": 5.4826, + "step": 475 + }, + { + "epoch": 0.00476, + "grad_norm": 0.9809665505076786, + "learning_rate": 0.001428, + "loss": 5.5192, + "step": 476 + }, + { + "epoch": 0.00477, + "grad_norm": 0.9686040040277449, + "learning_rate": 0.001431, + "loss": 5.4785, + "step": 477 + }, + { + "epoch": 0.00478, + "grad_norm": 0.8580634771679295, + "learning_rate": 0.001434, + "loss": 5.4949, + "step": 478 + }, + { + "epoch": 0.00479, + "grad_norm": 0.9699411511566143, + "learning_rate": 0.001437, + "loss": 5.4782, + "step": 479 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8190893004419723, + "learning_rate": 0.0014399999999999999, + "loss": 5.4711, + "step": 480 + }, + { + "epoch": 0.00481, + "grad_norm": 0.7019568417012634, + "learning_rate": 0.001443, + "loss": 5.4711, + "step": 481 + }, + { + "epoch": 0.00482, + "grad_norm": 0.6677085458766991, + "learning_rate": 0.001446, + "loss": 5.4413, + "step": 482 + }, + { + "epoch": 0.00483, + "grad_norm": 0.6622223138809283, + "learning_rate": 0.001449, + "loss": 5.4499, + "step": 483 + }, + { + "epoch": 0.00484, + "grad_norm": 0.6831178312322733, + "learning_rate": 0.001452, + "loss": 5.4529, + "step": 484 + }, + { + "epoch": 0.00485, + "grad_norm": 0.6786720110326826, + "learning_rate": 0.001455, + "loss": 5.4548, + "step": 485 + }, + { + "epoch": 0.00486, + "grad_norm": 0.639008514866701, + "learning_rate": 0.001458, + "loss": 5.4237, + "step": 486 + }, + { + "epoch": 0.00487, + "grad_norm": 0.6663550617928226, + "learning_rate": 0.001461, + "loss": 5.4384, + "step": 487 + }, + { + "epoch": 0.00488, + "grad_norm": 0.5492133897414355, + "learning_rate": 0.001464, + "loss": 5.4132, + "step": 488 + }, + { + "epoch": 0.00489, + "grad_norm": 0.5801507624750007, + "learning_rate": 0.001467, + "loss": 5.4021, + "step": 489 + }, + { + "epoch": 0.0049, + "grad_norm": 0.7369258331072537, + "learning_rate": 0.00147, + "loss": 5.4206, + "step": 490 + }, + { + "epoch": 0.00491, + "grad_norm": 0.8149385883601376, + "learning_rate": 0.001473, + "loss": 5.4211, + "step": 491 + }, + { + "epoch": 0.00492, + "grad_norm": 0.7605903135404127, + "learning_rate": 0.001476, + "loss": 5.4167, + "step": 492 + }, + { + "epoch": 0.00493, + "grad_norm": 0.5930344173404182, + "learning_rate": 0.001479, + "loss": 5.4047, + "step": 493 + }, + { + "epoch": 0.00494, + "grad_norm": 0.7636413305061132, + "learning_rate": 0.001482, + "loss": 5.4167, + "step": 494 + }, + { + "epoch": 0.00495, + "grad_norm": 0.7369014833667976, + "learning_rate": 0.001485, + "loss": 5.4191, + "step": 495 + }, + { + "epoch": 0.00496, + "grad_norm": 0.8504550792082386, + "learning_rate": 0.001488, + "loss": 5.4031, + "step": 496 + }, + { + "epoch": 0.00497, + "grad_norm": 0.8843425860263048, + "learning_rate": 0.001491, + "loss": 5.3921, + "step": 497 + }, + { + "epoch": 0.00498, + "grad_norm": 0.7438750793797253, + "learning_rate": 0.001494, + "loss": 5.4145, + "step": 498 + }, + { + "epoch": 0.00499, + "grad_norm": 0.7036650868069556, + "learning_rate": 0.001497, + "loss": 5.3822, + "step": 499 + }, + { + "epoch": 0.005, + "grad_norm": 0.7877256477029045, + "learning_rate": 0.0015, + "loss": 5.3896, + "step": 500 + }, + { + "epoch": 0.00501, + "grad_norm": 0.7990985096145019, + "learning_rate": 0.001503, + "loss": 5.3912, + "step": 501 + }, + { + "epoch": 0.00502, + "grad_norm": 0.5932348165440957, + "learning_rate": 0.001506, + "loss": 5.3876, + "step": 502 + }, + { + "epoch": 0.00503, + "grad_norm": 0.6328380348360387, + "learning_rate": 0.0015090000000000001, + "loss": 5.391, + "step": 503 + }, + { + "epoch": 0.00504, + "grad_norm": 0.5819727032922326, + "learning_rate": 0.001512, + "loss": 5.3693, + "step": 504 + }, + { + "epoch": 0.00505, + "grad_norm": 0.5953710061568833, + "learning_rate": 0.001515, + "loss": 5.3594, + "step": 505 + }, + { + "epoch": 0.00506, + "grad_norm": 0.5986845177656173, + "learning_rate": 0.001518, + "loss": 5.3624, + "step": 506 + }, + { + "epoch": 0.00507, + "grad_norm": 0.623690249743195, + "learning_rate": 0.001521, + "loss": 5.3571, + "step": 507 + }, + { + "epoch": 0.00508, + "grad_norm": 0.653996676321799, + "learning_rate": 0.001524, + "loss": 5.3588, + "step": 508 + }, + { + "epoch": 0.00509, + "grad_norm": 0.7417086851753733, + "learning_rate": 0.0015270000000000001, + "loss": 5.3422, + "step": 509 + }, + { + "epoch": 0.0051, + "grad_norm": 0.7033408638361137, + "learning_rate": 0.0015300000000000001, + "loss": 5.3598, + "step": 510 + }, + { + "epoch": 0.00511, + "grad_norm": 0.7013752626190988, + "learning_rate": 0.001533, + "loss": 5.3361, + "step": 511 + }, + { + "epoch": 0.00512, + "grad_norm": 0.7403626060663853, + "learning_rate": 0.001536, + "loss": 5.3344, + "step": 512 + }, + { + "epoch": 0.00513, + "grad_norm": 0.7668257914395668, + "learning_rate": 0.001539, + "loss": 5.3488, + "step": 513 + }, + { + "epoch": 0.00514, + "grad_norm": 0.8677889222141009, + "learning_rate": 0.001542, + "loss": 5.3327, + "step": 514 + }, + { + "epoch": 0.00515, + "grad_norm": 0.896065553430359, + "learning_rate": 0.0015450000000000001, + "loss": 5.3422, + "step": 515 + }, + { + "epoch": 0.00516, + "grad_norm": 1.0837566571017694, + "learning_rate": 0.0015480000000000001, + "loss": 5.3497, + "step": 516 + }, + { + "epoch": 0.00517, + "grad_norm": 0.8071431981996826, + "learning_rate": 0.001551, + "loss": 5.3323, + "step": 517 + }, + { + "epoch": 0.00518, + "grad_norm": 0.7918860105262308, + "learning_rate": 0.001554, + "loss": 5.3156, + "step": 518 + }, + { + "epoch": 0.00519, + "grad_norm": 0.7777992304037674, + "learning_rate": 0.001557, + "loss": 5.3213, + "step": 519 + }, + { + "epoch": 0.0052, + "grad_norm": 0.8275508154311308, + "learning_rate": 0.0015600000000000002, + "loss": 5.3297, + "step": 520 + }, + { + "epoch": 0.00521, + "grad_norm": 1.081326682572488, + "learning_rate": 0.0015630000000000002, + "loss": 5.3161, + "step": 521 + }, + { + "epoch": 0.00522, + "grad_norm": 1.0769033841173197, + "learning_rate": 0.0015660000000000001, + "loss": 5.3235, + "step": 522 + }, + { + "epoch": 0.00523, + "grad_norm": 1.1142920349652348, + "learning_rate": 0.001569, + "loss": 5.3418, + "step": 523 + }, + { + "epoch": 0.00524, + "grad_norm": 0.9680819380144772, + "learning_rate": 0.001572, + "loss": 5.3387, + "step": 524 + }, + { + "epoch": 0.00525, + "grad_norm": 1.042843464512002, + "learning_rate": 0.001575, + "loss": 5.3364, + "step": 525 + }, + { + "epoch": 0.00526, + "grad_norm": 0.8760110028730904, + "learning_rate": 0.0015780000000000002, + "loss": 5.3079, + "step": 526 + }, + { + "epoch": 0.00527, + "grad_norm": 0.7131338611439731, + "learning_rate": 0.0015810000000000002, + "loss": 5.3127, + "step": 527 + }, + { + "epoch": 0.00528, + "grad_norm": 0.6786352126644868, + "learning_rate": 0.0015840000000000001, + "loss": 5.2868, + "step": 528 + }, + { + "epoch": 0.00529, + "grad_norm": 0.6952357562094686, + "learning_rate": 0.001587, + "loss": 5.2922, + "step": 529 + }, + { + "epoch": 0.0053, + "grad_norm": 0.8086799159810172, + "learning_rate": 0.00159, + "loss": 5.3039, + "step": 530 + }, + { + "epoch": 0.00531, + "grad_norm": 0.828973806141186, + "learning_rate": 0.001593, + "loss": 5.2873, + "step": 531 + }, + { + "epoch": 0.00532, + "grad_norm": 0.7467931538676229, + "learning_rate": 0.0015960000000000002, + "loss": 5.2943, + "step": 532 + }, + { + "epoch": 0.00533, + "grad_norm": 0.7141354989500697, + "learning_rate": 0.0015990000000000002, + "loss": 5.2786, + "step": 533 + }, + { + "epoch": 0.00534, + "grad_norm": 0.9764493557723114, + "learning_rate": 0.0016020000000000001, + "loss": 5.2728, + "step": 534 + }, + { + "epoch": 0.00535, + "grad_norm": 1.3880068471110967, + "learning_rate": 0.001605, + "loss": 5.2984, + "step": 535 + }, + { + "epoch": 0.00536, + "grad_norm": 0.9041341459356813, + "learning_rate": 0.001608, + "loss": 5.2855, + "step": 536 + }, + { + "epoch": 0.00537, + "grad_norm": 0.762740194970871, + "learning_rate": 0.0016110000000000002, + "loss": 5.2723, + "step": 537 + }, + { + "epoch": 0.00538, + "grad_norm": 0.787312661332683, + "learning_rate": 0.0016140000000000002, + "loss": 5.2506, + "step": 538 + }, + { + "epoch": 0.00539, + "grad_norm": 0.6102453826005042, + "learning_rate": 0.0016170000000000002, + "loss": 5.2365, + "step": 539 + }, + { + "epoch": 0.0054, + "grad_norm": 0.6664103859218952, + "learning_rate": 0.0016200000000000001, + "loss": 5.2513, + "step": 540 + }, + { + "epoch": 0.00541, + "grad_norm": 0.7228434818484509, + "learning_rate": 0.001623, + "loss": 5.2273, + "step": 541 + }, + { + "epoch": 0.00542, + "grad_norm": 0.9646545444558308, + "learning_rate": 0.001626, + "loss": 5.2641, + "step": 542 + }, + { + "epoch": 0.00543, + "grad_norm": 1.1121220265997553, + "learning_rate": 0.0016290000000000002, + "loss": 5.2329, + "step": 543 + }, + { + "epoch": 0.00544, + "grad_norm": 0.7994777164441184, + "learning_rate": 0.0016320000000000002, + "loss": 5.2404, + "step": 544 + }, + { + "epoch": 0.00545, + "grad_norm": 0.7226008260314222, + "learning_rate": 0.0016350000000000002, + "loss": 5.2461, + "step": 545 + }, + { + "epoch": 0.00546, + "grad_norm": 0.7699535423166085, + "learning_rate": 0.0016380000000000001, + "loss": 5.2193, + "step": 546 + }, + { + "epoch": 0.00547, + "grad_norm": 0.6548240326600666, + "learning_rate": 0.001641, + "loss": 5.2108, + "step": 547 + }, + { + "epoch": 0.00548, + "grad_norm": 0.6332922946851393, + "learning_rate": 0.001644, + "loss": 5.2061, + "step": 548 + }, + { + "epoch": 0.00549, + "grad_norm": 0.6231528959927674, + "learning_rate": 0.0016470000000000002, + "loss": 5.2, + "step": 549 + }, + { + "epoch": 0.0055, + "grad_norm": 0.7419840881368932, + "learning_rate": 0.0016500000000000002, + "loss": 5.2179, + "step": 550 + }, + { + "epoch": 0.00551, + "grad_norm": 0.7180205816820676, + "learning_rate": 0.0016530000000000002, + "loss": 5.2057, + "step": 551 + }, + { + "epoch": 0.00552, + "grad_norm": 0.5920069574731561, + "learning_rate": 0.0016560000000000001, + "loss": 5.1823, + "step": 552 + }, + { + "epoch": 0.00553, + "grad_norm": 0.7996998429214144, + "learning_rate": 0.001659, + "loss": 5.1806, + "step": 553 + }, + { + "epoch": 0.00554, + "grad_norm": 1.0229110500291838, + "learning_rate": 0.0016620000000000003, + "loss": 5.1965, + "step": 554 + }, + { + "epoch": 0.00555, + "grad_norm": 1.1118473608885646, + "learning_rate": 0.0016650000000000002, + "loss": 5.1994, + "step": 555 + }, + { + "epoch": 0.00556, + "grad_norm": 0.9366759039894813, + "learning_rate": 0.0016680000000000002, + "loss": 5.1806, + "step": 556 + }, + { + "epoch": 0.00557, + "grad_norm": 0.9046668934887724, + "learning_rate": 0.0016710000000000002, + "loss": 5.1671, + "step": 557 + }, + { + "epoch": 0.00558, + "grad_norm": 1.142251826676036, + "learning_rate": 0.0016740000000000001, + "loss": 5.2009, + "step": 558 + }, + { + "epoch": 0.00559, + "grad_norm": 1.0520781475504497, + "learning_rate": 0.001677, + "loss": 5.1865, + "step": 559 + }, + { + "epoch": 0.0056, + "grad_norm": 1.0780070897638405, + "learning_rate": 0.0016800000000000003, + "loss": 5.1609, + "step": 560 + }, + { + "epoch": 0.00561, + "grad_norm": 0.8904071170090557, + "learning_rate": 0.0016830000000000003, + "loss": 5.1755, + "step": 561 + }, + { + "epoch": 0.00562, + "grad_norm": 0.8189640026396579, + "learning_rate": 0.0016860000000000002, + "loss": 5.168, + "step": 562 + }, + { + "epoch": 0.00563, + "grad_norm": 0.746495696524217, + "learning_rate": 0.001689, + "loss": 5.1552, + "step": 563 + }, + { + "epoch": 0.00564, + "grad_norm": 0.7249953066463264, + "learning_rate": 0.001692, + "loss": 5.1416, + "step": 564 + }, + { + "epoch": 0.00565, + "grad_norm": 0.6193711615047397, + "learning_rate": 0.001695, + "loss": 5.1336, + "step": 565 + }, + { + "epoch": 0.00566, + "grad_norm": 0.8661212922050541, + "learning_rate": 0.0016979999999999999, + "loss": 5.1381, + "step": 566 + }, + { + "epoch": 0.00567, + "grad_norm": 0.9452019797636565, + "learning_rate": 0.0017009999999999998, + "loss": 5.1333, + "step": 567 + }, + { + "epoch": 0.00568, + "grad_norm": 0.8863756714851743, + "learning_rate": 0.0017039999999999998, + "loss": 5.1455, + "step": 568 + }, + { + "epoch": 0.00569, + "grad_norm": 0.8164512297006329, + "learning_rate": 0.001707, + "loss": 5.1087, + "step": 569 + }, + { + "epoch": 0.0057, + "grad_norm": 0.8055756655780417, + "learning_rate": 0.00171, + "loss": 5.1416, + "step": 570 + }, + { + "epoch": 0.00571, + "grad_norm": 0.9556127682537684, + "learning_rate": 0.001713, + "loss": 5.1421, + "step": 571 + }, + { + "epoch": 0.00572, + "grad_norm": 1.1121438340859977, + "learning_rate": 0.0017159999999999999, + "loss": 5.1242, + "step": 572 + }, + { + "epoch": 0.00573, + "grad_norm": 0.8538691427356556, + "learning_rate": 0.0017189999999999998, + "loss": 5.1261, + "step": 573 + }, + { + "epoch": 0.00574, + "grad_norm": 0.754134808897758, + "learning_rate": 0.001722, + "loss": 5.1186, + "step": 574 + }, + { + "epoch": 0.00575, + "grad_norm": 0.6045959777005846, + "learning_rate": 0.001725, + "loss": 5.0826, + "step": 575 + }, + { + "epoch": 0.00576, + "grad_norm": 0.5849168439848929, + "learning_rate": 0.001728, + "loss": 5.0868, + "step": 576 + }, + { + "epoch": 0.00577, + "grad_norm": 0.5881868482585118, + "learning_rate": 0.001731, + "loss": 5.0984, + "step": 577 + }, + { + "epoch": 0.00578, + "grad_norm": 0.6496481817365951, + "learning_rate": 0.0017339999999999999, + "loss": 5.0795, + "step": 578 + }, + { + "epoch": 0.00579, + "grad_norm": 0.7126042661301508, + "learning_rate": 0.0017369999999999998, + "loss": 5.0702, + "step": 579 + }, + { + "epoch": 0.0058, + "grad_norm": 0.691634070596695, + "learning_rate": 0.00174, + "loss": 5.0826, + "step": 580 + }, + { + "epoch": 0.00581, + "grad_norm": 0.6405819953602082, + "learning_rate": 0.001743, + "loss": 5.0809, + "step": 581 + }, + { + "epoch": 0.00582, + "grad_norm": 0.6144348123489994, + "learning_rate": 0.001746, + "loss": 5.0509, + "step": 582 + }, + { + "epoch": 0.00583, + "grad_norm": 0.5400038424579979, + "learning_rate": 0.001749, + "loss": 5.0752, + "step": 583 + }, + { + "epoch": 0.00584, + "grad_norm": 0.4936797939946059, + "learning_rate": 0.0017519999999999999, + "loss": 5.0634, + "step": 584 + }, + { + "epoch": 0.00585, + "grad_norm": 0.5420757595953297, + "learning_rate": 0.0017549999999999998, + "loss": 5.0509, + "step": 585 + }, + { + "epoch": 0.00586, + "grad_norm": 0.6461298473240921, + "learning_rate": 0.001758, + "loss": 5.0463, + "step": 586 + }, + { + "epoch": 0.00587, + "grad_norm": 1.0127747465457377, + "learning_rate": 0.001761, + "loss": 5.0477, + "step": 587 + }, + { + "epoch": 0.00588, + "grad_norm": 1.312605638466154, + "learning_rate": 0.001764, + "loss": 5.0646, + "step": 588 + }, + { + "epoch": 0.00589, + "grad_norm": 0.7336414507180539, + "learning_rate": 0.001767, + "loss": 5.0246, + "step": 589 + }, + { + "epoch": 0.0059, + "grad_norm": 0.9403709566188089, + "learning_rate": 0.0017699999999999999, + "loss": 5.0344, + "step": 590 + }, + { + "epoch": 0.00591, + "grad_norm": 1.1082528668309917, + "learning_rate": 0.001773, + "loss": 5.0455, + "step": 591 + }, + { + "epoch": 0.00592, + "grad_norm": 1.2840924567627583, + "learning_rate": 0.001776, + "loss": 5.0904, + "step": 592 + }, + { + "epoch": 0.00593, + "grad_norm": 0.7010705426983365, + "learning_rate": 0.001779, + "loss": 5.0507, + "step": 593 + }, + { + "epoch": 0.00594, + "grad_norm": 0.7515579064184676, + "learning_rate": 0.001782, + "loss": 5.0452, + "step": 594 + }, + { + "epoch": 0.00595, + "grad_norm": 0.8237589608472985, + "learning_rate": 0.001785, + "loss": 5.0574, + "step": 595 + }, + { + "epoch": 0.00596, + "grad_norm": 0.7511193245039597, + "learning_rate": 0.0017879999999999999, + "loss": 5.0458, + "step": 596 + }, + { + "epoch": 0.00597, + "grad_norm": 0.6951714106885373, + "learning_rate": 0.001791, + "loss": 5.0059, + "step": 597 + }, + { + "epoch": 0.00598, + "grad_norm": 0.6637745885790589, + "learning_rate": 0.001794, + "loss": 5.0231, + "step": 598 + }, + { + "epoch": 0.00599, + "grad_norm": 0.7127858481763457, + "learning_rate": 0.001797, + "loss": 5.0133, + "step": 599 + }, + { + "epoch": 0.006, + "grad_norm": 0.6761974733345899, + "learning_rate": 0.0018, + "loss": 5.0135, + "step": 600 + }, + { + "epoch": 0.00601, + "grad_norm": 0.6625605364634614, + "learning_rate": 0.001803, + "loss": 5.0015, + "step": 601 + }, + { + "epoch": 0.00602, + "grad_norm": 0.6742478727145375, + "learning_rate": 0.0018059999999999999, + "loss": 4.9862, + "step": 602 + }, + { + "epoch": 0.00603, + "grad_norm": 1.00015732542698, + "learning_rate": 0.001809, + "loss": 5.0193, + "step": 603 + }, + { + "epoch": 0.00604, + "grad_norm": 1.387382192884798, + "learning_rate": 0.001812, + "loss": 5.0251, + "step": 604 + }, + { + "epoch": 0.00605, + "grad_norm": 0.6727404947716551, + "learning_rate": 0.001815, + "loss": 5.0023, + "step": 605 + }, + { + "epoch": 0.00606, + "grad_norm": 0.9044609854709968, + "learning_rate": 0.001818, + "loss": 5.0189, + "step": 606 + }, + { + "epoch": 0.00607, + "grad_norm": 1.08596708759871, + "learning_rate": 0.001821, + "loss": 5.0221, + "step": 607 + }, + { + "epoch": 0.00608, + "grad_norm": 1.0369936566425986, + "learning_rate": 0.001824, + "loss": 5.008, + "step": 608 + }, + { + "epoch": 0.00609, + "grad_norm": 1.0935517991120203, + "learning_rate": 0.001827, + "loss": 5.0109, + "step": 609 + }, + { + "epoch": 0.0061, + "grad_norm": 0.9727711844599547, + "learning_rate": 0.00183, + "loss": 4.9666, + "step": 610 + }, + { + "epoch": 0.00611, + "grad_norm": 0.9492725313696737, + "learning_rate": 0.001833, + "loss": 4.9894, + "step": 611 + }, + { + "epoch": 0.00612, + "grad_norm": 0.845936691035656, + "learning_rate": 0.001836, + "loss": 4.9768, + "step": 612 + }, + { + "epoch": 0.00613, + "grad_norm": 0.917579763764549, + "learning_rate": 0.001839, + "loss": 4.9836, + "step": 613 + }, + { + "epoch": 0.00614, + "grad_norm": 0.8975809320202123, + "learning_rate": 0.001842, + "loss": 5.0024, + "step": 614 + }, + { + "epoch": 0.00615, + "grad_norm": 1.1935315831043936, + "learning_rate": 0.001845, + "loss": 5.0018, + "step": 615 + }, + { + "epoch": 0.00616, + "grad_norm": 0.9948318214992812, + "learning_rate": 0.001848, + "loss": 4.9924, + "step": 616 + }, + { + "epoch": 0.00617, + "grad_norm": 0.8063669388844663, + "learning_rate": 0.001851, + "loss": 4.9919, + "step": 617 + }, + { + "epoch": 0.00618, + "grad_norm": 0.8184910219660716, + "learning_rate": 0.001854, + "loss": 4.9666, + "step": 618 + }, + { + "epoch": 0.00619, + "grad_norm": 0.7780464806882716, + "learning_rate": 0.001857, + "loss": 4.9753, + "step": 619 + }, + { + "epoch": 0.0062, + "grad_norm": 0.7430630101852395, + "learning_rate": 0.00186, + "loss": 4.9566, + "step": 620 + }, + { + "epoch": 0.00621, + "grad_norm": 0.8040699289060931, + "learning_rate": 0.001863, + "loss": 4.9542, + "step": 621 + }, + { + "epoch": 0.00622, + "grad_norm": 0.8423285566803137, + "learning_rate": 0.001866, + "loss": 4.9653, + "step": 622 + }, + { + "epoch": 0.00623, + "grad_norm": 0.6802855865245334, + "learning_rate": 0.001869, + "loss": 4.9365, + "step": 623 + }, + { + "epoch": 0.00624, + "grad_norm": 0.7045868643772514, + "learning_rate": 0.001872, + "loss": 4.9425, + "step": 624 + }, + { + "epoch": 0.00625, + "grad_norm": 0.69605003901388, + "learning_rate": 0.001875, + "loss": 4.9397, + "step": 625 + }, + { + "epoch": 0.00626, + "grad_norm": 0.8788947819856907, + "learning_rate": 0.0018780000000000001, + "loss": 4.9403, + "step": 626 + }, + { + "epoch": 0.00627, + "grad_norm": 0.8580113274469313, + "learning_rate": 0.001881, + "loss": 4.9238, + "step": 627 + }, + { + "epoch": 0.00628, + "grad_norm": 0.7437087045232712, + "learning_rate": 0.001884, + "loss": 4.9553, + "step": 628 + }, + { + "epoch": 0.00629, + "grad_norm": 0.673794469112573, + "learning_rate": 0.001887, + "loss": 4.9059, + "step": 629 + }, + { + "epoch": 0.0063, + "grad_norm": 0.7529443514224647, + "learning_rate": 0.00189, + "loss": 4.9225, + "step": 630 + }, + { + "epoch": 0.00631, + "grad_norm": 0.7882316002133182, + "learning_rate": 0.0018930000000000002, + "loss": 4.9159, + "step": 631 + }, + { + "epoch": 0.00632, + "grad_norm": 0.7345089369079263, + "learning_rate": 0.0018960000000000001, + "loss": 4.9318, + "step": 632 + }, + { + "epoch": 0.00633, + "grad_norm": 0.807557335679046, + "learning_rate": 0.001899, + "loss": 4.9156, + "step": 633 + }, + { + "epoch": 0.00634, + "grad_norm": 0.856273971211143, + "learning_rate": 0.001902, + "loss": 4.9086, + "step": 634 + }, + { + "epoch": 0.00635, + "grad_norm": 0.8041095750030954, + "learning_rate": 0.001905, + "loss": 4.8993, + "step": 635 + }, + { + "epoch": 0.00636, + "grad_norm": 0.8334087326563642, + "learning_rate": 0.001908, + "loss": 4.9117, + "step": 636 + }, + { + "epoch": 0.00637, + "grad_norm": 0.8711627404236827, + "learning_rate": 0.0019110000000000002, + "loss": 4.9242, + "step": 637 + }, + { + "epoch": 0.00638, + "grad_norm": 0.950273886749592, + "learning_rate": 0.0019140000000000001, + "loss": 4.918, + "step": 638 + }, + { + "epoch": 0.00639, + "grad_norm": 0.9763758019156279, + "learning_rate": 0.001917, + "loss": 4.8946, + "step": 639 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9069546349974866, + "learning_rate": 0.00192, + "loss": 4.9049, + "step": 640 + }, + { + "epoch": 0.00641, + "grad_norm": 0.7602914411110755, + "learning_rate": 0.001923, + "loss": 4.8967, + "step": 641 + }, + { + "epoch": 0.00642, + "grad_norm": 0.6358369958975738, + "learning_rate": 0.001926, + "loss": 4.8823, + "step": 642 + }, + { + "epoch": 0.00643, + "grad_norm": 0.6298192488222032, + "learning_rate": 0.0019290000000000002, + "loss": 4.8984, + "step": 643 + }, + { + "epoch": 0.00644, + "grad_norm": 0.5835488483304159, + "learning_rate": 0.0019320000000000001, + "loss": 4.8719, + "step": 644 + }, + { + "epoch": 0.00645, + "grad_norm": 0.509893487039198, + "learning_rate": 0.001935, + "loss": 4.8697, + "step": 645 + }, + { + "epoch": 0.00646, + "grad_norm": 0.5016743009567477, + "learning_rate": 0.001938, + "loss": 4.8822, + "step": 646 + }, + { + "epoch": 0.00647, + "grad_norm": 0.4834671196673339, + "learning_rate": 0.001941, + "loss": 4.8737, + "step": 647 + }, + { + "epoch": 0.00648, + "grad_norm": 0.4900666309904975, + "learning_rate": 0.0019440000000000002, + "loss": 4.8665, + "step": 648 + }, + { + "epoch": 0.00649, + "grad_norm": 0.6242032394190251, + "learning_rate": 0.0019470000000000002, + "loss": 4.8569, + "step": 649 + }, + { + "epoch": 0.0065, + "grad_norm": 0.7946730011730083, + "learning_rate": 0.0019500000000000001, + "loss": 4.8944, + "step": 650 + }, + { + "epoch": 0.00651, + "grad_norm": 0.7813096551019217, + "learning_rate": 0.001953, + "loss": 4.8797, + "step": 651 + }, + { + "epoch": 0.00652, + "grad_norm": 0.5708054833927125, + "learning_rate": 0.0019560000000000003, + "loss": 4.8538, + "step": 652 + }, + { + "epoch": 0.00653, + "grad_norm": 0.6416767177196502, + "learning_rate": 0.0019590000000000002, + "loss": 4.8612, + "step": 653 + }, + { + "epoch": 0.00654, + "grad_norm": 0.6414247107018044, + "learning_rate": 0.001962, + "loss": 4.8324, + "step": 654 + }, + { + "epoch": 0.00655, + "grad_norm": 0.5608777579230684, + "learning_rate": 0.001965, + "loss": 4.8581, + "step": 655 + }, + { + "epoch": 0.00656, + "grad_norm": 0.4812696659686437, + "learning_rate": 0.001968, + "loss": 4.8497, + "step": 656 + }, + { + "epoch": 0.00657, + "grad_norm": 0.5196607021803705, + "learning_rate": 0.001971, + "loss": 4.8212, + "step": 657 + }, + { + "epoch": 0.00658, + "grad_norm": 0.5384007134004025, + "learning_rate": 0.001974, + "loss": 4.8422, + "step": 658 + }, + { + "epoch": 0.00659, + "grad_norm": 0.6084877834513672, + "learning_rate": 0.001977, + "loss": 4.8215, + "step": 659 + }, + { + "epoch": 0.0066, + "grad_norm": 0.7589081819730935, + "learning_rate": 0.00198, + "loss": 4.8434, + "step": 660 + }, + { + "epoch": 0.00661, + "grad_norm": 0.7941713837035096, + "learning_rate": 0.001983, + "loss": 4.8155, + "step": 661 + }, + { + "epoch": 0.00662, + "grad_norm": 0.7978547164974868, + "learning_rate": 0.0019860000000000004, + "loss": 4.8127, + "step": 662 + }, + { + "epoch": 0.00663, + "grad_norm": 0.893378015840618, + "learning_rate": 0.0019890000000000003, + "loss": 4.8265, + "step": 663 + }, + { + "epoch": 0.00664, + "grad_norm": 0.899949440268495, + "learning_rate": 0.0019920000000000003, + "loss": 4.8283, + "step": 664 + }, + { + "epoch": 0.00665, + "grad_norm": 0.8127988991300923, + "learning_rate": 0.0019950000000000002, + "loss": 4.7986, + "step": 665 + }, + { + "epoch": 0.00666, + "grad_norm": 0.9200900241429067, + "learning_rate": 0.001998, + "loss": 4.8226, + "step": 666 + }, + { + "epoch": 0.00667, + "grad_norm": 1.037264674390151, + "learning_rate": 0.002001, + "loss": 4.8324, + "step": 667 + }, + { + "epoch": 0.00668, + "grad_norm": 0.8082146942337904, + "learning_rate": 0.002004, + "loss": 4.8427, + "step": 668 + }, + { + "epoch": 0.00669, + "grad_norm": 0.7033624756486074, + "learning_rate": 0.002007, + "loss": 4.8562, + "step": 669 + }, + { + "epoch": 0.0067, + "grad_norm": 0.751969455636164, + "learning_rate": 0.00201, + "loss": 4.8525, + "step": 670 + }, + { + "epoch": 0.00671, + "grad_norm": 0.736520529365372, + "learning_rate": 0.002013, + "loss": 4.8206, + "step": 671 + }, + { + "epoch": 0.00672, + "grad_norm": 0.7466982774868701, + "learning_rate": 0.002016, + "loss": 4.8129, + "step": 672 + }, + { + "epoch": 0.00673, + "grad_norm": 0.7025220166479262, + "learning_rate": 0.002019, + "loss": 4.8146, + "step": 673 + }, + { + "epoch": 0.00674, + "grad_norm": 0.8461453039283889, + "learning_rate": 0.0020220000000000004, + "loss": 4.8144, + "step": 674 + }, + { + "epoch": 0.00675, + "grad_norm": 0.9399881649158435, + "learning_rate": 0.0020250000000000003, + "loss": 4.8482, + "step": 675 + }, + { + "epoch": 0.00676, + "grad_norm": 0.9357632097468723, + "learning_rate": 0.0020280000000000003, + "loss": 4.8268, + "step": 676 + }, + { + "epoch": 0.00677, + "grad_norm": 0.7758960619033557, + "learning_rate": 0.0020310000000000003, + "loss": 4.8204, + "step": 677 + }, + { + "epoch": 0.00678, + "grad_norm": 0.699292513140664, + "learning_rate": 0.0020340000000000002, + "loss": 4.8248, + "step": 678 + }, + { + "epoch": 0.00679, + "grad_norm": 0.7370787957429817, + "learning_rate": 0.002037, + "loss": 4.816, + "step": 679 + }, + { + "epoch": 0.0068, + "grad_norm": 0.8377547362902558, + "learning_rate": 0.00204, + "loss": 4.8174, + "step": 680 + }, + { + "epoch": 0.00681, + "grad_norm": 0.8259782799224379, + "learning_rate": 0.002043, + "loss": 4.8155, + "step": 681 + }, + { + "epoch": 0.00682, + "grad_norm": 0.7684261091318535, + "learning_rate": 0.002046, + "loss": 4.8082, + "step": 682 + }, + { + "epoch": 0.00683, + "grad_norm": 0.8487367019402318, + "learning_rate": 0.002049, + "loss": 4.7989, + "step": 683 + }, + { + "epoch": 0.00684, + "grad_norm": 0.8838018894616847, + "learning_rate": 0.002052, + "loss": 4.8194, + "step": 684 + }, + { + "epoch": 0.00685, + "grad_norm": 0.8860697203395584, + "learning_rate": 0.0020550000000000004, + "loss": 4.8252, + "step": 685 + }, + { + "epoch": 0.00686, + "grad_norm": 0.7336183086529302, + "learning_rate": 0.0020580000000000004, + "loss": 4.7837, + "step": 686 + }, + { + "epoch": 0.00687, + "grad_norm": 0.8176630379413288, + "learning_rate": 0.0020610000000000003, + "loss": 4.7944, + "step": 687 + }, + { + "epoch": 0.00688, + "grad_norm": 0.7703386551342313, + "learning_rate": 0.002064, + "loss": 4.7926, + "step": 688 + }, + { + "epoch": 0.00689, + "grad_norm": 0.6919162061146223, + "learning_rate": 0.002067, + "loss": 4.7965, + "step": 689 + }, + { + "epoch": 0.0069, + "grad_norm": 0.7424392154268248, + "learning_rate": 0.00207, + "loss": 4.7893, + "step": 690 + }, + { + "epoch": 0.00691, + "grad_norm": 0.6515618524352145, + "learning_rate": 0.0020729999999999998, + "loss": 4.7559, + "step": 691 + }, + { + "epoch": 0.00692, + "grad_norm": 0.6440846578393002, + "learning_rate": 0.0020759999999999997, + "loss": 4.776, + "step": 692 + }, + { + "epoch": 0.00693, + "grad_norm": 0.6847536481828279, + "learning_rate": 0.0020789999999999997, + "loss": 4.7889, + "step": 693 + }, + { + "epoch": 0.00694, + "grad_norm": 0.6321576161870056, + "learning_rate": 0.002082, + "loss": 4.7627, + "step": 694 + }, + { + "epoch": 0.00695, + "grad_norm": 0.5791129920715202, + "learning_rate": 0.002085, + "loss": 4.7609, + "step": 695 + }, + { + "epoch": 0.00696, + "grad_norm": 0.5895865438272808, + "learning_rate": 0.002088, + "loss": 4.7723, + "step": 696 + }, + { + "epoch": 0.00697, + "grad_norm": 0.5008187604770619, + "learning_rate": 0.002091, + "loss": 4.7695, + "step": 697 + }, + { + "epoch": 0.00698, + "grad_norm": 0.6970439697756265, + "learning_rate": 0.002094, + "loss": 4.7619, + "step": 698 + }, + { + "epoch": 0.00699, + "grad_norm": 0.8941704543265332, + "learning_rate": 0.002097, + "loss": 4.7572, + "step": 699 + }, + { + "epoch": 0.007, + "grad_norm": 0.9068627730041655, + "learning_rate": 0.0021, + "loss": 4.787, + "step": 700 + }, + { + "epoch": 0.00701, + "grad_norm": 0.7146483381512303, + "learning_rate": 0.002103, + "loss": 4.7547, + "step": 701 + }, + { + "epoch": 0.00702, + "grad_norm": 0.9172255209446268, + "learning_rate": 0.002106, + "loss": 4.77, + "step": 702 + }, + { + "epoch": 0.00703, + "grad_norm": 0.9047172643914575, + "learning_rate": 0.0021089999999999998, + "loss": 4.7553, + "step": 703 + }, + { + "epoch": 0.00704, + "grad_norm": 0.7853692419556185, + "learning_rate": 0.0021119999999999997, + "loss": 4.7583, + "step": 704 + }, + { + "epoch": 0.00705, + "grad_norm": 0.7199878385614988, + "learning_rate": 0.002115, + "loss": 4.7725, + "step": 705 + }, + { + "epoch": 0.00706, + "grad_norm": 0.7213393080579115, + "learning_rate": 0.002118, + "loss": 4.7581, + "step": 706 + }, + { + "epoch": 0.00707, + "grad_norm": 0.7597119331851468, + "learning_rate": 0.002121, + "loss": 4.7413, + "step": 707 + }, + { + "epoch": 0.00708, + "grad_norm": 0.6864102182118973, + "learning_rate": 0.002124, + "loss": 4.7187, + "step": 708 + }, + { + "epoch": 0.00709, + "grad_norm": 0.7815902187763394, + "learning_rate": 0.002127, + "loss": 4.7572, + "step": 709 + }, + { + "epoch": 0.0071, + "grad_norm": 0.8451784595752648, + "learning_rate": 0.00213, + "loss": 4.7552, + "step": 710 + }, + { + "epoch": 0.00711, + "grad_norm": 1.0054655399528605, + "learning_rate": 0.002133, + "loss": 4.7414, + "step": 711 + }, + { + "epoch": 0.00712, + "grad_norm": 0.9031323884556907, + "learning_rate": 0.002136, + "loss": 4.7728, + "step": 712 + }, + { + "epoch": 0.00713, + "grad_norm": 0.961250906275713, + "learning_rate": 0.002139, + "loss": 4.7862, + "step": 713 + }, + { + "epoch": 0.00714, + "grad_norm": 0.9556615314074448, + "learning_rate": 0.002142, + "loss": 4.7819, + "step": 714 + }, + { + "epoch": 0.00715, + "grad_norm": 0.837203607680531, + "learning_rate": 0.0021449999999999998, + "loss": 4.7417, + "step": 715 + }, + { + "epoch": 0.00716, + "grad_norm": 0.7607986282551458, + "learning_rate": 0.002148, + "loss": 4.7359, + "step": 716 + }, + { + "epoch": 0.00717, + "grad_norm": 0.8703365352693242, + "learning_rate": 0.002151, + "loss": 4.7519, + "step": 717 + }, + { + "epoch": 0.00718, + "grad_norm": 0.8830641357048177, + "learning_rate": 0.002154, + "loss": 4.7536, + "step": 718 + }, + { + "epoch": 0.00719, + "grad_norm": 0.8090298717986324, + "learning_rate": 0.002157, + "loss": 4.7586, + "step": 719 + }, + { + "epoch": 0.0072, + "grad_norm": 0.7002439324520396, + "learning_rate": 0.00216, + "loss": 4.7466, + "step": 720 + }, + { + "epoch": 0.00721, + "grad_norm": 0.7540412799334538, + "learning_rate": 0.002163, + "loss": 4.7512, + "step": 721 + }, + { + "epoch": 0.00722, + "grad_norm": 0.7234067697970273, + "learning_rate": 0.002166, + "loss": 4.7241, + "step": 722 + }, + { + "epoch": 0.00723, + "grad_norm": 0.5796869415275953, + "learning_rate": 0.002169, + "loss": 4.73, + "step": 723 + }, + { + "epoch": 0.00724, + "grad_norm": 0.6360613090935692, + "learning_rate": 0.002172, + "loss": 4.7294, + "step": 724 + }, + { + "epoch": 0.00725, + "grad_norm": 0.6592111108932344, + "learning_rate": 0.002175, + "loss": 4.7232, + "step": 725 + }, + { + "epoch": 0.00726, + "grad_norm": 0.7000176967246123, + "learning_rate": 0.002178, + "loss": 4.7406, + "step": 726 + }, + { + "epoch": 0.00727, + "grad_norm": 0.6658154130327723, + "learning_rate": 0.0021809999999999998, + "loss": 4.7131, + "step": 727 + }, + { + "epoch": 0.00728, + "grad_norm": 0.500886702178687, + "learning_rate": 0.002184, + "loss": 4.7222, + "step": 728 + }, + { + "epoch": 0.00729, + "grad_norm": 0.553445989931654, + "learning_rate": 0.002187, + "loss": 4.7196, + "step": 729 + }, + { + "epoch": 0.0073, + "grad_norm": 0.5928953773304845, + "learning_rate": 0.00219, + "loss": 4.7153, + "step": 730 + }, + { + "epoch": 0.00731, + "grad_norm": 0.5280339609019513, + "learning_rate": 0.002193, + "loss": 4.7069, + "step": 731 + }, + { + "epoch": 0.00732, + "grad_norm": 0.4601497488067425, + "learning_rate": 0.002196, + "loss": 4.7146, + "step": 732 + }, + { + "epoch": 0.00733, + "grad_norm": 0.4831437067076967, + "learning_rate": 0.002199, + "loss": 4.6865, + "step": 733 + }, + { + "epoch": 0.00734, + "grad_norm": 0.48957731222976764, + "learning_rate": 0.002202, + "loss": 4.7176, + "step": 734 + }, + { + "epoch": 0.00735, + "grad_norm": 0.5029506248084066, + "learning_rate": 0.002205, + "loss": 4.7231, + "step": 735 + }, + { + "epoch": 0.00736, + "grad_norm": 0.5300436466729722, + "learning_rate": 0.002208, + "loss": 4.7045, + "step": 736 + }, + { + "epoch": 0.00737, + "grad_norm": 0.5354857814520255, + "learning_rate": 0.002211, + "loss": 4.6701, + "step": 737 + }, + { + "epoch": 0.00738, + "grad_norm": 0.6855959285026678, + "learning_rate": 0.002214, + "loss": 4.6857, + "step": 738 + }, + { + "epoch": 0.00739, + "grad_norm": 0.7193696222416395, + "learning_rate": 0.0022170000000000002, + "loss": 4.6773, + "step": 739 + }, + { + "epoch": 0.0074, + "grad_norm": 0.7000843676029133, + "learning_rate": 0.00222, + "loss": 4.686, + "step": 740 + }, + { + "epoch": 0.00741, + "grad_norm": 0.8262482718120322, + "learning_rate": 0.002223, + "loss": 4.6648, + "step": 741 + }, + { + "epoch": 0.00742, + "grad_norm": 0.8068053565529363, + "learning_rate": 0.002226, + "loss": 4.71, + "step": 742 + }, + { + "epoch": 0.00743, + "grad_norm": 0.7713935209386231, + "learning_rate": 0.002229, + "loss": 4.6667, + "step": 743 + }, + { + "epoch": 0.00744, + "grad_norm": 0.6806090978340125, + "learning_rate": 0.002232, + "loss": 4.668, + "step": 744 + }, + { + "epoch": 0.00745, + "grad_norm": 0.8150134425373752, + "learning_rate": 0.002235, + "loss": 4.6906, + "step": 745 + }, + { + "epoch": 0.00746, + "grad_norm": 0.9083405480081935, + "learning_rate": 0.002238, + "loss": 4.6909, + "step": 746 + }, + { + "epoch": 0.00747, + "grad_norm": 1.1312224699232258, + "learning_rate": 0.002241, + "loss": 4.6956, + "step": 747 + }, + { + "epoch": 0.00748, + "grad_norm": 0.8174601291810354, + "learning_rate": 0.002244, + "loss": 4.6973, + "step": 748 + }, + { + "epoch": 0.00749, + "grad_norm": 0.8423282795209855, + "learning_rate": 0.002247, + "loss": 4.6802, + "step": 749 + }, + { + "epoch": 0.0075, + "grad_norm": 0.8679250685858194, + "learning_rate": 0.0022500000000000003, + "loss": 4.7268, + "step": 750 + }, + { + "epoch": 0.00751, + "grad_norm": 1.0939871221004271, + "learning_rate": 0.0022530000000000002, + "loss": 4.7337, + "step": 751 + }, + { + "epoch": 0.00752, + "grad_norm": 0.9886618564303525, + "learning_rate": 0.002256, + "loss": 4.6822, + "step": 752 + }, + { + "epoch": 0.00753, + "grad_norm": 0.9258452428585717, + "learning_rate": 0.002259, + "loss": 4.7192, + "step": 753 + }, + { + "epoch": 0.00754, + "grad_norm": 0.8790583060068752, + "learning_rate": 0.002262, + "loss": 4.7345, + "step": 754 + }, + { + "epoch": 0.00755, + "grad_norm": 0.7727162685258321, + "learning_rate": 0.002265, + "loss": 4.6919, + "step": 755 + }, + { + "epoch": 0.00756, + "grad_norm": 0.8048369196552551, + "learning_rate": 0.002268, + "loss": 4.6801, + "step": 756 + }, + { + "epoch": 0.00757, + "grad_norm": 0.7307749692176225, + "learning_rate": 0.002271, + "loss": 4.6902, + "step": 757 + }, + { + "epoch": 0.00758, + "grad_norm": 0.7628918458541498, + "learning_rate": 0.002274, + "loss": 4.6882, + "step": 758 + }, + { + "epoch": 0.00759, + "grad_norm": 0.6811469027490352, + "learning_rate": 0.002277, + "loss": 4.693, + "step": 759 + }, + { + "epoch": 0.0076, + "grad_norm": 0.5530412513377371, + "learning_rate": 0.00228, + "loss": 4.6735, + "step": 760 + }, + { + "epoch": 0.00761, + "grad_norm": 0.5221449888101848, + "learning_rate": 0.002283, + "loss": 4.6405, + "step": 761 + }, + { + "epoch": 0.00762, + "grad_norm": 0.5687089394846262, + "learning_rate": 0.0022860000000000003, + "loss": 4.6208, + "step": 762 + }, + { + "epoch": 0.00763, + "grad_norm": 0.5814285508645848, + "learning_rate": 0.0022890000000000002, + "loss": 4.6718, + "step": 763 + }, + { + "epoch": 0.00764, + "grad_norm": 0.6388540284979518, + "learning_rate": 0.002292, + "loss": 4.683, + "step": 764 + }, + { + "epoch": 0.00765, + "grad_norm": 0.7278589788698283, + "learning_rate": 0.002295, + "loss": 4.6752, + "step": 765 + }, + { + "epoch": 0.00766, + "grad_norm": 0.7050667087459527, + "learning_rate": 0.002298, + "loss": 4.6715, + "step": 766 + }, + { + "epoch": 0.00767, + "grad_norm": 0.6023307971425665, + "learning_rate": 0.002301, + "loss": 4.6623, + "step": 767 + }, + { + "epoch": 0.00768, + "grad_norm": 0.7162182495110988, + "learning_rate": 0.002304, + "loss": 4.6687, + "step": 768 + }, + { + "epoch": 0.00769, + "grad_norm": 0.8754398583131885, + "learning_rate": 0.002307, + "loss": 4.6855, + "step": 769 + }, + { + "epoch": 0.0077, + "grad_norm": 0.9282950433205286, + "learning_rate": 0.00231, + "loss": 4.656, + "step": 770 + }, + { + "epoch": 0.00771, + "grad_norm": 0.8826983762517153, + "learning_rate": 0.002313, + "loss": 4.6927, + "step": 771 + }, + { + "epoch": 0.00772, + "grad_norm": 0.7955428299875815, + "learning_rate": 0.002316, + "loss": 4.6752, + "step": 772 + }, + { + "epoch": 0.00773, + "grad_norm": 0.6879017191297421, + "learning_rate": 0.0023190000000000003, + "loss": 4.6732, + "step": 773 + }, + { + "epoch": 0.00774, + "grad_norm": 0.5805904836674535, + "learning_rate": 0.0023220000000000003, + "loss": 4.6842, + "step": 774 + }, + { + "epoch": 0.00775, + "grad_norm": 0.5872309146101224, + "learning_rate": 0.0023250000000000002, + "loss": 4.6741, + "step": 775 + }, + { + "epoch": 0.00776, + "grad_norm": 0.6663800275805344, + "learning_rate": 0.002328, + "loss": 4.6762, + "step": 776 + }, + { + "epoch": 0.00777, + "grad_norm": 0.6475349218207965, + "learning_rate": 0.002331, + "loss": 4.6499, + "step": 777 + }, + { + "epoch": 0.00778, + "grad_norm": 0.6498373909650491, + "learning_rate": 0.002334, + "loss": 4.6643, + "step": 778 + }, + { + "epoch": 0.00779, + "grad_norm": 0.6357690952406082, + "learning_rate": 0.002337, + "loss": 4.6181, + "step": 779 + }, + { + "epoch": 0.0078, + "grad_norm": 0.6241849680287349, + "learning_rate": 0.00234, + "loss": 4.6419, + "step": 780 + }, + { + "epoch": 0.00781, + "grad_norm": 0.6808062655697796, + "learning_rate": 0.002343, + "loss": 4.6451, + "step": 781 + }, + { + "epoch": 0.00782, + "grad_norm": 0.7065267585011001, + "learning_rate": 0.002346, + "loss": 4.6436, + "step": 782 + }, + { + "epoch": 0.00783, + "grad_norm": 0.6381701638777938, + "learning_rate": 0.002349, + "loss": 4.6242, + "step": 783 + }, + { + "epoch": 0.00784, + "grad_norm": 0.647841330234918, + "learning_rate": 0.002352, + "loss": 4.6355, + "step": 784 + }, + { + "epoch": 0.00785, + "grad_norm": 0.60562299847623, + "learning_rate": 0.0023550000000000003, + "loss": 4.6345, + "step": 785 + }, + { + "epoch": 0.00786, + "grad_norm": 0.5148036449557751, + "learning_rate": 0.0023580000000000003, + "loss": 4.628, + "step": 786 + }, + { + "epoch": 0.00787, + "grad_norm": 0.7217903580979332, + "learning_rate": 0.0023610000000000003, + "loss": 4.641, + "step": 787 + }, + { + "epoch": 0.00788, + "grad_norm": 0.8515178253715305, + "learning_rate": 0.002364, + "loss": 4.6466, + "step": 788 + }, + { + "epoch": 0.00789, + "grad_norm": 0.8831947515557061, + "learning_rate": 0.002367, + "loss": 4.6753, + "step": 789 + }, + { + "epoch": 0.0079, + "grad_norm": 0.9816312249435458, + "learning_rate": 0.00237, + "loss": 4.6574, + "step": 790 + }, + { + "epoch": 0.00791, + "grad_norm": 0.8257428278422617, + "learning_rate": 0.002373, + "loss": 4.6081, + "step": 791 + }, + { + "epoch": 0.00792, + "grad_norm": 0.6889392893349975, + "learning_rate": 0.002376, + "loss": 4.6372, + "step": 792 + }, + { + "epoch": 0.00793, + "grad_norm": 0.5470340397913868, + "learning_rate": 0.002379, + "loss": 4.6272, + "step": 793 + }, + { + "epoch": 0.00794, + "grad_norm": 0.5333909794818302, + "learning_rate": 0.002382, + "loss": 4.6359, + "step": 794 + }, + { + "epoch": 0.00795, + "grad_norm": 0.47330116805918854, + "learning_rate": 0.002385, + "loss": 4.6105, + "step": 795 + }, + { + "epoch": 0.00796, + "grad_norm": 0.46996625387544017, + "learning_rate": 0.0023880000000000004, + "loss": 4.6049, + "step": 796 + }, + { + "epoch": 0.00797, + "grad_norm": 0.4648603328586337, + "learning_rate": 0.0023910000000000003, + "loss": 4.6461, + "step": 797 + }, + { + "epoch": 0.00798, + "grad_norm": 0.48147045136320854, + "learning_rate": 0.0023940000000000003, + "loss": 4.6125, + "step": 798 + }, + { + "epoch": 0.00799, + "grad_norm": 0.5520002928611956, + "learning_rate": 0.0023970000000000003, + "loss": 4.6461, + "step": 799 + }, + { + "epoch": 0.008, + "grad_norm": 0.49520453644625784, + "learning_rate": 0.0024000000000000002, + "loss": 4.5958, + "step": 800 + }, + { + "epoch": 0.00801, + "grad_norm": 0.4617883447215667, + "learning_rate": 0.002403, + "loss": 4.6244, + "step": 801 + }, + { + "epoch": 0.00802, + "grad_norm": 0.6171622349156032, + "learning_rate": 0.002406, + "loss": 4.6206, + "step": 802 + }, + { + "epoch": 0.00803, + "grad_norm": 0.7819651777797723, + "learning_rate": 0.002409, + "loss": 4.605, + "step": 803 + }, + { + "epoch": 0.00804, + "grad_norm": 0.8072878053268496, + "learning_rate": 0.002412, + "loss": 4.6195, + "step": 804 + }, + { + "epoch": 0.00805, + "grad_norm": 0.6878812715646375, + "learning_rate": 0.002415, + "loss": 4.6081, + "step": 805 + }, + { + "epoch": 0.00806, + "grad_norm": 0.7628005366591507, + "learning_rate": 0.002418, + "loss": 4.6308, + "step": 806 + }, + { + "epoch": 0.00807, + "grad_norm": 0.7958527408861041, + "learning_rate": 0.0024210000000000004, + "loss": 4.6228, + "step": 807 + }, + { + "epoch": 0.00808, + "grad_norm": 0.7899389450719584, + "learning_rate": 0.0024240000000000004, + "loss": 4.6083, + "step": 808 + }, + { + "epoch": 0.00809, + "grad_norm": 1.193539574978161, + "learning_rate": 0.0024270000000000003, + "loss": 4.6201, + "step": 809 + }, + { + "epoch": 0.0081, + "grad_norm": 1.0067442084818319, + "learning_rate": 0.0024300000000000003, + "loss": 4.6554, + "step": 810 + }, + { + "epoch": 0.00811, + "grad_norm": 1.0302946243785736, + "learning_rate": 0.0024330000000000003, + "loss": 4.6338, + "step": 811 + }, + { + "epoch": 0.00812, + "grad_norm": 0.7966936317025509, + "learning_rate": 0.0024360000000000002, + "loss": 4.6044, + "step": 812 + }, + { + "epoch": 0.00813, + "grad_norm": 0.7843512654561826, + "learning_rate": 0.0024389999999999998, + "loss": 4.6449, + "step": 813 + }, + { + "epoch": 0.00814, + "grad_norm": 0.7981724618057067, + "learning_rate": 0.0024419999999999997, + "loss": 4.6482, + "step": 814 + }, + { + "epoch": 0.00815, + "grad_norm": 0.8382113380987876, + "learning_rate": 0.0024449999999999997, + "loss": 4.6588, + "step": 815 + }, + { + "epoch": 0.00816, + "grad_norm": 0.8204581966398267, + "learning_rate": 0.002448, + "loss": 4.6311, + "step": 816 + }, + { + "epoch": 0.00817, + "grad_norm": 1.02336938073518, + "learning_rate": 0.002451, + "loss": 4.6583, + "step": 817 + }, + { + "epoch": 0.00818, + "grad_norm": 0.815478540285485, + "learning_rate": 0.002454, + "loss": 4.6309, + "step": 818 + }, + { + "epoch": 0.00819, + "grad_norm": 0.9108711148393207, + "learning_rate": 0.002457, + "loss": 4.6483, + "step": 819 + }, + { + "epoch": 0.0082, + "grad_norm": 0.8773364113378127, + "learning_rate": 0.00246, + "loss": 4.6373, + "step": 820 + }, + { + "epoch": 0.00821, + "grad_norm": 0.8093045902813614, + "learning_rate": 0.002463, + "loss": 4.6126, + "step": 821 + }, + { + "epoch": 0.00822, + "grad_norm": 0.8775561270633004, + "learning_rate": 0.002466, + "loss": 4.6164, + "step": 822 + }, + { + "epoch": 0.00823, + "grad_norm": 0.8916321669966187, + "learning_rate": 0.002469, + "loss": 4.6298, + "step": 823 + }, + { + "epoch": 0.00824, + "grad_norm": 0.7939051533904264, + "learning_rate": 0.002472, + "loss": 4.6511, + "step": 824 + }, + { + "epoch": 0.00825, + "grad_norm": 0.9509206049767348, + "learning_rate": 0.0024749999999999998, + "loss": 4.6625, + "step": 825 + }, + { + "epoch": 0.00826, + "grad_norm": 1.0914612729506281, + "learning_rate": 0.0024779999999999997, + "loss": 4.6356, + "step": 826 + }, + { + "epoch": 0.00827, + "grad_norm": 1.0440371247088225, + "learning_rate": 0.002481, + "loss": 4.6709, + "step": 827 + }, + { + "epoch": 0.00828, + "grad_norm": 1.0292672719671891, + "learning_rate": 0.002484, + "loss": 4.6644, + "step": 828 + }, + { + "epoch": 0.00829, + "grad_norm": 1.002549149572049, + "learning_rate": 0.002487, + "loss": 4.6608, + "step": 829 + }, + { + "epoch": 0.0083, + "grad_norm": 0.8229445260626227, + "learning_rate": 0.00249, + "loss": 4.6642, + "step": 830 + }, + { + "epoch": 0.00831, + "grad_norm": 0.7223475739297199, + "learning_rate": 0.002493, + "loss": 4.6379, + "step": 831 + }, + { + "epoch": 0.00832, + "grad_norm": 0.6422365780456449, + "learning_rate": 0.002496, + "loss": 4.6349, + "step": 832 + }, + { + "epoch": 0.00833, + "grad_norm": 0.6433276699815419, + "learning_rate": 0.002499, + "loss": 4.626, + "step": 833 + }, + { + "epoch": 0.00834, + "grad_norm": 0.6136587911860008, + "learning_rate": 0.002502, + "loss": 4.6284, + "step": 834 + }, + { + "epoch": 0.00835, + "grad_norm": 0.5892258230535582, + "learning_rate": 0.002505, + "loss": 4.617, + "step": 835 + }, + { + "epoch": 0.00836, + "grad_norm": 0.5407454758774727, + "learning_rate": 0.002508, + "loss": 4.6416, + "step": 836 + }, + { + "epoch": 0.00837, + "grad_norm": 0.5840603338652609, + "learning_rate": 0.0025109999999999998, + "loss": 4.582, + "step": 837 + }, + { + "epoch": 0.00838, + "grad_norm": 0.5192725472759927, + "learning_rate": 0.0025139999999999997, + "loss": 4.6102, + "step": 838 + }, + { + "epoch": 0.00839, + "grad_norm": 0.5064380785759203, + "learning_rate": 0.002517, + "loss": 4.6034, + "step": 839 + }, + { + "epoch": 0.0084, + "grad_norm": 0.48476724687493267, + "learning_rate": 0.00252, + "loss": 4.5854, + "step": 840 + }, + { + "epoch": 0.00841, + "grad_norm": 0.44496684540968734, + "learning_rate": 0.002523, + "loss": 4.5786, + "step": 841 + }, + { + "epoch": 0.00842, + "grad_norm": 0.42609007840223895, + "learning_rate": 0.002526, + "loss": 4.5771, + "step": 842 + }, + { + "epoch": 0.00843, + "grad_norm": 0.4703393687667864, + "learning_rate": 0.002529, + "loss": 4.5751, + "step": 843 + }, + { + "epoch": 0.00844, + "grad_norm": 0.4976619641323943, + "learning_rate": 0.002532, + "loss": 4.5956, + "step": 844 + }, + { + "epoch": 0.00845, + "grad_norm": 0.530881004052944, + "learning_rate": 0.002535, + "loss": 4.5625, + "step": 845 + }, + { + "epoch": 0.00846, + "grad_norm": 0.5069253535552343, + "learning_rate": 0.002538, + "loss": 4.584, + "step": 846 + }, + { + "epoch": 0.00847, + "grad_norm": 0.4900054637856495, + "learning_rate": 0.002541, + "loss": 4.5637, + "step": 847 + }, + { + "epoch": 0.00848, + "grad_norm": 0.5361511355183629, + "learning_rate": 0.002544, + "loss": 4.5693, + "step": 848 + }, + { + "epoch": 0.00849, + "grad_norm": 0.6067359238432654, + "learning_rate": 0.002547, + "loss": 4.5644, + "step": 849 + }, + { + "epoch": 0.0085, + "grad_norm": 0.5519192846207763, + "learning_rate": 0.00255, + "loss": 4.573, + "step": 850 + }, + { + "epoch": 0.00851, + "grad_norm": 0.46694877598438245, + "learning_rate": 0.002553, + "loss": 4.5875, + "step": 851 + }, + { + "epoch": 0.00852, + "grad_norm": 0.477565098915178, + "learning_rate": 0.002556, + "loss": 4.5765, + "step": 852 + }, + { + "epoch": 0.00853, + "grad_norm": 0.5020213435824815, + "learning_rate": 0.002559, + "loss": 4.5556, + "step": 853 + }, + { + "epoch": 0.00854, + "grad_norm": 0.5171409161048013, + "learning_rate": 0.002562, + "loss": 4.5495, + "step": 854 + }, + { + "epoch": 0.00855, + "grad_norm": 0.46627459343076927, + "learning_rate": 0.002565, + "loss": 4.5252, + "step": 855 + }, + { + "epoch": 0.00856, + "grad_norm": 0.5139521756940325, + "learning_rate": 0.002568, + "loss": 4.5623, + "step": 856 + }, + { + "epoch": 0.00857, + "grad_norm": 0.6011403998041547, + "learning_rate": 0.002571, + "loss": 4.5577, + "step": 857 + }, + { + "epoch": 0.00858, + "grad_norm": 0.554768384377006, + "learning_rate": 0.002574, + "loss": 4.5487, + "step": 858 + }, + { + "epoch": 0.00859, + "grad_norm": 0.539858542755145, + "learning_rate": 0.002577, + "loss": 4.5383, + "step": 859 + }, + { + "epoch": 0.0086, + "grad_norm": 0.586599307397693, + "learning_rate": 0.00258, + "loss": 4.571, + "step": 860 + }, + { + "epoch": 0.00861, + "grad_norm": 0.7154666400015554, + "learning_rate": 0.0025830000000000002, + "loss": 4.5928, + "step": 861 + }, + { + "epoch": 0.00862, + "grad_norm": 0.7345971792604707, + "learning_rate": 0.002586, + "loss": 4.5402, + "step": 862 + }, + { + "epoch": 0.00863, + "grad_norm": 0.7491760821516434, + "learning_rate": 0.002589, + "loss": 4.5765, + "step": 863 + }, + { + "epoch": 0.00864, + "grad_norm": 0.9255705361922033, + "learning_rate": 0.002592, + "loss": 4.5322, + "step": 864 + }, + { + "epoch": 0.00865, + "grad_norm": 0.9964986146275199, + "learning_rate": 0.002595, + "loss": 4.5664, + "step": 865 + }, + { + "epoch": 0.00866, + "grad_norm": 0.7618488122087141, + "learning_rate": 0.002598, + "loss": 4.5836, + "step": 866 + }, + { + "epoch": 0.00867, + "grad_norm": 0.8524780083566116, + "learning_rate": 0.002601, + "loss": 4.5742, + "step": 867 + }, + { + "epoch": 0.00868, + "grad_norm": 0.9692981312410378, + "learning_rate": 0.002604, + "loss": 4.5808, + "step": 868 + }, + { + "epoch": 0.00869, + "grad_norm": 1.1822252043975705, + "learning_rate": 0.002607, + "loss": 4.6073, + "step": 869 + }, + { + "epoch": 0.0087, + "grad_norm": 0.9057663759386707, + "learning_rate": 0.00261, + "loss": 4.5844, + "step": 870 + }, + { + "epoch": 0.00871, + "grad_norm": 0.9457338978675252, + "learning_rate": 0.002613, + "loss": 4.6115, + "step": 871 + }, + { + "epoch": 0.00872, + "grad_norm": 0.9845348105394848, + "learning_rate": 0.002616, + "loss": 4.5975, + "step": 872 + }, + { + "epoch": 0.00873, + "grad_norm": 0.8202179076205192, + "learning_rate": 0.0026190000000000002, + "loss": 4.5967, + "step": 873 + }, + { + "epoch": 0.00874, + "grad_norm": 0.6587988147688274, + "learning_rate": 0.002622, + "loss": 4.6142, + "step": 874 + }, + { + "epoch": 0.00875, + "grad_norm": 0.6312495265838277, + "learning_rate": 0.002625, + "loss": 4.5549, + "step": 875 + }, + { + "epoch": 0.00876, + "grad_norm": 0.6646817876274769, + "learning_rate": 0.002628, + "loss": 4.5838, + "step": 876 + }, + { + "epoch": 0.00877, + "grad_norm": 0.632354886157607, + "learning_rate": 0.002631, + "loss": 4.6011, + "step": 877 + }, + { + "epoch": 0.00878, + "grad_norm": 0.5630676163174572, + "learning_rate": 0.002634, + "loss": 4.5288, + "step": 878 + }, + { + "epoch": 0.00879, + "grad_norm": 0.4918531988998375, + "learning_rate": 0.002637, + "loss": 4.5559, + "step": 879 + }, + { + "epoch": 0.0088, + "grad_norm": 0.42476181684324305, + "learning_rate": 0.00264, + "loss": 4.5634, + "step": 880 + }, + { + "epoch": 0.00881, + "grad_norm": 0.4573573466722849, + "learning_rate": 0.002643, + "loss": 4.5644, + "step": 881 + }, + { + "epoch": 0.00882, + "grad_norm": 0.5881448674370812, + "learning_rate": 0.002646, + "loss": 4.5659, + "step": 882 + }, + { + "epoch": 0.00883, + "grad_norm": 0.7764456560266775, + "learning_rate": 0.002649, + "loss": 4.5645, + "step": 883 + }, + { + "epoch": 0.00884, + "grad_norm": 0.9651176541039754, + "learning_rate": 0.0026520000000000003, + "loss": 4.5629, + "step": 884 + }, + { + "epoch": 0.00885, + "grad_norm": 1.0453630458113787, + "learning_rate": 0.0026550000000000002, + "loss": 4.5675, + "step": 885 + }, + { + "epoch": 0.00886, + "grad_norm": 0.8249295551343052, + "learning_rate": 0.002658, + "loss": 4.5811, + "step": 886 + }, + { + "epoch": 0.00887, + "grad_norm": 0.6632170677276661, + "learning_rate": 0.002661, + "loss": 4.5643, + "step": 887 + }, + { + "epoch": 0.00888, + "grad_norm": 0.7818922566742896, + "learning_rate": 0.002664, + "loss": 4.5709, + "step": 888 + }, + { + "epoch": 0.00889, + "grad_norm": 0.8131057291041344, + "learning_rate": 0.002667, + "loss": 4.5276, + "step": 889 + }, + { + "epoch": 0.0089, + "grad_norm": 0.7364786352062309, + "learning_rate": 0.00267, + "loss": 4.5735, + "step": 890 + }, + { + "epoch": 0.00891, + "grad_norm": 0.6174568923006037, + "learning_rate": 0.002673, + "loss": 4.5538, + "step": 891 + }, + { + "epoch": 0.00892, + "grad_norm": 0.6060396654742667, + "learning_rate": 0.002676, + "loss": 4.5678, + "step": 892 + }, + { + "epoch": 0.00893, + "grad_norm": 0.6503337239639668, + "learning_rate": 0.002679, + "loss": 4.5453, + "step": 893 + }, + { + "epoch": 0.00894, + "grad_norm": 0.6599395002026207, + "learning_rate": 0.002682, + "loss": 4.5291, + "step": 894 + }, + { + "epoch": 0.00895, + "grad_norm": 0.5989877186645693, + "learning_rate": 0.0026850000000000003, + "loss": 4.5412, + "step": 895 + }, + { + "epoch": 0.00896, + "grad_norm": 0.5286031214975206, + "learning_rate": 0.0026880000000000003, + "loss": 4.5273, + "step": 896 + }, + { + "epoch": 0.00897, + "grad_norm": 0.6246596167729576, + "learning_rate": 0.0026910000000000002, + "loss": 4.5504, + "step": 897 + }, + { + "epoch": 0.00898, + "grad_norm": 0.6886920087577523, + "learning_rate": 0.002694, + "loss": 4.5437, + "step": 898 + }, + { + "epoch": 0.00899, + "grad_norm": 0.7603324493631337, + "learning_rate": 0.002697, + "loss": 4.5543, + "step": 899 + }, + { + "epoch": 0.009, + "grad_norm": 0.7773743953648492, + "learning_rate": 0.0027, + "loss": 4.5794, + "step": 900 + }, + { + "epoch": 0.00901, + "grad_norm": 0.683256197441996, + "learning_rate": 0.002703, + "loss": 4.5307, + "step": 901 + }, + { + "epoch": 0.00902, + "grad_norm": 0.5681357763332335, + "learning_rate": 0.002706, + "loss": 4.5356, + "step": 902 + }, + { + "epoch": 0.00903, + "grad_norm": 0.5420591540444755, + "learning_rate": 0.002709, + "loss": 4.5338, + "step": 903 + }, + { + "epoch": 0.00904, + "grad_norm": 0.5224631659490503, + "learning_rate": 0.002712, + "loss": 4.5093, + "step": 904 + }, + { + "epoch": 0.00905, + "grad_norm": 0.5026034590467293, + "learning_rate": 0.002715, + "loss": 4.5252, + "step": 905 + }, + { + "epoch": 0.00906, + "grad_norm": 0.5177890071237494, + "learning_rate": 0.002718, + "loss": 4.5378, + "step": 906 + }, + { + "epoch": 0.00907, + "grad_norm": 0.5764689015080159, + "learning_rate": 0.0027210000000000003, + "loss": 4.5536, + "step": 907 + }, + { + "epoch": 0.00908, + "grad_norm": 0.6259624722487185, + "learning_rate": 0.0027240000000000003, + "loss": 4.5265, + "step": 908 + }, + { + "epoch": 0.00909, + "grad_norm": 0.670091172363038, + "learning_rate": 0.0027270000000000003, + "loss": 4.5481, + "step": 909 + }, + { + "epoch": 0.0091, + "grad_norm": 0.7211417475777565, + "learning_rate": 0.0027300000000000002, + "loss": 4.555, + "step": 910 + }, + { + "epoch": 0.00911, + "grad_norm": 0.6734826041799787, + "learning_rate": 0.002733, + "loss": 4.5118, + "step": 911 + }, + { + "epoch": 0.00912, + "grad_norm": 0.5721394003951694, + "learning_rate": 0.002736, + "loss": 4.5134, + "step": 912 + }, + { + "epoch": 0.00913, + "grad_norm": 0.5576363751689392, + "learning_rate": 0.002739, + "loss": 4.5062, + "step": 913 + }, + { + "epoch": 0.00914, + "grad_norm": 0.659556953854551, + "learning_rate": 0.002742, + "loss": 4.5426, + "step": 914 + }, + { + "epoch": 0.00915, + "grad_norm": 0.7056386423863461, + "learning_rate": 0.002745, + "loss": 4.5041, + "step": 915 + }, + { + "epoch": 0.00916, + "grad_norm": 0.6615396800381155, + "learning_rate": 0.002748, + "loss": 4.4829, + "step": 916 + }, + { + "epoch": 0.00917, + "grad_norm": 0.6203895946828626, + "learning_rate": 0.002751, + "loss": 4.4952, + "step": 917 + }, + { + "epoch": 0.00918, + "grad_norm": 0.6455523293432982, + "learning_rate": 0.0027540000000000004, + "loss": 4.5159, + "step": 918 + }, + { + "epoch": 0.00919, + "grad_norm": 0.6153975854015812, + "learning_rate": 0.0027570000000000003, + "loss": 4.4956, + "step": 919 + }, + { + "epoch": 0.0092, + "grad_norm": 0.6747074236078813, + "learning_rate": 0.0027600000000000003, + "loss": 4.516, + "step": 920 + }, + { + "epoch": 0.00921, + "grad_norm": 0.7525938258933302, + "learning_rate": 0.0027630000000000003, + "loss": 4.5035, + "step": 921 + }, + { + "epoch": 0.00922, + "grad_norm": 0.6613679668687953, + "learning_rate": 0.0027660000000000002, + "loss": 4.5161, + "step": 922 + }, + { + "epoch": 0.00923, + "grad_norm": 0.657751627761755, + "learning_rate": 0.002769, + "loss": 4.4882, + "step": 923 + }, + { + "epoch": 0.00924, + "grad_norm": 0.6747361910557791, + "learning_rate": 0.002772, + "loss": 4.515, + "step": 924 + }, + { + "epoch": 0.00925, + "grad_norm": 0.6239646474738244, + "learning_rate": 0.002775, + "loss": 4.4929, + "step": 925 + }, + { + "epoch": 0.00926, + "grad_norm": 0.599067117804374, + "learning_rate": 0.002778, + "loss": 4.5097, + "step": 926 + }, + { + "epoch": 0.00927, + "grad_norm": 0.5594951339370652, + "learning_rate": 0.002781, + "loss": 4.4719, + "step": 927 + }, + { + "epoch": 0.00928, + "grad_norm": 0.6063845847981192, + "learning_rate": 0.002784, + "loss": 4.518, + "step": 928 + }, + { + "epoch": 0.00929, + "grad_norm": 0.6363503981385901, + "learning_rate": 0.0027870000000000004, + "loss": 4.5031, + "step": 929 + }, + { + "epoch": 0.0093, + "grad_norm": 0.7396258221206569, + "learning_rate": 0.0027900000000000004, + "loss": 4.4944, + "step": 930 + }, + { + "epoch": 0.00931, + "grad_norm": 0.8942550404249334, + "learning_rate": 0.0027930000000000003, + "loss": 4.517, + "step": 931 + }, + { + "epoch": 0.00932, + "grad_norm": 1.0354660733966428, + "learning_rate": 0.0027960000000000003, + "loss": 4.5402, + "step": 932 + }, + { + "epoch": 0.00933, + "grad_norm": 1.180367237398422, + "learning_rate": 0.0027990000000000003, + "loss": 4.525, + "step": 933 + }, + { + "epoch": 0.00934, + "grad_norm": 0.9502642927196222, + "learning_rate": 0.0028020000000000002, + "loss": 4.5541, + "step": 934 + }, + { + "epoch": 0.00935, + "grad_norm": 0.8858808588378486, + "learning_rate": 0.002805, + "loss": 4.5601, + "step": 935 + }, + { + "epoch": 0.00936, + "grad_norm": 0.9244572615029755, + "learning_rate": 0.002808, + "loss": 4.569, + "step": 936 + }, + { + "epoch": 0.00937, + "grad_norm": 0.9225067704838915, + "learning_rate": 0.002811, + "loss": 4.5806, + "step": 937 + }, + { + "epoch": 0.00938, + "grad_norm": 0.8534895885659719, + "learning_rate": 0.002814, + "loss": 4.5604, + "step": 938 + }, + { + "epoch": 0.00939, + "grad_norm": 0.9046137087836131, + "learning_rate": 0.002817, + "loss": 4.5554, + "step": 939 + }, + { + "epoch": 0.0094, + "grad_norm": 0.7476401969651744, + "learning_rate": 0.00282, + "loss": 4.5482, + "step": 940 + }, + { + "epoch": 0.00941, + "grad_norm": 0.7066455268972154, + "learning_rate": 0.002823, + "loss": 4.5225, + "step": 941 + }, + { + "epoch": 0.00942, + "grad_norm": 0.6459119835251312, + "learning_rate": 0.002826, + "loss": 4.544, + "step": 942 + }, + { + "epoch": 0.00943, + "grad_norm": 0.6055625869260791, + "learning_rate": 0.002829, + "loss": 4.5363, + "step": 943 + }, + { + "epoch": 0.00944, + "grad_norm": 0.5293280202161804, + "learning_rate": 0.002832, + "loss": 4.5307, + "step": 944 + }, + { + "epoch": 0.00945, + "grad_norm": 0.5438831110130244, + "learning_rate": 0.002835, + "loss": 4.5334, + "step": 945 + }, + { + "epoch": 0.00946, + "grad_norm": 0.4766420743521973, + "learning_rate": 0.002838, + "loss": 4.5433, + "step": 946 + }, + { + "epoch": 0.00947, + "grad_norm": 0.40195567884756706, + "learning_rate": 0.0028409999999999998, + "loss": 4.5081, + "step": 947 + }, + { + "epoch": 0.00948, + "grad_norm": 0.3783844921089427, + "learning_rate": 0.0028439999999999997, + "loss": 4.4865, + "step": 948 + }, + { + "epoch": 0.00949, + "grad_norm": 0.4197576569837563, + "learning_rate": 0.002847, + "loss": 4.5306, + "step": 949 + }, + { + "epoch": 0.0095, + "grad_norm": 0.45947372263331304, + "learning_rate": 0.00285, + "loss": 4.5193, + "step": 950 + }, + { + "epoch": 0.00951, + "grad_norm": 0.5187245758366383, + "learning_rate": 0.002853, + "loss": 4.4969, + "step": 951 + }, + { + "epoch": 0.00952, + "grad_norm": 0.5515692080168162, + "learning_rate": 0.002856, + "loss": 4.5218, + "step": 952 + }, + { + "epoch": 0.00953, + "grad_norm": 0.501582875002041, + "learning_rate": 0.002859, + "loss": 4.4606, + "step": 953 + }, + { + "epoch": 0.00954, + "grad_norm": 0.5014106294436917, + "learning_rate": 0.002862, + "loss": 4.5197, + "step": 954 + }, + { + "epoch": 0.00955, + "grad_norm": 0.6047606934565909, + "learning_rate": 0.002865, + "loss": 4.5086, + "step": 955 + }, + { + "epoch": 0.00956, + "grad_norm": 0.6661868633369662, + "learning_rate": 0.002868, + "loss": 4.4921, + "step": 956 + }, + { + "epoch": 0.00957, + "grad_norm": 0.6511713371124522, + "learning_rate": 0.002871, + "loss": 4.514, + "step": 957 + }, + { + "epoch": 0.00958, + "grad_norm": 0.5733443203887492, + "learning_rate": 0.002874, + "loss": 4.4931, + "step": 958 + }, + { + "epoch": 0.00959, + "grad_norm": 0.6024952806359369, + "learning_rate": 0.002877, + "loss": 4.4895, + "step": 959 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6029559818977924, + "learning_rate": 0.0028799999999999997, + "loss": 4.4868, + "step": 960 + }, + { + "epoch": 0.00961, + "grad_norm": 0.5721073369283843, + "learning_rate": 0.002883, + "loss": 4.4604, + "step": 961 + }, + { + "epoch": 0.00962, + "grad_norm": 0.5737900491823522, + "learning_rate": 0.002886, + "loss": 4.4898, + "step": 962 + }, + { + "epoch": 0.00963, + "grad_norm": 0.5323481251626608, + "learning_rate": 0.002889, + "loss": 4.4867, + "step": 963 + }, + { + "epoch": 0.00964, + "grad_norm": 0.5436801325002781, + "learning_rate": 0.002892, + "loss": 4.4807, + "step": 964 + }, + { + "epoch": 0.00965, + "grad_norm": 0.60229729083351, + "learning_rate": 0.002895, + "loss": 4.4568, + "step": 965 + }, + { + "epoch": 0.00966, + "grad_norm": 0.6629818387101306, + "learning_rate": 0.002898, + "loss": 4.4766, + "step": 966 + }, + { + "epoch": 0.00967, + "grad_norm": 0.6748155978904155, + "learning_rate": 0.002901, + "loss": 4.5156, + "step": 967 + }, + { + "epoch": 0.00968, + "grad_norm": 0.7427494012599226, + "learning_rate": 0.002904, + "loss": 4.4866, + "step": 968 + }, + { + "epoch": 0.00969, + "grad_norm": 0.8794961931178971, + "learning_rate": 0.002907, + "loss": 4.5273, + "step": 969 + }, + { + "epoch": 0.0097, + "grad_norm": 0.8586008854691127, + "learning_rate": 0.00291, + "loss": 4.4896, + "step": 970 + }, + { + "epoch": 0.00971, + "grad_norm": 0.8273563438543869, + "learning_rate": 0.002913, + "loss": 4.4955, + "step": 971 + }, + { + "epoch": 0.00972, + "grad_norm": 0.7536097688784559, + "learning_rate": 0.002916, + "loss": 4.5029, + "step": 972 + }, + { + "epoch": 0.00973, + "grad_norm": 0.7541845251322323, + "learning_rate": 0.002919, + "loss": 4.4985, + "step": 973 + }, + { + "epoch": 0.00974, + "grad_norm": 0.6473607436694337, + "learning_rate": 0.002922, + "loss": 4.4796, + "step": 974 + }, + { + "epoch": 0.00975, + "grad_norm": 0.7361402706574074, + "learning_rate": 0.002925, + "loss": 4.5044, + "step": 975 + }, + { + "epoch": 0.00976, + "grad_norm": 0.868936228763895, + "learning_rate": 0.002928, + "loss": 4.508, + "step": 976 + }, + { + "epoch": 0.00977, + "grad_norm": 0.8813019375073942, + "learning_rate": 0.002931, + "loss": 4.5408, + "step": 977 + }, + { + "epoch": 0.00978, + "grad_norm": 0.9426880168937273, + "learning_rate": 0.002934, + "loss": 4.5239, + "step": 978 + }, + { + "epoch": 0.00979, + "grad_norm": 0.8776049562434768, + "learning_rate": 0.002937, + "loss": 4.5177, + "step": 979 + }, + { + "epoch": 0.0098, + "grad_norm": 0.7621714209410982, + "learning_rate": 0.00294, + "loss": 4.5005, + "step": 980 + }, + { + "epoch": 0.00981, + "grad_norm": 0.7607321859563556, + "learning_rate": 0.002943, + "loss": 4.5183, + "step": 981 + }, + { + "epoch": 0.00982, + "grad_norm": 0.8148690145722087, + "learning_rate": 0.002946, + "loss": 4.5101, + "step": 982 + }, + { + "epoch": 0.00983, + "grad_norm": 0.8602879224908239, + "learning_rate": 0.0029490000000000002, + "loss": 4.5186, + "step": 983 + }, + { + "epoch": 0.00984, + "grad_norm": 0.9348586711938943, + "learning_rate": 0.002952, + "loss": 4.5105, + "step": 984 + }, + { + "epoch": 0.00985, + "grad_norm": 0.9010607870226212, + "learning_rate": 0.002955, + "loss": 4.5341, + "step": 985 + }, + { + "epoch": 0.00986, + "grad_norm": 0.8225410280635316, + "learning_rate": 0.002958, + "loss": 4.497, + "step": 986 + }, + { + "epoch": 0.00987, + "grad_norm": 0.820560994458863, + "learning_rate": 0.002961, + "loss": 4.5111, + "step": 987 + }, + { + "epoch": 0.00988, + "grad_norm": 0.7430257271274537, + "learning_rate": 0.002964, + "loss": 4.5437, + "step": 988 + }, + { + "epoch": 0.00989, + "grad_norm": 0.7193873820034543, + "learning_rate": 0.002967, + "loss": 4.4838, + "step": 989 + }, + { + "epoch": 0.0099, + "grad_norm": 0.7329220852773792, + "learning_rate": 0.00297, + "loss": 4.5219, + "step": 990 + }, + { + "epoch": 0.00991, + "grad_norm": 0.7694030006138932, + "learning_rate": 0.002973, + "loss": 4.5213, + "step": 991 + }, + { + "epoch": 0.00992, + "grad_norm": 0.7726534251991994, + "learning_rate": 0.002976, + "loss": 4.5153, + "step": 992 + }, + { + "epoch": 0.00993, + "grad_norm": 0.6817474065224322, + "learning_rate": 0.002979, + "loss": 4.511, + "step": 993 + }, + { + "epoch": 0.00994, + "grad_norm": 0.6628189744120299, + "learning_rate": 0.002982, + "loss": 4.5078, + "step": 994 + }, + { + "epoch": 0.00995, + "grad_norm": 0.6249124245549155, + "learning_rate": 0.0029850000000000002, + "loss": 4.5069, + "step": 995 + }, + { + "epoch": 0.00996, + "grad_norm": 0.54278083452404, + "learning_rate": 0.002988, + "loss": 4.503, + "step": 996 + }, + { + "epoch": 0.00997, + "grad_norm": 0.5131542547273349, + "learning_rate": 0.002991, + "loss": 4.49, + "step": 997 + }, + { + "epoch": 0.00998, + "grad_norm": 0.4760166868407609, + "learning_rate": 0.002994, + "loss": 4.4895, + "step": 998 + }, + { + "epoch": 0.00999, + "grad_norm": 0.4855483581267517, + "learning_rate": 0.002997, + "loss": 4.4707, + "step": 999 + }, + { + "epoch": 0.01, + "grad_norm": 0.44929338024832627, + "learning_rate": 0.003, + "loss": 4.4897, + "step": 1000 + }, + { + "epoch": 0.01001, + "grad_norm": 0.3731978010457433, + "learning_rate": 0.003, + "loss": 4.4851, + "step": 1001 + }, + { + "epoch": 0.01002, + "grad_norm": 0.37113077487631946, + "learning_rate": 0.003, + "loss": 4.4516, + "step": 1002 + }, + { + "epoch": 0.01003, + "grad_norm": 0.35590481029592896, + "learning_rate": 0.003, + "loss": 4.4696, + "step": 1003 + }, + { + "epoch": 0.01004, + "grad_norm": 0.36275775448168, + "learning_rate": 0.003, + "loss": 4.4605, + "step": 1004 + }, + { + "epoch": 0.01005, + "grad_norm": 0.43407423243746507, + "learning_rate": 0.003, + "loss": 4.4329, + "step": 1005 + }, + { + "epoch": 0.01006, + "grad_norm": 0.5622897072513495, + "learning_rate": 0.003, + "loss": 4.4378, + "step": 1006 + }, + { + "epoch": 0.01007, + "grad_norm": 0.742851457120247, + "learning_rate": 0.003, + "loss": 4.4915, + "step": 1007 + }, + { + "epoch": 0.01008, + "grad_norm": 0.6898510838085624, + "learning_rate": 0.003, + "loss": 4.5119, + "step": 1008 + }, + { + "epoch": 0.01009, + "grad_norm": 0.5850313111790646, + "learning_rate": 0.003, + "loss": 4.4686, + "step": 1009 + }, + { + "epoch": 0.0101, + "grad_norm": 0.6689327239491644, + "learning_rate": 0.003, + "loss": 4.4833, + "step": 1010 + }, + { + "epoch": 0.01011, + "grad_norm": 0.6177431531224691, + "learning_rate": 0.003, + "loss": 4.4907, + "step": 1011 + }, + { + "epoch": 0.01012, + "grad_norm": 0.529876877427535, + "learning_rate": 0.003, + "loss": 4.4694, + "step": 1012 + }, + { + "epoch": 0.01013, + "grad_norm": 0.5027890853405218, + "learning_rate": 0.003, + "loss": 4.4915, + "step": 1013 + }, + { + "epoch": 0.01014, + "grad_norm": 0.4606528826158309, + "learning_rate": 0.003, + "loss": 4.4311, + "step": 1014 + }, + { + "epoch": 0.01015, + "grad_norm": 0.43413097561878167, + "learning_rate": 0.003, + "loss": 4.4729, + "step": 1015 + }, + { + "epoch": 0.01016, + "grad_norm": 0.45162191517129524, + "learning_rate": 0.003, + "loss": 4.4813, + "step": 1016 + }, + { + "epoch": 0.01017, + "grad_norm": 0.4151722379204413, + "learning_rate": 0.003, + "loss": 4.4194, + "step": 1017 + }, + { + "epoch": 0.01018, + "grad_norm": 0.42684710143489196, + "learning_rate": 0.003, + "loss": 4.4738, + "step": 1018 + }, + { + "epoch": 0.01019, + "grad_norm": 0.49455469810835373, + "learning_rate": 0.003, + "loss": 4.4546, + "step": 1019 + }, + { + "epoch": 0.0102, + "grad_norm": 0.44331421163833096, + "learning_rate": 0.003, + "loss": 4.4504, + "step": 1020 + }, + { + "epoch": 0.01021, + "grad_norm": 0.44293820189086996, + "learning_rate": 0.003, + "loss": 4.4351, + "step": 1021 + }, + { + "epoch": 0.01022, + "grad_norm": 0.471018484914238, + "learning_rate": 0.003, + "loss": 4.4454, + "step": 1022 + }, + { + "epoch": 0.01023, + "grad_norm": 0.5245843463151362, + "learning_rate": 0.003, + "loss": 4.4439, + "step": 1023 + }, + { + "epoch": 0.01024, + "grad_norm": 0.5471345601505091, + "learning_rate": 0.003, + "loss": 4.4059, + "step": 1024 + }, + { + "epoch": 0.01025, + "grad_norm": 0.5561682271484449, + "learning_rate": 0.003, + "loss": 4.4357, + "step": 1025 + }, + { + "epoch": 0.01026, + "grad_norm": 0.6750941633687737, + "learning_rate": 0.003, + "loss": 4.4343, + "step": 1026 + }, + { + "epoch": 0.01027, + "grad_norm": 0.6614940222952833, + "learning_rate": 0.003, + "loss": 4.4593, + "step": 1027 + }, + { + "epoch": 0.01028, + "grad_norm": 0.8232761879596748, + "learning_rate": 0.003, + "loss": 4.4444, + "step": 1028 + }, + { + "epoch": 0.01029, + "grad_norm": 0.926962960927152, + "learning_rate": 0.003, + "loss": 4.4584, + "step": 1029 + }, + { + "epoch": 0.0103, + "grad_norm": 0.7880165390972924, + "learning_rate": 0.003, + "loss": 4.4737, + "step": 1030 + }, + { + "epoch": 0.01031, + "grad_norm": 0.8244417100638626, + "learning_rate": 0.003, + "loss": 4.493, + "step": 1031 + }, + { + "epoch": 0.01032, + "grad_norm": 0.8457876785075499, + "learning_rate": 0.003, + "loss": 4.4747, + "step": 1032 + }, + { + "epoch": 0.01033, + "grad_norm": 0.7537115798125514, + "learning_rate": 0.003, + "loss": 4.5075, + "step": 1033 + }, + { + "epoch": 0.01034, + "grad_norm": 0.7955188357741166, + "learning_rate": 0.003, + "loss": 4.4759, + "step": 1034 + }, + { + "epoch": 0.01035, + "grad_norm": 0.807776556733491, + "learning_rate": 0.003, + "loss": 4.4793, + "step": 1035 + }, + { + "epoch": 0.01036, + "grad_norm": 0.8192211538243458, + "learning_rate": 0.003, + "loss": 4.4746, + "step": 1036 + }, + { + "epoch": 0.01037, + "grad_norm": 0.8125711495155172, + "learning_rate": 0.003, + "loss": 4.5016, + "step": 1037 + }, + { + "epoch": 0.01038, + "grad_norm": 0.8698834270185546, + "learning_rate": 0.003, + "loss": 4.5079, + "step": 1038 + }, + { + "epoch": 0.01039, + "grad_norm": 0.8070907960047083, + "learning_rate": 0.003, + "loss": 4.4962, + "step": 1039 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8379908956267174, + "learning_rate": 0.003, + "loss": 4.5113, + "step": 1040 + }, + { + "epoch": 0.01041, + "grad_norm": 0.9467065153215795, + "learning_rate": 0.003, + "loss": 4.4994, + "step": 1041 + }, + { + "epoch": 0.01042, + "grad_norm": 1.0844879341171343, + "learning_rate": 0.003, + "loss": 4.5455, + "step": 1042 + }, + { + "epoch": 0.01043, + "grad_norm": 0.8223563094033745, + "learning_rate": 0.003, + "loss": 4.4732, + "step": 1043 + }, + { + "epoch": 0.01044, + "grad_norm": 0.7034538108566051, + "learning_rate": 0.003, + "loss": 4.5118, + "step": 1044 + }, + { + "epoch": 0.01045, + "grad_norm": 0.7046831935605008, + "learning_rate": 0.003, + "loss": 4.5282, + "step": 1045 + }, + { + "epoch": 0.01046, + "grad_norm": 0.5291717642722772, + "learning_rate": 0.003, + "loss": 4.4864, + "step": 1046 + }, + { + "epoch": 0.01047, + "grad_norm": 0.517981866453337, + "learning_rate": 0.003, + "loss": 4.4886, + "step": 1047 + }, + { + "epoch": 0.01048, + "grad_norm": 0.5100897334283181, + "learning_rate": 0.003, + "loss": 4.4775, + "step": 1048 + }, + { + "epoch": 0.01049, + "grad_norm": 0.4646755867304285, + "learning_rate": 0.003, + "loss": 4.4516, + "step": 1049 + }, + { + "epoch": 0.0105, + "grad_norm": 0.4688262914765259, + "learning_rate": 0.003, + "loss": 4.491, + "step": 1050 + }, + { + "epoch": 0.01051, + "grad_norm": 0.40479473066275506, + "learning_rate": 0.003, + "loss": 4.4613, + "step": 1051 + }, + { + "epoch": 0.01052, + "grad_norm": 0.37634413468362676, + "learning_rate": 0.003, + "loss": 4.4718, + "step": 1052 + }, + { + "epoch": 0.01053, + "grad_norm": 0.35380747787174, + "learning_rate": 0.003, + "loss": 4.4768, + "step": 1053 + }, + { + "epoch": 0.01054, + "grad_norm": 0.3233101912432746, + "learning_rate": 0.003, + "loss": 4.4657, + "step": 1054 + }, + { + "epoch": 0.01055, + "grad_norm": 0.3051647539319004, + "learning_rate": 0.003, + "loss": 4.4412, + "step": 1055 + }, + { + "epoch": 0.01056, + "grad_norm": 0.3183252525248436, + "learning_rate": 0.003, + "loss": 4.4395, + "step": 1056 + }, + { + "epoch": 0.01057, + "grad_norm": 0.32244230779772487, + "learning_rate": 0.003, + "loss": 4.443, + "step": 1057 + }, + { + "epoch": 0.01058, + "grad_norm": 0.37190643160876063, + "learning_rate": 0.003, + "loss": 4.4376, + "step": 1058 + }, + { + "epoch": 0.01059, + "grad_norm": 0.4982331351188646, + "learning_rate": 0.003, + "loss": 4.4406, + "step": 1059 + }, + { + "epoch": 0.0106, + "grad_norm": 0.630051966003807, + "learning_rate": 0.003, + "loss": 4.4377, + "step": 1060 + }, + { + "epoch": 0.01061, + "grad_norm": 0.6685664249418685, + "learning_rate": 0.003, + "loss": 4.4357, + "step": 1061 + }, + { + "epoch": 0.01062, + "grad_norm": 0.5702779525542006, + "learning_rate": 0.003, + "loss": 4.4281, + "step": 1062 + }, + { + "epoch": 0.01063, + "grad_norm": 0.47516871800908467, + "learning_rate": 0.003, + "loss": 4.4193, + "step": 1063 + }, + { + "epoch": 0.01064, + "grad_norm": 0.5362466950028499, + "learning_rate": 0.003, + "loss": 4.4381, + "step": 1064 + }, + { + "epoch": 0.01065, + "grad_norm": 0.5373139287833821, + "learning_rate": 0.003, + "loss": 4.4375, + "step": 1065 + }, + { + "epoch": 0.01066, + "grad_norm": 0.5486100821685685, + "learning_rate": 0.003, + "loss": 4.44, + "step": 1066 + }, + { + "epoch": 0.01067, + "grad_norm": 0.4816436794229303, + "learning_rate": 0.003, + "loss": 4.4454, + "step": 1067 + }, + { + "epoch": 0.01068, + "grad_norm": 0.429069056402875, + "learning_rate": 0.003, + "loss": 4.4286, + "step": 1068 + }, + { + "epoch": 0.01069, + "grad_norm": 0.4352935464081401, + "learning_rate": 0.003, + "loss": 4.4347, + "step": 1069 + }, + { + "epoch": 0.0107, + "grad_norm": 0.4882626803662278, + "learning_rate": 0.003, + "loss": 4.4874, + "step": 1070 + }, + { + "epoch": 0.01071, + "grad_norm": 0.599919651347176, + "learning_rate": 0.003, + "loss": 4.4109, + "step": 1071 + }, + { + "epoch": 0.01072, + "grad_norm": 0.6067483783044649, + "learning_rate": 0.003, + "loss": 4.4536, + "step": 1072 + }, + { + "epoch": 0.01073, + "grad_norm": 0.49239058441195105, + "learning_rate": 0.003, + "loss": 4.4412, + "step": 1073 + }, + { + "epoch": 0.01074, + "grad_norm": 0.49249754408010815, + "learning_rate": 0.003, + "loss": 4.4068, + "step": 1074 + }, + { + "epoch": 0.01075, + "grad_norm": 0.4103440289244663, + "learning_rate": 0.003, + "loss": 4.4418, + "step": 1075 + }, + { + "epoch": 0.01076, + "grad_norm": 0.41435198540181395, + "learning_rate": 0.003, + "loss": 4.4375, + "step": 1076 + }, + { + "epoch": 0.01077, + "grad_norm": 0.42961621912720355, + "learning_rate": 0.003, + "loss": 4.4322, + "step": 1077 + }, + { + "epoch": 0.01078, + "grad_norm": 0.47858570596245864, + "learning_rate": 0.003, + "loss": 4.3877, + "step": 1078 + }, + { + "epoch": 0.01079, + "grad_norm": 0.5533382064380803, + "learning_rate": 0.003, + "loss": 4.4233, + "step": 1079 + }, + { + "epoch": 0.0108, + "grad_norm": 0.6417480625144324, + "learning_rate": 0.003, + "loss": 4.4054, + "step": 1080 + }, + { + "epoch": 0.01081, + "grad_norm": 0.7431146939801978, + "learning_rate": 0.003, + "loss": 4.4226, + "step": 1081 + }, + { + "epoch": 0.01082, + "grad_norm": 0.9014359739424833, + "learning_rate": 0.003, + "loss": 4.4534, + "step": 1082 + }, + { + "epoch": 0.01083, + "grad_norm": 0.8675952653065004, + "learning_rate": 0.003, + "loss": 4.4518, + "step": 1083 + }, + { + "epoch": 0.01084, + "grad_norm": 0.7255782707521201, + "learning_rate": 0.003, + "loss": 4.448, + "step": 1084 + }, + { + "epoch": 0.01085, + "grad_norm": 0.6450755897364046, + "learning_rate": 0.003, + "loss": 4.4349, + "step": 1085 + }, + { + "epoch": 0.01086, + "grad_norm": 0.6814430736440221, + "learning_rate": 0.003, + "loss": 4.4125, + "step": 1086 + }, + { + "epoch": 0.01087, + "grad_norm": 0.6399587640180319, + "learning_rate": 0.003, + "loss": 4.4286, + "step": 1087 + }, + { + "epoch": 0.01088, + "grad_norm": 0.5625590086017324, + "learning_rate": 0.003, + "loss": 4.4612, + "step": 1088 + }, + { + "epoch": 0.01089, + "grad_norm": 0.5476518731942144, + "learning_rate": 0.003, + "loss": 4.4282, + "step": 1089 + }, + { + "epoch": 0.0109, + "grad_norm": 0.6962765199988842, + "learning_rate": 0.003, + "loss": 4.4603, + "step": 1090 + }, + { + "epoch": 0.01091, + "grad_norm": 0.7992579884725848, + "learning_rate": 0.003, + "loss": 4.4391, + "step": 1091 + }, + { + "epoch": 0.01092, + "grad_norm": 0.6922142003261247, + "learning_rate": 0.003, + "loss": 4.43, + "step": 1092 + }, + { + "epoch": 0.01093, + "grad_norm": 0.7201776431280809, + "learning_rate": 0.003, + "loss": 4.4545, + "step": 1093 + }, + { + "epoch": 0.01094, + "grad_norm": 0.7324411775727792, + "learning_rate": 0.003, + "loss": 4.4431, + "step": 1094 + }, + { + "epoch": 0.01095, + "grad_norm": 0.7356072077097602, + "learning_rate": 0.003, + "loss": 4.4704, + "step": 1095 + }, + { + "epoch": 0.01096, + "grad_norm": 0.7577305220019848, + "learning_rate": 0.003, + "loss": 4.445, + "step": 1096 + }, + { + "epoch": 0.01097, + "grad_norm": 0.7290952400289119, + "learning_rate": 0.003, + "loss": 4.4352, + "step": 1097 + }, + { + "epoch": 0.01098, + "grad_norm": 0.8085780813837044, + "learning_rate": 0.003, + "loss": 4.4555, + "step": 1098 + }, + { + "epoch": 0.01099, + "grad_norm": 0.8944752292785944, + "learning_rate": 0.003, + "loss": 4.4361, + "step": 1099 + }, + { + "epoch": 0.011, + "grad_norm": 0.7875559243106586, + "learning_rate": 0.003, + "loss": 4.4348, + "step": 1100 + }, + { + "epoch": 0.01101, + "grad_norm": 0.8589536043002628, + "learning_rate": 0.003, + "loss": 4.4539, + "step": 1101 + }, + { + "epoch": 0.01102, + "grad_norm": 0.8464001238353072, + "learning_rate": 0.003, + "loss": 4.4762, + "step": 1102 + }, + { + "epoch": 0.01103, + "grad_norm": 0.8851736378981229, + "learning_rate": 0.003, + "loss": 4.4662, + "step": 1103 + }, + { + "epoch": 0.01104, + "grad_norm": 0.7153611537759403, + "learning_rate": 0.003, + "loss": 4.4545, + "step": 1104 + }, + { + "epoch": 0.01105, + "grad_norm": 0.5180783683134489, + "learning_rate": 0.003, + "loss": 4.4659, + "step": 1105 + }, + { + "epoch": 0.01106, + "grad_norm": 0.5726422216533474, + "learning_rate": 0.003, + "loss": 4.4387, + "step": 1106 + }, + { + "epoch": 0.01107, + "grad_norm": 0.5702919336798088, + "learning_rate": 0.003, + "loss": 4.4468, + "step": 1107 + }, + { + "epoch": 0.01108, + "grad_norm": 0.48082946780834995, + "learning_rate": 0.003, + "loss": 4.4114, + "step": 1108 + }, + { + "epoch": 0.01109, + "grad_norm": 0.48270985447367, + "learning_rate": 0.003, + "loss": 4.4251, + "step": 1109 + }, + { + "epoch": 0.0111, + "grad_norm": 0.521002318978107, + "learning_rate": 0.003, + "loss": 4.4465, + "step": 1110 + }, + { + "epoch": 0.01111, + "grad_norm": 0.5366151298263073, + "learning_rate": 0.003, + "loss": 4.4215, + "step": 1111 + }, + { + "epoch": 0.01112, + "grad_norm": 0.4687552317220928, + "learning_rate": 0.003, + "loss": 4.4366, + "step": 1112 + }, + { + "epoch": 0.01113, + "grad_norm": 0.41403168260550727, + "learning_rate": 0.003, + "loss": 4.403, + "step": 1113 + }, + { + "epoch": 0.01114, + "grad_norm": 0.4499817067649614, + "learning_rate": 0.003, + "loss": 4.4155, + "step": 1114 + }, + { + "epoch": 0.01115, + "grad_norm": 0.5079887140928414, + "learning_rate": 0.003, + "loss": 4.4407, + "step": 1115 + }, + { + "epoch": 0.01116, + "grad_norm": 0.4835939068607683, + "learning_rate": 0.003, + "loss": 4.4267, + "step": 1116 + }, + { + "epoch": 0.01117, + "grad_norm": 0.4596276551432057, + "learning_rate": 0.003, + "loss": 4.4124, + "step": 1117 + }, + { + "epoch": 0.01118, + "grad_norm": 0.4299051580324855, + "learning_rate": 0.003, + "loss": 4.414, + "step": 1118 + }, + { + "epoch": 0.01119, + "grad_norm": 0.43855200803943045, + "learning_rate": 0.003, + "loss": 4.4183, + "step": 1119 + }, + { + "epoch": 0.0112, + "grad_norm": 0.47045620681482975, + "learning_rate": 0.003, + "loss": 4.4017, + "step": 1120 + }, + { + "epoch": 0.01121, + "grad_norm": 0.47984890763909716, + "learning_rate": 0.003, + "loss": 4.4142, + "step": 1121 + }, + { + "epoch": 0.01122, + "grad_norm": 0.4953392048385066, + "learning_rate": 0.003, + "loss": 4.3919, + "step": 1122 + }, + { + "epoch": 0.01123, + "grad_norm": 0.5318471574599554, + "learning_rate": 0.003, + "loss": 4.3799, + "step": 1123 + }, + { + "epoch": 0.01124, + "grad_norm": 0.5664861125554082, + "learning_rate": 0.003, + "loss": 4.4277, + "step": 1124 + }, + { + "epoch": 0.01125, + "grad_norm": 0.5893995505624438, + "learning_rate": 0.003, + "loss": 4.3995, + "step": 1125 + }, + { + "epoch": 0.01126, + "grad_norm": 0.5624864569583264, + "learning_rate": 0.003, + "loss": 4.4071, + "step": 1126 + }, + { + "epoch": 0.01127, + "grad_norm": 0.6840590538183918, + "learning_rate": 0.003, + "loss": 4.442, + "step": 1127 + }, + { + "epoch": 0.01128, + "grad_norm": 0.6290952877293927, + "learning_rate": 0.003, + "loss": 4.4221, + "step": 1128 + }, + { + "epoch": 0.01129, + "grad_norm": 0.5439309858740465, + "learning_rate": 0.003, + "loss": 4.4089, + "step": 1129 + }, + { + "epoch": 0.0113, + "grad_norm": 0.5640044781984855, + "learning_rate": 0.003, + "loss": 4.3915, + "step": 1130 + }, + { + "epoch": 0.01131, + "grad_norm": 0.5291031908216413, + "learning_rate": 0.003, + "loss": 4.4024, + "step": 1131 + }, + { + "epoch": 0.01132, + "grad_norm": 0.5330070088798269, + "learning_rate": 0.003, + "loss": 4.4286, + "step": 1132 + }, + { + "epoch": 0.01133, + "grad_norm": 0.49343294212766914, + "learning_rate": 0.003, + "loss": 4.387, + "step": 1133 + }, + { + "epoch": 0.01134, + "grad_norm": 0.5456485707169718, + "learning_rate": 0.003, + "loss": 4.4014, + "step": 1134 + }, + { + "epoch": 0.01135, + "grad_norm": 0.5446260165807105, + "learning_rate": 0.003, + "loss": 4.4306, + "step": 1135 + }, + { + "epoch": 0.01136, + "grad_norm": 0.4995675256097383, + "learning_rate": 0.003, + "loss": 4.4015, + "step": 1136 + }, + { + "epoch": 0.01137, + "grad_norm": 0.5676213324650841, + "learning_rate": 0.003, + "loss": 4.4302, + "step": 1137 + }, + { + "epoch": 0.01138, + "grad_norm": 0.5808574548770593, + "learning_rate": 0.003, + "loss": 4.3883, + "step": 1138 + }, + { + "epoch": 0.01139, + "grad_norm": 0.5783839384535551, + "learning_rate": 0.003, + "loss": 4.3964, + "step": 1139 + }, + { + "epoch": 0.0114, + "grad_norm": 0.5416297129288978, + "learning_rate": 0.003, + "loss": 4.3794, + "step": 1140 + }, + { + "epoch": 0.01141, + "grad_norm": 0.5159183474876468, + "learning_rate": 0.003, + "loss": 4.4038, + "step": 1141 + }, + { + "epoch": 0.01142, + "grad_norm": 0.5569718794922074, + "learning_rate": 0.003, + "loss": 4.3824, + "step": 1142 + }, + { + "epoch": 0.01143, + "grad_norm": 0.5472188962598419, + "learning_rate": 0.003, + "loss": 4.3984, + "step": 1143 + }, + { + "epoch": 0.01144, + "grad_norm": 0.5020345156310454, + "learning_rate": 0.003, + "loss": 4.3625, + "step": 1144 + }, + { + "epoch": 0.01145, + "grad_norm": 0.5548649777515073, + "learning_rate": 0.003, + "loss": 4.4027, + "step": 1145 + }, + { + "epoch": 0.01146, + "grad_norm": 0.5316552996009986, + "learning_rate": 0.003, + "loss": 4.3864, + "step": 1146 + }, + { + "epoch": 0.01147, + "grad_norm": 0.580862846664092, + "learning_rate": 0.003, + "loss": 4.3809, + "step": 1147 + }, + { + "epoch": 0.01148, + "grad_norm": 0.6205202997095316, + "learning_rate": 0.003, + "loss": 4.3827, + "step": 1148 + }, + { + "epoch": 0.01149, + "grad_norm": 0.6731514287407836, + "learning_rate": 0.003, + "loss": 4.4075, + "step": 1149 + }, + { + "epoch": 0.0115, + "grad_norm": 0.8717295567714907, + "learning_rate": 0.003, + "loss": 4.4098, + "step": 1150 + }, + { + "epoch": 0.01151, + "grad_norm": 0.8032465834892395, + "learning_rate": 0.003, + "loss": 4.4148, + "step": 1151 + }, + { + "epoch": 0.01152, + "grad_norm": 0.7381597286505803, + "learning_rate": 0.003, + "loss": 4.4217, + "step": 1152 + }, + { + "epoch": 0.01153, + "grad_norm": 0.7429010521184413, + "learning_rate": 0.003, + "loss": 4.4395, + "step": 1153 + }, + { + "epoch": 0.01154, + "grad_norm": 0.6900014831219232, + "learning_rate": 0.003, + "loss": 4.4154, + "step": 1154 + }, + { + "epoch": 0.01155, + "grad_norm": 0.64430413026539, + "learning_rate": 0.003, + "loss": 4.4019, + "step": 1155 + }, + { + "epoch": 0.01156, + "grad_norm": 0.5602174601436266, + "learning_rate": 0.003, + "loss": 4.3999, + "step": 1156 + }, + { + "epoch": 0.01157, + "grad_norm": 0.5870036133781706, + "learning_rate": 0.003, + "loss": 4.4301, + "step": 1157 + }, + { + "epoch": 0.01158, + "grad_norm": 0.49506132828209426, + "learning_rate": 0.003, + "loss": 4.3945, + "step": 1158 + }, + { + "epoch": 0.01159, + "grad_norm": 0.4315860669439085, + "learning_rate": 0.003, + "loss": 4.3852, + "step": 1159 + }, + { + "epoch": 0.0116, + "grad_norm": 0.48532942177035343, + "learning_rate": 0.003, + "loss": 4.4026, + "step": 1160 + }, + { + "epoch": 0.01161, + "grad_norm": 0.441658698740049, + "learning_rate": 0.003, + "loss": 4.3739, + "step": 1161 + }, + { + "epoch": 0.01162, + "grad_norm": 0.4784702532673203, + "learning_rate": 0.003, + "loss": 4.3867, + "step": 1162 + }, + { + "epoch": 0.01163, + "grad_norm": 0.44588993451057907, + "learning_rate": 0.003, + "loss": 4.4046, + "step": 1163 + }, + { + "epoch": 0.01164, + "grad_norm": 0.3982553498496815, + "learning_rate": 0.003, + "loss": 4.3917, + "step": 1164 + }, + { + "epoch": 0.01165, + "grad_norm": 0.3715052589980553, + "learning_rate": 0.003, + "loss": 4.3843, + "step": 1165 + }, + { + "epoch": 0.01166, + "grad_norm": 0.35254776690075756, + "learning_rate": 0.003, + "loss": 4.3817, + "step": 1166 + }, + { + "epoch": 0.01167, + "grad_norm": 0.41049866169548815, + "learning_rate": 0.003, + "loss": 4.3499, + "step": 1167 + }, + { + "epoch": 0.01168, + "grad_norm": 0.43814969571755685, + "learning_rate": 0.003, + "loss": 4.3577, + "step": 1168 + }, + { + "epoch": 0.01169, + "grad_norm": 0.5199727636769494, + "learning_rate": 0.003, + "loss": 4.401, + "step": 1169 + }, + { + "epoch": 0.0117, + "grad_norm": 0.7308392317868168, + "learning_rate": 0.003, + "loss": 4.3971, + "step": 1170 + }, + { + "epoch": 0.01171, + "grad_norm": 0.8534230998766525, + "learning_rate": 0.003, + "loss": 4.3895, + "step": 1171 + }, + { + "epoch": 0.01172, + "grad_norm": 0.7264855860795836, + "learning_rate": 0.003, + "loss": 4.3801, + "step": 1172 + }, + { + "epoch": 0.01173, + "grad_norm": 0.7253809298036523, + "learning_rate": 0.003, + "loss": 4.4199, + "step": 1173 + }, + { + "epoch": 0.01174, + "grad_norm": 0.7347248301973536, + "learning_rate": 0.003, + "loss": 4.3956, + "step": 1174 + }, + { + "epoch": 0.01175, + "grad_norm": 0.837594156223567, + "learning_rate": 0.003, + "loss": 4.3885, + "step": 1175 + }, + { + "epoch": 0.01176, + "grad_norm": 0.6959008759390608, + "learning_rate": 0.003, + "loss": 4.4162, + "step": 1176 + }, + { + "epoch": 0.01177, + "grad_norm": 0.7136269362957118, + "learning_rate": 0.003, + "loss": 4.3914, + "step": 1177 + }, + { + "epoch": 0.01178, + "grad_norm": 0.5923303437356635, + "learning_rate": 0.003, + "loss": 4.4002, + "step": 1178 + }, + { + "epoch": 0.01179, + "grad_norm": 0.5305527199591268, + "learning_rate": 0.003, + "loss": 4.3783, + "step": 1179 + }, + { + "epoch": 0.0118, + "grad_norm": 0.5526769016584179, + "learning_rate": 0.003, + "loss": 4.4415, + "step": 1180 + }, + { + "epoch": 0.01181, + "grad_norm": 0.603168145699105, + "learning_rate": 0.003, + "loss": 4.4326, + "step": 1181 + }, + { + "epoch": 0.01182, + "grad_norm": 0.5234478564139211, + "learning_rate": 0.003, + "loss": 4.3879, + "step": 1182 + }, + { + "epoch": 0.01183, + "grad_norm": 0.571647059903747, + "learning_rate": 0.003, + "loss": 4.3807, + "step": 1183 + }, + { + "epoch": 0.01184, + "grad_norm": 0.6778225586382575, + "learning_rate": 0.003, + "loss": 4.4261, + "step": 1184 + }, + { + "epoch": 0.01185, + "grad_norm": 0.7498329664059358, + "learning_rate": 0.003, + "loss": 4.4017, + "step": 1185 + }, + { + "epoch": 0.01186, + "grad_norm": 0.7043825932339357, + "learning_rate": 0.003, + "loss": 4.4247, + "step": 1186 + }, + { + "epoch": 0.01187, + "grad_norm": 0.6471541142959871, + "learning_rate": 0.003, + "loss": 4.3867, + "step": 1187 + }, + { + "epoch": 0.01188, + "grad_norm": 0.498120631856624, + "learning_rate": 0.003, + "loss": 4.3947, + "step": 1188 + }, + { + "epoch": 0.01189, + "grad_norm": 0.4776874390876697, + "learning_rate": 0.003, + "loss": 4.3881, + "step": 1189 + }, + { + "epoch": 0.0119, + "grad_norm": 0.5052480096056097, + "learning_rate": 0.003, + "loss": 4.3727, + "step": 1190 + }, + { + "epoch": 0.01191, + "grad_norm": 0.554587298838607, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1191 + }, + { + "epoch": 0.01192, + "grad_norm": 0.652885110576955, + "learning_rate": 0.003, + "loss": 4.3938, + "step": 1192 + }, + { + "epoch": 0.01193, + "grad_norm": 0.814652753692073, + "learning_rate": 0.003, + "loss": 4.4033, + "step": 1193 + }, + { + "epoch": 0.01194, + "grad_norm": 0.8149173654121779, + "learning_rate": 0.003, + "loss": 4.4185, + "step": 1194 + }, + { + "epoch": 0.01195, + "grad_norm": 0.770137476772292, + "learning_rate": 0.003, + "loss": 4.3916, + "step": 1195 + }, + { + "epoch": 0.01196, + "grad_norm": 0.7050379864259693, + "learning_rate": 0.003, + "loss": 4.3964, + "step": 1196 + }, + { + "epoch": 0.01197, + "grad_norm": 0.5988825851814717, + "learning_rate": 0.003, + "loss": 4.3556, + "step": 1197 + }, + { + "epoch": 0.01198, + "grad_norm": 0.5626266858189237, + "learning_rate": 0.003, + "loss": 4.4296, + "step": 1198 + }, + { + "epoch": 0.01199, + "grad_norm": 0.5539141200560117, + "learning_rate": 0.003, + "loss": 4.4003, + "step": 1199 + }, + { + "epoch": 0.012, + "grad_norm": 0.446871069304691, + "learning_rate": 0.003, + "loss": 4.3872, + "step": 1200 + }, + { + "epoch": 0.01201, + "grad_norm": 0.41098308583368987, + "learning_rate": 0.003, + "loss": 4.4145, + "step": 1201 + }, + { + "epoch": 0.01202, + "grad_norm": 0.40482114944892034, + "learning_rate": 0.003, + "loss": 4.3801, + "step": 1202 + }, + { + "epoch": 0.01203, + "grad_norm": 0.3801203709283518, + "learning_rate": 0.003, + "loss": 4.3878, + "step": 1203 + }, + { + "epoch": 0.01204, + "grad_norm": 0.4044230744585578, + "learning_rate": 0.003, + "loss": 4.3855, + "step": 1204 + }, + { + "epoch": 0.01205, + "grad_norm": 0.4153028028477695, + "learning_rate": 0.003, + "loss": 4.3753, + "step": 1205 + }, + { + "epoch": 0.01206, + "grad_norm": 0.4253601027948775, + "learning_rate": 0.003, + "loss": 4.4192, + "step": 1206 + }, + { + "epoch": 0.01207, + "grad_norm": 0.42584001554399725, + "learning_rate": 0.003, + "loss": 4.3415, + "step": 1207 + }, + { + "epoch": 0.01208, + "grad_norm": 0.38034721696084095, + "learning_rate": 0.003, + "loss": 4.3983, + "step": 1208 + }, + { + "epoch": 0.01209, + "grad_norm": 0.38812168043845363, + "learning_rate": 0.003, + "loss": 4.3807, + "step": 1209 + }, + { + "epoch": 0.0121, + "grad_norm": 0.48056015790597395, + "learning_rate": 0.003, + "loss": 4.3838, + "step": 1210 + }, + { + "epoch": 0.01211, + "grad_norm": 0.5637402598516663, + "learning_rate": 0.003, + "loss": 4.3728, + "step": 1211 + }, + { + "epoch": 0.01212, + "grad_norm": 0.678908053443676, + "learning_rate": 0.003, + "loss": 4.4009, + "step": 1212 + }, + { + "epoch": 0.01213, + "grad_norm": 0.7872996276111477, + "learning_rate": 0.003, + "loss": 4.3904, + "step": 1213 + }, + { + "epoch": 0.01214, + "grad_norm": 0.737245569468942, + "learning_rate": 0.003, + "loss": 4.3712, + "step": 1214 + }, + { + "epoch": 0.01215, + "grad_norm": 0.7444446627290159, + "learning_rate": 0.003, + "loss": 4.3825, + "step": 1215 + }, + { + "epoch": 0.01216, + "grad_norm": 0.7493613956712142, + "learning_rate": 0.003, + "loss": 4.3829, + "step": 1216 + }, + { + "epoch": 0.01217, + "grad_norm": 0.7355810094680961, + "learning_rate": 0.003, + "loss": 4.3915, + "step": 1217 + }, + { + "epoch": 0.01218, + "grad_norm": 0.8012910697526976, + "learning_rate": 0.003, + "loss": 4.3871, + "step": 1218 + }, + { + "epoch": 0.01219, + "grad_norm": 0.8693012687676964, + "learning_rate": 0.003, + "loss": 4.3988, + "step": 1219 + }, + { + "epoch": 0.0122, + "grad_norm": 0.8958839342651639, + "learning_rate": 0.003, + "loss": 4.3996, + "step": 1220 + }, + { + "epoch": 0.01221, + "grad_norm": 1.0403569844700198, + "learning_rate": 0.003, + "loss": 4.4291, + "step": 1221 + }, + { + "epoch": 0.01222, + "grad_norm": 1.1355134224594017, + "learning_rate": 0.003, + "loss": 4.4293, + "step": 1222 + }, + { + "epoch": 0.01223, + "grad_norm": 0.9630650396281519, + "learning_rate": 0.003, + "loss": 4.4167, + "step": 1223 + }, + { + "epoch": 0.01224, + "grad_norm": 0.8763852343022005, + "learning_rate": 0.003, + "loss": 4.4062, + "step": 1224 + }, + { + "epoch": 0.01225, + "grad_norm": 0.8983930620848783, + "learning_rate": 0.003, + "loss": 4.4328, + "step": 1225 + }, + { + "epoch": 0.01226, + "grad_norm": 0.7843004711214445, + "learning_rate": 0.003, + "loss": 4.4231, + "step": 1226 + }, + { + "epoch": 0.01227, + "grad_norm": 0.6802825597762372, + "learning_rate": 0.003, + "loss": 4.4455, + "step": 1227 + }, + { + "epoch": 0.01228, + "grad_norm": 0.7175774602889506, + "learning_rate": 0.003, + "loss": 4.433, + "step": 1228 + }, + { + "epoch": 0.01229, + "grad_norm": 0.7124552785986754, + "learning_rate": 0.003, + "loss": 4.4177, + "step": 1229 + }, + { + "epoch": 0.0123, + "grad_norm": 0.7173646190359848, + "learning_rate": 0.003, + "loss": 4.4462, + "step": 1230 + }, + { + "epoch": 0.01231, + "grad_norm": 0.6710417652647681, + "learning_rate": 0.003, + "loss": 4.4192, + "step": 1231 + }, + { + "epoch": 0.01232, + "grad_norm": 0.5319364475378244, + "learning_rate": 0.003, + "loss": 4.3837, + "step": 1232 + }, + { + "epoch": 0.01233, + "grad_norm": 0.5107726510651173, + "learning_rate": 0.003, + "loss": 4.3892, + "step": 1233 + }, + { + "epoch": 0.01234, + "grad_norm": 0.525523603676889, + "learning_rate": 0.003, + "loss": 4.4032, + "step": 1234 + }, + { + "epoch": 0.01235, + "grad_norm": 0.4385933034868044, + "learning_rate": 0.003, + "loss": 4.3895, + "step": 1235 + }, + { + "epoch": 0.01236, + "grad_norm": 0.4417827360571799, + "learning_rate": 0.003, + "loss": 4.3922, + "step": 1236 + }, + { + "epoch": 0.01237, + "grad_norm": 0.3870170981569727, + "learning_rate": 0.003, + "loss": 4.3983, + "step": 1237 + }, + { + "epoch": 0.01238, + "grad_norm": 0.3450028372069669, + "learning_rate": 0.003, + "loss": 4.3918, + "step": 1238 + }, + { + "epoch": 0.01239, + "grad_norm": 0.32255856446948694, + "learning_rate": 0.003, + "loss": 4.3727, + "step": 1239 + }, + { + "epoch": 0.0124, + "grad_norm": 0.3078475989244171, + "learning_rate": 0.003, + "loss": 4.3537, + "step": 1240 + }, + { + "epoch": 0.01241, + "grad_norm": 0.34649646171341814, + "learning_rate": 0.003, + "loss": 4.3646, + "step": 1241 + }, + { + "epoch": 0.01242, + "grad_norm": 0.3965097983444254, + "learning_rate": 0.003, + "loss": 4.3706, + "step": 1242 + }, + { + "epoch": 0.01243, + "grad_norm": 0.4535687938279078, + "learning_rate": 0.003, + "loss": 4.3517, + "step": 1243 + }, + { + "epoch": 0.01244, + "grad_norm": 0.5186172619894055, + "learning_rate": 0.003, + "loss": 4.3669, + "step": 1244 + }, + { + "epoch": 0.01245, + "grad_norm": 0.6425018812487958, + "learning_rate": 0.003, + "loss": 4.363, + "step": 1245 + }, + { + "epoch": 0.01246, + "grad_norm": 0.6245723386860975, + "learning_rate": 0.003, + "loss": 4.3446, + "step": 1246 + }, + { + "epoch": 0.01247, + "grad_norm": 0.44377555171018157, + "learning_rate": 0.003, + "loss": 4.3915, + "step": 1247 + }, + { + "epoch": 0.01248, + "grad_norm": 0.5278029739976049, + "learning_rate": 0.003, + "loss": 4.3761, + "step": 1248 + }, + { + "epoch": 0.01249, + "grad_norm": 0.5102876241848929, + "learning_rate": 0.003, + "loss": 4.3677, + "step": 1249 + }, + { + "epoch": 0.0125, + "grad_norm": 0.39559739569893493, + "learning_rate": 0.003, + "loss": 4.3348, + "step": 1250 + }, + { + "epoch": 0.01251, + "grad_norm": 0.4774815455759189, + "learning_rate": 0.003, + "loss": 4.373, + "step": 1251 + }, + { + "epoch": 0.01252, + "grad_norm": 0.40459109080676114, + "learning_rate": 0.003, + "loss": 4.3467, + "step": 1252 + }, + { + "epoch": 0.01253, + "grad_norm": 0.3506155695588933, + "learning_rate": 0.003, + "loss": 4.3238, + "step": 1253 + }, + { + "epoch": 0.01254, + "grad_norm": 0.36046868647854385, + "learning_rate": 0.003, + "loss": 4.3697, + "step": 1254 + }, + { + "epoch": 0.01255, + "grad_norm": 0.400469314411457, + "learning_rate": 0.003, + "loss": 4.3202, + "step": 1255 + }, + { + "epoch": 0.01256, + "grad_norm": 0.42579169102032727, + "learning_rate": 0.003, + "loss": 4.3478, + "step": 1256 + }, + { + "epoch": 0.01257, + "grad_norm": 0.43666374954115567, + "learning_rate": 0.003, + "loss": 4.3342, + "step": 1257 + }, + { + "epoch": 0.01258, + "grad_norm": 0.3885559357601015, + "learning_rate": 0.003, + "loss": 4.3411, + "step": 1258 + }, + { + "epoch": 0.01259, + "grad_norm": 0.3450685274689168, + "learning_rate": 0.003, + "loss": 4.3417, + "step": 1259 + }, + { + "epoch": 0.0126, + "grad_norm": 0.3296913229590147, + "learning_rate": 0.003, + "loss": 4.3689, + "step": 1260 + }, + { + "epoch": 0.01261, + "grad_norm": 0.3469234561542926, + "learning_rate": 0.003, + "loss": 4.3628, + "step": 1261 + }, + { + "epoch": 0.01262, + "grad_norm": 0.40339400226308325, + "learning_rate": 0.003, + "loss": 4.3558, + "step": 1262 + }, + { + "epoch": 0.01263, + "grad_norm": 0.44597422220213867, + "learning_rate": 0.003, + "loss": 4.3512, + "step": 1263 + }, + { + "epoch": 0.01264, + "grad_norm": 0.4427407408205576, + "learning_rate": 0.003, + "loss": 4.338, + "step": 1264 + }, + { + "epoch": 0.01265, + "grad_norm": 0.41573445698488437, + "learning_rate": 0.003, + "loss": 4.3506, + "step": 1265 + }, + { + "epoch": 0.01266, + "grad_norm": 0.45186560215837035, + "learning_rate": 0.003, + "loss": 4.349, + "step": 1266 + }, + { + "epoch": 0.01267, + "grad_norm": 0.6290443655616889, + "learning_rate": 0.003, + "loss": 4.3767, + "step": 1267 + }, + { + "epoch": 0.01268, + "grad_norm": 0.8841541603744382, + "learning_rate": 0.003, + "loss": 4.3763, + "step": 1268 + }, + { + "epoch": 0.01269, + "grad_norm": 0.9324063273299563, + "learning_rate": 0.003, + "loss": 4.3633, + "step": 1269 + }, + { + "epoch": 0.0127, + "grad_norm": 0.7531387064097018, + "learning_rate": 0.003, + "loss": 4.389, + "step": 1270 + }, + { + "epoch": 0.01271, + "grad_norm": 0.7173511219844001, + "learning_rate": 0.003, + "loss": 4.3689, + "step": 1271 + }, + { + "epoch": 0.01272, + "grad_norm": 0.6910307872241689, + "learning_rate": 0.003, + "loss": 4.3626, + "step": 1272 + }, + { + "epoch": 0.01273, + "grad_norm": 0.6392091020786222, + "learning_rate": 0.003, + "loss": 4.3902, + "step": 1273 + }, + { + "epoch": 0.01274, + "grad_norm": 0.5683923983758625, + "learning_rate": 0.003, + "loss": 4.3811, + "step": 1274 + }, + { + "epoch": 0.01275, + "grad_norm": 0.48501080462586865, + "learning_rate": 0.003, + "loss": 4.3433, + "step": 1275 + }, + { + "epoch": 0.01276, + "grad_norm": 0.4388665728327947, + "learning_rate": 0.003, + "loss": 4.3487, + "step": 1276 + }, + { + "epoch": 0.01277, + "grad_norm": 0.5024100232553123, + "learning_rate": 0.003, + "loss": 4.358, + "step": 1277 + }, + { + "epoch": 0.01278, + "grad_norm": 0.5072917507898231, + "learning_rate": 0.003, + "loss": 4.3646, + "step": 1278 + }, + { + "epoch": 0.01279, + "grad_norm": 0.48942775338603506, + "learning_rate": 0.003, + "loss": 4.3461, + "step": 1279 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5180680805997396, + "learning_rate": 0.003, + "loss": 4.3453, + "step": 1280 + }, + { + "epoch": 0.01281, + "grad_norm": 0.589882368752518, + "learning_rate": 0.003, + "loss": 4.3309, + "step": 1281 + }, + { + "epoch": 0.01282, + "grad_norm": 0.7217688053955748, + "learning_rate": 0.003, + "loss": 4.3833, + "step": 1282 + }, + { + "epoch": 0.01283, + "grad_norm": 0.849897015093446, + "learning_rate": 0.003, + "loss": 4.3595, + "step": 1283 + }, + { + "epoch": 0.01284, + "grad_norm": 0.769319717001856, + "learning_rate": 0.003, + "loss": 4.3713, + "step": 1284 + }, + { + "epoch": 0.01285, + "grad_norm": 0.7372234573055417, + "learning_rate": 0.003, + "loss": 4.3679, + "step": 1285 + }, + { + "epoch": 0.01286, + "grad_norm": 0.6185984538132406, + "learning_rate": 0.003, + "loss": 4.3884, + "step": 1286 + }, + { + "epoch": 0.01287, + "grad_norm": 0.566489365455937, + "learning_rate": 0.003, + "loss": 4.3487, + "step": 1287 + }, + { + "epoch": 0.01288, + "grad_norm": 0.583268123970244, + "learning_rate": 0.003, + "loss": 4.3654, + "step": 1288 + }, + { + "epoch": 0.01289, + "grad_norm": 0.5365604798494432, + "learning_rate": 0.003, + "loss": 4.3546, + "step": 1289 + }, + { + "epoch": 0.0129, + "grad_norm": 0.5898461372899422, + "learning_rate": 0.003, + "loss": 4.3784, + "step": 1290 + }, + { + "epoch": 0.01291, + "grad_norm": 0.7454645554405436, + "learning_rate": 0.003, + "loss": 4.3672, + "step": 1291 + }, + { + "epoch": 0.01292, + "grad_norm": 0.8872276988300176, + "learning_rate": 0.003, + "loss": 4.4157, + "step": 1292 + }, + { + "epoch": 0.01293, + "grad_norm": 0.8660558149247537, + "learning_rate": 0.003, + "loss": 4.4053, + "step": 1293 + }, + { + "epoch": 0.01294, + "grad_norm": 0.7869826032444561, + "learning_rate": 0.003, + "loss": 4.3679, + "step": 1294 + }, + { + "epoch": 0.01295, + "grad_norm": 0.7577761481466739, + "learning_rate": 0.003, + "loss": 4.3975, + "step": 1295 + }, + { + "epoch": 0.01296, + "grad_norm": 0.6748799445050047, + "learning_rate": 0.003, + "loss": 4.3598, + "step": 1296 + }, + { + "epoch": 0.01297, + "grad_norm": 0.670162676491956, + "learning_rate": 0.003, + "loss": 4.376, + "step": 1297 + }, + { + "epoch": 0.01298, + "grad_norm": 0.5691583859101702, + "learning_rate": 0.003, + "loss": 4.3472, + "step": 1298 + }, + { + "epoch": 0.01299, + "grad_norm": 0.5072926331461023, + "learning_rate": 0.003, + "loss": 4.3555, + "step": 1299 + }, + { + "epoch": 0.013, + "grad_norm": 0.4828769685141814, + "learning_rate": 0.003, + "loss": 4.343, + "step": 1300 + }, + { + "epoch": 0.01301, + "grad_norm": 0.4691822990462979, + "learning_rate": 0.003, + "loss": 4.3558, + "step": 1301 + }, + { + "epoch": 0.01302, + "grad_norm": 0.4140113973052622, + "learning_rate": 0.003, + "loss": 4.4006, + "step": 1302 + }, + { + "epoch": 0.01303, + "grad_norm": 0.3685109579455993, + "learning_rate": 0.003, + "loss": 4.3436, + "step": 1303 + }, + { + "epoch": 0.01304, + "grad_norm": 0.3760405201721622, + "learning_rate": 0.003, + "loss": 4.3696, + "step": 1304 + }, + { + "epoch": 0.01305, + "grad_norm": 0.3221665114274485, + "learning_rate": 0.003, + "loss": 4.3518, + "step": 1305 + }, + { + "epoch": 0.01306, + "grad_norm": 0.33492469868124675, + "learning_rate": 0.003, + "loss": 4.3452, + "step": 1306 + }, + { + "epoch": 0.01307, + "grad_norm": 0.33475611430641294, + "learning_rate": 0.003, + "loss": 4.3626, + "step": 1307 + }, + { + "epoch": 0.01308, + "grad_norm": 0.4027154015406206, + "learning_rate": 0.003, + "loss": 4.3385, + "step": 1308 + }, + { + "epoch": 0.01309, + "grad_norm": 0.5286332892700527, + "learning_rate": 0.003, + "loss": 4.3804, + "step": 1309 + }, + { + "epoch": 0.0131, + "grad_norm": 0.7090250449949596, + "learning_rate": 0.003, + "loss": 4.3359, + "step": 1310 + }, + { + "epoch": 0.01311, + "grad_norm": 0.9374592813554533, + "learning_rate": 0.003, + "loss": 4.3914, + "step": 1311 + }, + { + "epoch": 0.01312, + "grad_norm": 0.8386895366440477, + "learning_rate": 0.003, + "loss": 4.3696, + "step": 1312 + }, + { + "epoch": 0.01313, + "grad_norm": 0.7384206623774927, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1313 + }, + { + "epoch": 0.01314, + "grad_norm": 0.7139194720668496, + "learning_rate": 0.003, + "loss": 4.3527, + "step": 1314 + }, + { + "epoch": 0.01315, + "grad_norm": 0.7085227699461494, + "learning_rate": 0.003, + "loss": 4.3809, + "step": 1315 + }, + { + "epoch": 0.01316, + "grad_norm": 0.6924444519815, + "learning_rate": 0.003, + "loss": 4.3653, + "step": 1316 + }, + { + "epoch": 0.01317, + "grad_norm": 0.6387153931562799, + "learning_rate": 0.003, + "loss": 4.3902, + "step": 1317 + }, + { + "epoch": 0.01318, + "grad_norm": 0.6344112280159295, + "learning_rate": 0.003, + "loss": 4.3652, + "step": 1318 + }, + { + "epoch": 0.01319, + "grad_norm": 0.7270675767653099, + "learning_rate": 0.003, + "loss": 4.3841, + "step": 1319 + }, + { + "epoch": 0.0132, + "grad_norm": 0.6321319111156113, + "learning_rate": 0.003, + "loss": 4.3725, + "step": 1320 + }, + { + "epoch": 0.01321, + "grad_norm": 0.6141991678487796, + "learning_rate": 0.003, + "loss": 4.3508, + "step": 1321 + }, + { + "epoch": 0.01322, + "grad_norm": 0.5694545226818948, + "learning_rate": 0.003, + "loss": 4.3756, + "step": 1322 + }, + { + "epoch": 0.01323, + "grad_norm": 0.7445509945833578, + "learning_rate": 0.003, + "loss": 4.3766, + "step": 1323 + }, + { + "epoch": 0.01324, + "grad_norm": 0.7345275618591681, + "learning_rate": 0.003, + "loss": 4.3604, + "step": 1324 + }, + { + "epoch": 0.01325, + "grad_norm": 0.5731952380277242, + "learning_rate": 0.003, + "loss": 4.3595, + "step": 1325 + }, + { + "epoch": 0.01326, + "grad_norm": 0.49399828185062894, + "learning_rate": 0.003, + "loss": 4.3432, + "step": 1326 + }, + { + "epoch": 0.01327, + "grad_norm": 0.4625597253069682, + "learning_rate": 0.003, + "loss": 4.3586, + "step": 1327 + }, + { + "epoch": 0.01328, + "grad_norm": 0.46176548306971715, + "learning_rate": 0.003, + "loss": 4.3477, + "step": 1328 + }, + { + "epoch": 0.01329, + "grad_norm": 0.4120837821389968, + "learning_rate": 0.003, + "loss": 4.3099, + "step": 1329 + }, + { + "epoch": 0.0133, + "grad_norm": 0.3625953571207396, + "learning_rate": 0.003, + "loss": 4.3494, + "step": 1330 + }, + { + "epoch": 0.01331, + "grad_norm": 0.36184528613842726, + "learning_rate": 0.003, + "loss": 4.3351, + "step": 1331 + }, + { + "epoch": 0.01332, + "grad_norm": 0.3568024153641722, + "learning_rate": 0.003, + "loss": 4.3532, + "step": 1332 + }, + { + "epoch": 0.01333, + "grad_norm": 0.3204482353647099, + "learning_rate": 0.003, + "loss": 4.3252, + "step": 1333 + }, + { + "epoch": 0.01334, + "grad_norm": 0.29957411498207237, + "learning_rate": 0.003, + "loss": 4.3114, + "step": 1334 + }, + { + "epoch": 0.01335, + "grad_norm": 0.31171408266049466, + "learning_rate": 0.003, + "loss": 4.3334, + "step": 1335 + }, + { + "epoch": 0.01336, + "grad_norm": 0.28708995347069843, + "learning_rate": 0.003, + "loss": 4.3294, + "step": 1336 + }, + { + "epoch": 0.01337, + "grad_norm": 0.29278842241858566, + "learning_rate": 0.003, + "loss": 4.3434, + "step": 1337 + }, + { + "epoch": 0.01338, + "grad_norm": 0.2729723674299326, + "learning_rate": 0.003, + "loss": 4.3088, + "step": 1338 + }, + { + "epoch": 0.01339, + "grad_norm": 0.29431343213978217, + "learning_rate": 0.003, + "loss": 4.3327, + "step": 1339 + }, + { + "epoch": 0.0134, + "grad_norm": 0.3294175545018703, + "learning_rate": 0.003, + "loss": 4.3159, + "step": 1340 + }, + { + "epoch": 0.01341, + "grad_norm": 0.4346116539808444, + "learning_rate": 0.003, + "loss": 4.3329, + "step": 1341 + }, + { + "epoch": 0.01342, + "grad_norm": 0.6006528789210607, + "learning_rate": 0.003, + "loss": 4.3107, + "step": 1342 + }, + { + "epoch": 0.01343, + "grad_norm": 0.9468238356913852, + "learning_rate": 0.003, + "loss": 4.3323, + "step": 1343 + }, + { + "epoch": 0.01344, + "grad_norm": 0.9600917389412905, + "learning_rate": 0.003, + "loss": 4.3573, + "step": 1344 + }, + { + "epoch": 0.01345, + "grad_norm": 0.7454028532315747, + "learning_rate": 0.003, + "loss": 4.3824, + "step": 1345 + }, + { + "epoch": 0.01346, + "grad_norm": 0.9412385374275949, + "learning_rate": 0.003, + "loss": 4.3817, + "step": 1346 + }, + { + "epoch": 0.01347, + "grad_norm": 0.8019477836355356, + "learning_rate": 0.003, + "loss": 4.3621, + "step": 1347 + }, + { + "epoch": 0.01348, + "grad_norm": 0.822147627695476, + "learning_rate": 0.003, + "loss": 4.3672, + "step": 1348 + }, + { + "epoch": 0.01349, + "grad_norm": 0.7468015076519685, + "learning_rate": 0.003, + "loss": 4.3844, + "step": 1349 + }, + { + "epoch": 0.0135, + "grad_norm": 0.7150496437877271, + "learning_rate": 0.003, + "loss": 4.3769, + "step": 1350 + }, + { + "epoch": 0.01351, + "grad_norm": 0.6792275106065346, + "learning_rate": 0.003, + "loss": 4.4037, + "step": 1351 + }, + { + "epoch": 0.01352, + "grad_norm": 0.6749873536633729, + "learning_rate": 0.003, + "loss": 4.3795, + "step": 1352 + }, + { + "epoch": 0.01353, + "grad_norm": 0.5975751363464156, + "learning_rate": 0.003, + "loss": 4.3625, + "step": 1353 + }, + { + "epoch": 0.01354, + "grad_norm": 0.6367703716959885, + "learning_rate": 0.003, + "loss": 4.3473, + "step": 1354 + }, + { + "epoch": 0.01355, + "grad_norm": 0.6903253322513454, + "learning_rate": 0.003, + "loss": 4.3617, + "step": 1355 + }, + { + "epoch": 0.01356, + "grad_norm": 0.7830002665034361, + "learning_rate": 0.003, + "loss": 4.3589, + "step": 1356 + }, + { + "epoch": 0.01357, + "grad_norm": 0.7634637746675061, + "learning_rate": 0.003, + "loss": 4.351, + "step": 1357 + }, + { + "epoch": 0.01358, + "grad_norm": 0.6912113286037168, + "learning_rate": 0.003, + "loss": 4.3426, + "step": 1358 + }, + { + "epoch": 0.01359, + "grad_norm": 0.608100998568284, + "learning_rate": 0.003, + "loss": 4.3802, + "step": 1359 + }, + { + "epoch": 0.0136, + "grad_norm": 0.4174874313338161, + "learning_rate": 0.003, + "loss": 4.3375, + "step": 1360 + }, + { + "epoch": 0.01361, + "grad_norm": 0.4136844583981887, + "learning_rate": 0.003, + "loss": 4.3533, + "step": 1361 + }, + { + "epoch": 0.01362, + "grad_norm": 0.34986686555598134, + "learning_rate": 0.003, + "loss": 4.3597, + "step": 1362 + }, + { + "epoch": 0.01363, + "grad_norm": 0.39445259692219764, + "learning_rate": 0.003, + "loss": 4.3605, + "step": 1363 + }, + { + "epoch": 0.01364, + "grad_norm": 0.3392587016329843, + "learning_rate": 0.003, + "loss": 4.3508, + "step": 1364 + }, + { + "epoch": 0.01365, + "grad_norm": 0.3286929763598885, + "learning_rate": 0.003, + "loss": 4.3666, + "step": 1365 + }, + { + "epoch": 0.01366, + "grad_norm": 0.30698836061044843, + "learning_rate": 0.003, + "loss": 4.3194, + "step": 1366 + }, + { + "epoch": 0.01367, + "grad_norm": 0.3258356748399825, + "learning_rate": 0.003, + "loss": 4.3664, + "step": 1367 + }, + { + "epoch": 0.01368, + "grad_norm": 0.28974975652956814, + "learning_rate": 0.003, + "loss": 4.3364, + "step": 1368 + }, + { + "epoch": 0.01369, + "grad_norm": 0.3029711054161145, + "learning_rate": 0.003, + "loss": 4.3277, + "step": 1369 + }, + { + "epoch": 0.0137, + "grad_norm": 0.2864574506759695, + "learning_rate": 0.003, + "loss": 4.3508, + "step": 1370 + }, + { + "epoch": 0.01371, + "grad_norm": 0.2944593491964238, + "learning_rate": 0.003, + "loss": 4.3421, + "step": 1371 + }, + { + "epoch": 0.01372, + "grad_norm": 0.29733135265028415, + "learning_rate": 0.003, + "loss": 4.29, + "step": 1372 + }, + { + "epoch": 0.01373, + "grad_norm": 0.3030922885731935, + "learning_rate": 0.003, + "loss": 4.3287, + "step": 1373 + }, + { + "epoch": 0.01374, + "grad_norm": 0.3135661353258684, + "learning_rate": 0.003, + "loss": 4.3353, + "step": 1374 + }, + { + "epoch": 0.01375, + "grad_norm": 0.31552830475247895, + "learning_rate": 0.003, + "loss": 4.2954, + "step": 1375 + }, + { + "epoch": 0.01376, + "grad_norm": 0.3753685901400331, + "learning_rate": 0.003, + "loss": 4.3218, + "step": 1376 + }, + { + "epoch": 0.01377, + "grad_norm": 0.4931158688793232, + "learning_rate": 0.003, + "loss": 4.3247, + "step": 1377 + }, + { + "epoch": 0.01378, + "grad_norm": 0.6194459766224392, + "learning_rate": 0.003, + "loss": 4.3586, + "step": 1378 + }, + { + "epoch": 0.01379, + "grad_norm": 0.7157871954469385, + "learning_rate": 0.003, + "loss": 4.3344, + "step": 1379 + }, + { + "epoch": 0.0138, + "grad_norm": 0.7778989267777976, + "learning_rate": 0.003, + "loss": 4.3379, + "step": 1380 + }, + { + "epoch": 0.01381, + "grad_norm": 0.7540005493939272, + "learning_rate": 0.003, + "loss": 4.3478, + "step": 1381 + }, + { + "epoch": 0.01382, + "grad_norm": 0.6552744600197745, + "learning_rate": 0.003, + "loss": 4.3434, + "step": 1382 + }, + { + "epoch": 0.01383, + "grad_norm": 0.5244295500211882, + "learning_rate": 0.003, + "loss": 4.332, + "step": 1383 + }, + { + "epoch": 0.01384, + "grad_norm": 0.5199048618667436, + "learning_rate": 0.003, + "loss": 4.3604, + "step": 1384 + }, + { + "epoch": 0.01385, + "grad_norm": 0.5115879970370135, + "learning_rate": 0.003, + "loss": 4.3484, + "step": 1385 + }, + { + "epoch": 0.01386, + "grad_norm": 0.5337761524536188, + "learning_rate": 0.003, + "loss": 4.3229, + "step": 1386 + }, + { + "epoch": 0.01387, + "grad_norm": 0.49971027233062854, + "learning_rate": 0.003, + "loss": 4.3141, + "step": 1387 + }, + { + "epoch": 0.01388, + "grad_norm": 0.45570239975200477, + "learning_rate": 0.003, + "loss": 4.3501, + "step": 1388 + }, + { + "epoch": 0.01389, + "grad_norm": 0.4952533360995893, + "learning_rate": 0.003, + "loss": 4.329, + "step": 1389 + }, + { + "epoch": 0.0139, + "grad_norm": 0.5836752219457788, + "learning_rate": 0.003, + "loss": 4.3354, + "step": 1390 + }, + { + "epoch": 0.01391, + "grad_norm": 0.661197601877714, + "learning_rate": 0.003, + "loss": 4.3388, + "step": 1391 + }, + { + "epoch": 0.01392, + "grad_norm": 0.7580066821140803, + "learning_rate": 0.003, + "loss": 4.3629, + "step": 1392 + }, + { + "epoch": 0.01393, + "grad_norm": 0.864097109773997, + "learning_rate": 0.003, + "loss": 4.3351, + "step": 1393 + }, + { + "epoch": 0.01394, + "grad_norm": 0.8137871207398635, + "learning_rate": 0.003, + "loss": 4.3329, + "step": 1394 + }, + { + "epoch": 0.01395, + "grad_norm": 0.6452659589342782, + "learning_rate": 0.003, + "loss": 4.3528, + "step": 1395 + }, + { + "epoch": 0.01396, + "grad_norm": 0.7169962932582273, + "learning_rate": 0.003, + "loss": 4.3526, + "step": 1396 + }, + { + "epoch": 0.01397, + "grad_norm": 0.7722092445904787, + "learning_rate": 0.003, + "loss": 4.37, + "step": 1397 + }, + { + "epoch": 0.01398, + "grad_norm": 0.9201404272023904, + "learning_rate": 0.003, + "loss": 4.3584, + "step": 1398 + }, + { + "epoch": 0.01399, + "grad_norm": 0.8604376144856999, + "learning_rate": 0.003, + "loss": 4.3863, + "step": 1399 + }, + { + "epoch": 0.014, + "grad_norm": 0.7356178947310503, + "learning_rate": 0.003, + "loss": 4.3371, + "step": 1400 + }, + { + "epoch": 0.01401, + "grad_norm": 0.6698615745523159, + "learning_rate": 0.003, + "loss": 4.3548, + "step": 1401 + }, + { + "epoch": 0.01402, + "grad_norm": 0.678651105292387, + "learning_rate": 0.003, + "loss": 4.367, + "step": 1402 + }, + { + "epoch": 0.01403, + "grad_norm": 0.7164501675372886, + "learning_rate": 0.003, + "loss": 4.3428, + "step": 1403 + }, + { + "epoch": 0.01404, + "grad_norm": 0.7780771367935839, + "learning_rate": 0.003, + "loss": 4.3706, + "step": 1404 + }, + { + "epoch": 0.01405, + "grad_norm": 0.8097299555256038, + "learning_rate": 0.003, + "loss": 4.3921, + "step": 1405 + }, + { + "epoch": 0.01406, + "grad_norm": 0.8397836379510005, + "learning_rate": 0.003, + "loss": 4.3748, + "step": 1406 + }, + { + "epoch": 0.01407, + "grad_norm": 0.798434347116743, + "learning_rate": 0.003, + "loss": 4.3886, + "step": 1407 + }, + { + "epoch": 0.01408, + "grad_norm": 0.690162960674035, + "learning_rate": 0.003, + "loss": 4.3438, + "step": 1408 + }, + { + "epoch": 0.01409, + "grad_norm": 0.7158182494367376, + "learning_rate": 0.003, + "loss": 4.3755, + "step": 1409 + }, + { + "epoch": 0.0141, + "grad_norm": 0.6626220406570043, + "learning_rate": 0.003, + "loss": 4.3521, + "step": 1410 + }, + { + "epoch": 0.01411, + "grad_norm": 0.6080932877854873, + "learning_rate": 0.003, + "loss": 4.3227, + "step": 1411 + }, + { + "epoch": 0.01412, + "grad_norm": 0.6342264759843587, + "learning_rate": 0.003, + "loss": 4.3642, + "step": 1412 + }, + { + "epoch": 0.01413, + "grad_norm": 0.5857768731552475, + "learning_rate": 0.003, + "loss": 4.3559, + "step": 1413 + }, + { + "epoch": 0.01414, + "grad_norm": 0.575683339939827, + "learning_rate": 0.003, + "loss": 4.336, + "step": 1414 + }, + { + "epoch": 0.01415, + "grad_norm": 0.5478713782500327, + "learning_rate": 0.003, + "loss": 4.3284, + "step": 1415 + }, + { + "epoch": 0.01416, + "grad_norm": 0.48495351731286235, + "learning_rate": 0.003, + "loss": 4.3497, + "step": 1416 + }, + { + "epoch": 0.01417, + "grad_norm": 0.5582320312672604, + "learning_rate": 0.003, + "loss": 4.3419, + "step": 1417 + }, + { + "epoch": 0.01418, + "grad_norm": 0.6202372623477388, + "learning_rate": 0.003, + "loss": 4.3622, + "step": 1418 + }, + { + "epoch": 0.01419, + "grad_norm": 0.6521670079720687, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1419 + }, + { + "epoch": 0.0142, + "grad_norm": 0.5549508977810897, + "learning_rate": 0.003, + "loss": 4.3578, + "step": 1420 + }, + { + "epoch": 0.01421, + "grad_norm": 0.45613666396542774, + "learning_rate": 0.003, + "loss": 4.3197, + "step": 1421 + }, + { + "epoch": 0.01422, + "grad_norm": 0.445372260007137, + "learning_rate": 0.003, + "loss": 4.322, + "step": 1422 + }, + { + "epoch": 0.01423, + "grad_norm": 0.38402591989219925, + "learning_rate": 0.003, + "loss": 4.3246, + "step": 1423 + }, + { + "epoch": 0.01424, + "grad_norm": 0.38806384974852803, + "learning_rate": 0.003, + "loss": 4.3401, + "step": 1424 + }, + { + "epoch": 0.01425, + "grad_norm": 0.4132316355770368, + "learning_rate": 0.003, + "loss": 4.3436, + "step": 1425 + }, + { + "epoch": 0.01426, + "grad_norm": 0.3529318435487905, + "learning_rate": 0.003, + "loss": 4.3165, + "step": 1426 + }, + { + "epoch": 0.01427, + "grad_norm": 0.3824674454914564, + "learning_rate": 0.003, + "loss": 4.3137, + "step": 1427 + }, + { + "epoch": 0.01428, + "grad_norm": 0.40903598720809137, + "learning_rate": 0.003, + "loss": 4.3085, + "step": 1428 + }, + { + "epoch": 0.01429, + "grad_norm": 0.48492866215569996, + "learning_rate": 0.003, + "loss": 4.3211, + "step": 1429 + }, + { + "epoch": 0.0143, + "grad_norm": 0.5657427651213296, + "learning_rate": 0.003, + "loss": 4.3342, + "step": 1430 + }, + { + "epoch": 0.01431, + "grad_norm": 0.5675857894892379, + "learning_rate": 0.003, + "loss": 4.3246, + "step": 1431 + }, + { + "epoch": 0.01432, + "grad_norm": 0.5138100030227308, + "learning_rate": 0.003, + "loss": 4.3123, + "step": 1432 + }, + { + "epoch": 0.01433, + "grad_norm": 0.3834195074788409, + "learning_rate": 0.003, + "loss": 4.3046, + "step": 1433 + }, + { + "epoch": 0.01434, + "grad_norm": 0.380142493390664, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1434 + }, + { + "epoch": 0.01435, + "grad_norm": 0.40030955161206494, + "learning_rate": 0.003, + "loss": 4.2992, + "step": 1435 + }, + { + "epoch": 0.01436, + "grad_norm": 0.3954780968032791, + "learning_rate": 0.003, + "loss": 4.3173, + "step": 1436 + }, + { + "epoch": 0.01437, + "grad_norm": 0.3507385052853375, + "learning_rate": 0.003, + "loss": 4.3285, + "step": 1437 + }, + { + "epoch": 0.01438, + "grad_norm": 0.37418173745501515, + "learning_rate": 0.003, + "loss": 4.3228, + "step": 1438 + }, + { + "epoch": 0.01439, + "grad_norm": 0.3823240257736278, + "learning_rate": 0.003, + "loss": 4.2864, + "step": 1439 + }, + { + "epoch": 0.0144, + "grad_norm": 0.4883909418304791, + "learning_rate": 0.003, + "loss": 4.3011, + "step": 1440 + }, + { + "epoch": 0.01441, + "grad_norm": 0.5551175791026393, + "learning_rate": 0.003, + "loss": 4.3126, + "step": 1441 + }, + { + "epoch": 0.01442, + "grad_norm": 0.5566257513382789, + "learning_rate": 0.003, + "loss": 4.2964, + "step": 1442 + }, + { + "epoch": 0.01443, + "grad_norm": 0.5444967253409229, + "learning_rate": 0.003, + "loss": 4.3586, + "step": 1443 + }, + { + "epoch": 0.01444, + "grad_norm": 0.5868401236318755, + "learning_rate": 0.003, + "loss": 4.3246, + "step": 1444 + }, + { + "epoch": 0.01445, + "grad_norm": 0.49247571907636584, + "learning_rate": 0.003, + "loss": 4.282, + "step": 1445 + }, + { + "epoch": 0.01446, + "grad_norm": 0.4611996904427917, + "learning_rate": 0.003, + "loss": 4.2968, + "step": 1446 + }, + { + "epoch": 0.01447, + "grad_norm": 0.4704096683460519, + "learning_rate": 0.003, + "loss": 4.3067, + "step": 1447 + }, + { + "epoch": 0.01448, + "grad_norm": 0.39306824337135715, + "learning_rate": 0.003, + "loss": 4.3134, + "step": 1448 + }, + { + "epoch": 0.01449, + "grad_norm": 0.41700654092261286, + "learning_rate": 0.003, + "loss": 4.3094, + "step": 1449 + }, + { + "epoch": 0.0145, + "grad_norm": 0.407843055117697, + "learning_rate": 0.003, + "loss": 4.3225, + "step": 1450 + }, + { + "epoch": 0.01451, + "grad_norm": 0.46619408673841983, + "learning_rate": 0.003, + "loss": 4.304, + "step": 1451 + }, + { + "epoch": 0.01452, + "grad_norm": 0.5845723252800918, + "learning_rate": 0.003, + "loss": 4.3078, + "step": 1452 + }, + { + "epoch": 0.01453, + "grad_norm": 0.6930673151633832, + "learning_rate": 0.003, + "loss": 4.3299, + "step": 1453 + }, + { + "epoch": 0.01454, + "grad_norm": 0.7096402728772245, + "learning_rate": 0.003, + "loss": 4.2995, + "step": 1454 + }, + { + "epoch": 0.01455, + "grad_norm": 0.6610327702633573, + "learning_rate": 0.003, + "loss": 4.3327, + "step": 1455 + }, + { + "epoch": 0.01456, + "grad_norm": 0.6903492208096148, + "learning_rate": 0.003, + "loss": 4.3118, + "step": 1456 + }, + { + "epoch": 0.01457, + "grad_norm": 0.5695969137152378, + "learning_rate": 0.003, + "loss": 4.2978, + "step": 1457 + }, + { + "epoch": 0.01458, + "grad_norm": 0.5248689907637836, + "learning_rate": 0.003, + "loss": 4.2909, + "step": 1458 + }, + { + "epoch": 0.01459, + "grad_norm": 0.49793282629814667, + "learning_rate": 0.003, + "loss": 4.318, + "step": 1459 + }, + { + "epoch": 0.0146, + "grad_norm": 0.5259290959891172, + "learning_rate": 0.003, + "loss": 4.3252, + "step": 1460 + }, + { + "epoch": 0.01461, + "grad_norm": 0.5423165316350705, + "learning_rate": 0.003, + "loss": 4.3026, + "step": 1461 + }, + { + "epoch": 0.01462, + "grad_norm": 0.5122102135827509, + "learning_rate": 0.003, + "loss": 4.3083, + "step": 1462 + }, + { + "epoch": 0.01463, + "grad_norm": 0.48806792365665164, + "learning_rate": 0.003, + "loss": 4.304, + "step": 1463 + }, + { + "epoch": 0.01464, + "grad_norm": 0.4826165334345253, + "learning_rate": 0.003, + "loss": 4.2803, + "step": 1464 + }, + { + "epoch": 0.01465, + "grad_norm": 0.48806303528119876, + "learning_rate": 0.003, + "loss": 4.325, + "step": 1465 + }, + { + "epoch": 0.01466, + "grad_norm": 0.5288880864983577, + "learning_rate": 0.003, + "loss": 4.2948, + "step": 1466 + }, + { + "epoch": 0.01467, + "grad_norm": 0.6678963880365116, + "learning_rate": 0.003, + "loss": 4.3367, + "step": 1467 + }, + { + "epoch": 0.01468, + "grad_norm": 0.8507141482558338, + "learning_rate": 0.003, + "loss": 4.3259, + "step": 1468 + }, + { + "epoch": 0.01469, + "grad_norm": 0.9674104680499097, + "learning_rate": 0.003, + "loss": 4.3427, + "step": 1469 + }, + { + "epoch": 0.0147, + "grad_norm": 0.9577096701541407, + "learning_rate": 0.003, + "loss": 4.3502, + "step": 1470 + }, + { + "epoch": 0.01471, + "grad_norm": 0.9021599056855796, + "learning_rate": 0.003, + "loss": 4.3567, + "step": 1471 + }, + { + "epoch": 0.01472, + "grad_norm": 0.8933064062959534, + "learning_rate": 0.003, + "loss": 4.3477, + "step": 1472 + }, + { + "epoch": 0.01473, + "grad_norm": 0.8969620681941299, + "learning_rate": 0.003, + "loss": 4.3708, + "step": 1473 + }, + { + "epoch": 0.01474, + "grad_norm": 0.6674847167774206, + "learning_rate": 0.003, + "loss": 4.3402, + "step": 1474 + }, + { + "epoch": 0.01475, + "grad_norm": 0.7252535747045, + "learning_rate": 0.003, + "loss": 4.3675, + "step": 1475 + }, + { + "epoch": 0.01476, + "grad_norm": 0.7222358160132806, + "learning_rate": 0.003, + "loss": 4.3503, + "step": 1476 + }, + { + "epoch": 0.01477, + "grad_norm": 0.837925142735814, + "learning_rate": 0.003, + "loss": 4.3614, + "step": 1477 + }, + { + "epoch": 0.01478, + "grad_norm": 0.8571430973709657, + "learning_rate": 0.003, + "loss": 4.3541, + "step": 1478 + }, + { + "epoch": 0.01479, + "grad_norm": 0.7748951970932865, + "learning_rate": 0.003, + "loss": 4.3832, + "step": 1479 + }, + { + "epoch": 0.0148, + "grad_norm": 0.5769653553075942, + "learning_rate": 0.003, + "loss": 4.3406, + "step": 1480 + }, + { + "epoch": 0.01481, + "grad_norm": 0.5617397637544572, + "learning_rate": 0.003, + "loss": 4.343, + "step": 1481 + }, + { + "epoch": 0.01482, + "grad_norm": 0.494589006608917, + "learning_rate": 0.003, + "loss": 4.3215, + "step": 1482 + }, + { + "epoch": 0.01483, + "grad_norm": 0.5358352358514579, + "learning_rate": 0.003, + "loss": 4.3218, + "step": 1483 + }, + { + "epoch": 0.01484, + "grad_norm": 0.4638148892097882, + "learning_rate": 0.003, + "loss": 4.3656, + "step": 1484 + }, + { + "epoch": 0.01485, + "grad_norm": 0.5225092823087141, + "learning_rate": 0.003, + "loss": 4.3308, + "step": 1485 + }, + { + "epoch": 0.01486, + "grad_norm": 0.5536760113665742, + "learning_rate": 0.003, + "loss": 4.3262, + "step": 1486 + }, + { + "epoch": 0.01487, + "grad_norm": 0.5280609352965188, + "learning_rate": 0.003, + "loss": 4.3256, + "step": 1487 + }, + { + "epoch": 0.01488, + "grad_norm": 0.6169285208705055, + "learning_rate": 0.003, + "loss": 4.3379, + "step": 1488 + }, + { + "epoch": 0.01489, + "grad_norm": 0.7420878936102314, + "learning_rate": 0.003, + "loss": 4.3425, + "step": 1489 + }, + { + "epoch": 0.0149, + "grad_norm": 0.7781493014062594, + "learning_rate": 0.003, + "loss": 4.3387, + "step": 1490 + }, + { + "epoch": 0.01491, + "grad_norm": 0.6235654472208051, + "learning_rate": 0.003, + "loss": 4.3459, + "step": 1491 + }, + { + "epoch": 0.01492, + "grad_norm": 0.5493424470537548, + "learning_rate": 0.003, + "loss": 4.3048, + "step": 1492 + }, + { + "epoch": 0.01493, + "grad_norm": 0.5353895236188659, + "learning_rate": 0.003, + "loss": 4.3336, + "step": 1493 + }, + { + "epoch": 0.01494, + "grad_norm": 0.4821593967850022, + "learning_rate": 0.003, + "loss": 4.3113, + "step": 1494 + }, + { + "epoch": 0.01495, + "grad_norm": 0.4536978313935068, + "learning_rate": 0.003, + "loss": 4.3201, + "step": 1495 + }, + { + "epoch": 0.01496, + "grad_norm": 0.4449517853772135, + "learning_rate": 0.003, + "loss": 4.294, + "step": 1496 + }, + { + "epoch": 0.01497, + "grad_norm": 0.39867192565794446, + "learning_rate": 0.003, + "loss": 4.2656, + "step": 1497 + }, + { + "epoch": 0.01498, + "grad_norm": 0.3910710132293917, + "learning_rate": 0.003, + "loss": 4.3202, + "step": 1498 + }, + { + "epoch": 0.01499, + "grad_norm": 0.368047206254813, + "learning_rate": 0.003, + "loss": 4.339, + "step": 1499 + }, + { + "epoch": 0.015, + "grad_norm": 0.34050808541322203, + "learning_rate": 0.003, + "loss": 4.2902, + "step": 1500 + }, + { + "epoch": 0.01501, + "grad_norm": 0.36125568650846523, + "learning_rate": 0.003, + "loss": 4.2886, + "step": 1501 + }, + { + "epoch": 0.01502, + "grad_norm": 0.3250619223936781, + "learning_rate": 0.003, + "loss": 4.3299, + "step": 1502 + }, + { + "epoch": 0.01503, + "grad_norm": 0.362004630486072, + "learning_rate": 0.003, + "loss": 4.3091, + "step": 1503 + }, + { + "epoch": 0.01504, + "grad_norm": 0.42640653384457183, + "learning_rate": 0.003, + "loss": 4.3092, + "step": 1504 + }, + { + "epoch": 0.01505, + "grad_norm": 0.4655634074328374, + "learning_rate": 0.003, + "loss": 4.2935, + "step": 1505 + }, + { + "epoch": 0.01506, + "grad_norm": 0.4830681135087923, + "learning_rate": 0.003, + "loss": 4.3131, + "step": 1506 + }, + { + "epoch": 0.01507, + "grad_norm": 0.45072861631675815, + "learning_rate": 0.003, + "loss": 4.3252, + "step": 1507 + }, + { + "epoch": 0.01508, + "grad_norm": 0.41884675318056874, + "learning_rate": 0.003, + "loss": 4.2996, + "step": 1508 + }, + { + "epoch": 0.01509, + "grad_norm": 0.45582678472670524, + "learning_rate": 0.003, + "loss": 4.32, + "step": 1509 + }, + { + "epoch": 0.0151, + "grad_norm": 0.4555994257034133, + "learning_rate": 0.003, + "loss": 4.2923, + "step": 1510 + }, + { + "epoch": 0.01511, + "grad_norm": 0.47932862889061606, + "learning_rate": 0.003, + "loss": 4.2807, + "step": 1511 + }, + { + "epoch": 0.01512, + "grad_norm": 0.4824629582897255, + "learning_rate": 0.003, + "loss": 4.2928, + "step": 1512 + }, + { + "epoch": 0.01513, + "grad_norm": 0.4551520372558624, + "learning_rate": 0.003, + "loss": 4.2958, + "step": 1513 + }, + { + "epoch": 0.01514, + "grad_norm": 0.40280658680118386, + "learning_rate": 0.003, + "loss": 4.3004, + "step": 1514 + }, + { + "epoch": 0.01515, + "grad_norm": 0.4291367693804096, + "learning_rate": 0.003, + "loss": 4.2938, + "step": 1515 + }, + { + "epoch": 0.01516, + "grad_norm": 0.5124558841404536, + "learning_rate": 0.003, + "loss": 4.3014, + "step": 1516 + }, + { + "epoch": 0.01517, + "grad_norm": 0.662211893130953, + "learning_rate": 0.003, + "loss": 4.2591, + "step": 1517 + }, + { + "epoch": 0.01518, + "grad_norm": 0.7783947565165081, + "learning_rate": 0.003, + "loss": 4.3029, + "step": 1518 + }, + { + "epoch": 0.01519, + "grad_norm": 0.7115931895318321, + "learning_rate": 0.003, + "loss": 4.3425, + "step": 1519 + }, + { + "epoch": 0.0152, + "grad_norm": 0.727509066973132, + "learning_rate": 0.003, + "loss": 4.2935, + "step": 1520 + }, + { + "epoch": 0.01521, + "grad_norm": 0.8200160562394482, + "learning_rate": 0.003, + "loss": 4.3183, + "step": 1521 + }, + { + "epoch": 0.01522, + "grad_norm": 0.785335657111215, + "learning_rate": 0.003, + "loss": 4.3154, + "step": 1522 + }, + { + "epoch": 0.01523, + "grad_norm": 0.6269328513970822, + "learning_rate": 0.003, + "loss": 4.3029, + "step": 1523 + }, + { + "epoch": 0.01524, + "grad_norm": 0.6280834911768081, + "learning_rate": 0.003, + "loss": 4.3392, + "step": 1524 + }, + { + "epoch": 0.01525, + "grad_norm": 0.562699648791423, + "learning_rate": 0.003, + "loss": 4.3367, + "step": 1525 + }, + { + "epoch": 0.01526, + "grad_norm": 0.6402790783462683, + "learning_rate": 0.003, + "loss": 4.3277, + "step": 1526 + }, + { + "epoch": 0.01527, + "grad_norm": 0.6038705757661785, + "learning_rate": 0.003, + "loss": 4.2979, + "step": 1527 + }, + { + "epoch": 0.01528, + "grad_norm": 0.636946924703658, + "learning_rate": 0.003, + "loss": 4.3174, + "step": 1528 + }, + { + "epoch": 0.01529, + "grad_norm": 0.6521902920068134, + "learning_rate": 0.003, + "loss": 4.3136, + "step": 1529 + }, + { + "epoch": 0.0153, + "grad_norm": 0.7380737016000076, + "learning_rate": 0.003, + "loss": 4.3247, + "step": 1530 + }, + { + "epoch": 0.01531, + "grad_norm": 0.7548829272902333, + "learning_rate": 0.003, + "loss": 4.3243, + "step": 1531 + }, + { + "epoch": 0.01532, + "grad_norm": 0.7137520123617993, + "learning_rate": 0.003, + "loss": 4.3014, + "step": 1532 + }, + { + "epoch": 0.01533, + "grad_norm": 0.7523921063923493, + "learning_rate": 0.003, + "loss": 4.3327, + "step": 1533 + }, + { + "epoch": 0.01534, + "grad_norm": 0.7506503205085225, + "learning_rate": 0.003, + "loss": 4.3336, + "step": 1534 + }, + { + "epoch": 0.01535, + "grad_norm": 0.7303716243088675, + "learning_rate": 0.003, + "loss": 4.3418, + "step": 1535 + }, + { + "epoch": 0.01536, + "grad_norm": 0.6839262408941584, + "learning_rate": 0.003, + "loss": 4.32, + "step": 1536 + }, + { + "epoch": 0.01537, + "grad_norm": 0.7109733936082817, + "learning_rate": 0.003, + "loss": 4.3199, + "step": 1537 + }, + { + "epoch": 0.01538, + "grad_norm": 0.7061208974263588, + "learning_rate": 0.003, + "loss": 4.296, + "step": 1538 + }, + { + "epoch": 0.01539, + "grad_norm": 0.6986672189571356, + "learning_rate": 0.003, + "loss": 4.3349, + "step": 1539 + }, + { + "epoch": 0.0154, + "grad_norm": 0.6534468982180698, + "learning_rate": 0.003, + "loss": 4.333, + "step": 1540 + }, + { + "epoch": 0.01541, + "grad_norm": 0.49825734343698913, + "learning_rate": 0.003, + "loss": 4.3068, + "step": 1541 + }, + { + "epoch": 0.01542, + "grad_norm": 0.44352017421476997, + "learning_rate": 0.003, + "loss": 4.3332, + "step": 1542 + }, + { + "epoch": 0.01543, + "grad_norm": 0.38211515055441725, + "learning_rate": 0.003, + "loss": 4.3129, + "step": 1543 + }, + { + "epoch": 0.01544, + "grad_norm": 0.38911843319302664, + "learning_rate": 0.003, + "loss": 4.2994, + "step": 1544 + }, + { + "epoch": 0.01545, + "grad_norm": 0.3624309615071896, + "learning_rate": 0.003, + "loss": 4.3625, + "step": 1545 + }, + { + "epoch": 0.01546, + "grad_norm": 0.4569500368392657, + "learning_rate": 0.003, + "loss": 4.3313, + "step": 1546 + }, + { + "epoch": 0.01547, + "grad_norm": 0.5377623135091468, + "learning_rate": 0.003, + "loss": 4.3355, + "step": 1547 + }, + { + "epoch": 0.01548, + "grad_norm": 0.6297787769238979, + "learning_rate": 0.003, + "loss": 4.3034, + "step": 1548 + }, + { + "epoch": 0.01549, + "grad_norm": 0.644439142847034, + "learning_rate": 0.003, + "loss": 4.2897, + "step": 1549 + }, + { + "epoch": 0.0155, + "grad_norm": 0.618142851349942, + "learning_rate": 0.003, + "loss": 4.3109, + "step": 1550 + }, + { + "epoch": 0.01551, + "grad_norm": 0.5448565014455535, + "learning_rate": 0.003, + "loss": 4.2869, + "step": 1551 + }, + { + "epoch": 0.01552, + "grad_norm": 0.5723351701615428, + "learning_rate": 0.003, + "loss": 4.3071, + "step": 1552 + }, + { + "epoch": 0.01553, + "grad_norm": 0.6022810466873098, + "learning_rate": 0.003, + "loss": 4.3328, + "step": 1553 + }, + { + "epoch": 0.01554, + "grad_norm": 0.6989931225913981, + "learning_rate": 0.003, + "loss": 4.324, + "step": 1554 + }, + { + "epoch": 0.01555, + "grad_norm": 0.6580206580270094, + "learning_rate": 0.003, + "loss": 4.2931, + "step": 1555 + }, + { + "epoch": 0.01556, + "grad_norm": 0.6533983591917718, + "learning_rate": 0.003, + "loss": 4.3308, + "step": 1556 + }, + { + "epoch": 0.01557, + "grad_norm": 0.6295945004011335, + "learning_rate": 0.003, + "loss": 4.3036, + "step": 1557 + }, + { + "epoch": 0.01558, + "grad_norm": 0.5371854889087798, + "learning_rate": 0.003, + "loss": 4.2914, + "step": 1558 + }, + { + "epoch": 0.01559, + "grad_norm": 0.4632942102113827, + "learning_rate": 0.003, + "loss": 4.326, + "step": 1559 + }, + { + "epoch": 0.0156, + "grad_norm": 0.5051005361340883, + "learning_rate": 0.003, + "loss": 4.3169, + "step": 1560 + }, + { + "epoch": 0.01561, + "grad_norm": 0.5283819882561609, + "learning_rate": 0.003, + "loss": 4.299, + "step": 1561 + }, + { + "epoch": 0.01562, + "grad_norm": 0.591562612461458, + "learning_rate": 0.003, + "loss": 4.2958, + "step": 1562 + }, + { + "epoch": 0.01563, + "grad_norm": 0.6357918082942466, + "learning_rate": 0.003, + "loss": 4.3041, + "step": 1563 + }, + { + "epoch": 0.01564, + "grad_norm": 0.6669974739393074, + "learning_rate": 0.003, + "loss": 4.3336, + "step": 1564 + }, + { + "epoch": 0.01565, + "grad_norm": 0.691137766433848, + "learning_rate": 0.003, + "loss": 4.3141, + "step": 1565 + }, + { + "epoch": 0.01566, + "grad_norm": 0.5930174244490458, + "learning_rate": 0.003, + "loss": 4.3298, + "step": 1566 + }, + { + "epoch": 0.01567, + "grad_norm": 0.551100674489701, + "learning_rate": 0.003, + "loss": 4.319, + "step": 1567 + }, + { + "epoch": 0.01568, + "grad_norm": 0.5879757029751648, + "learning_rate": 0.003, + "loss": 4.3039, + "step": 1568 + }, + { + "epoch": 0.01569, + "grad_norm": 0.5302512625540108, + "learning_rate": 0.003, + "loss": 4.3069, + "step": 1569 + }, + { + "epoch": 0.0157, + "grad_norm": 0.49569383773361164, + "learning_rate": 0.003, + "loss": 4.3233, + "step": 1570 + }, + { + "epoch": 0.01571, + "grad_norm": 0.4733165510784743, + "learning_rate": 0.003, + "loss": 4.3065, + "step": 1571 + }, + { + "epoch": 0.01572, + "grad_norm": 0.4449250863339169, + "learning_rate": 0.003, + "loss": 4.3135, + "step": 1572 + }, + { + "epoch": 0.01573, + "grad_norm": 0.3929931575857813, + "learning_rate": 0.003, + "loss": 4.2941, + "step": 1573 + }, + { + "epoch": 0.01574, + "grad_norm": 0.4142053233129594, + "learning_rate": 0.003, + "loss": 4.3098, + "step": 1574 + }, + { + "epoch": 0.01575, + "grad_norm": 0.3709436905014968, + "learning_rate": 0.003, + "loss": 4.2702, + "step": 1575 + }, + { + "epoch": 0.01576, + "grad_norm": 0.36495283580117344, + "learning_rate": 0.003, + "loss": 4.2914, + "step": 1576 + }, + { + "epoch": 0.01577, + "grad_norm": 0.3915297767332071, + "learning_rate": 0.003, + "loss": 4.2525, + "step": 1577 + }, + { + "epoch": 0.01578, + "grad_norm": 0.5001638890424236, + "learning_rate": 0.003, + "loss": 4.284, + "step": 1578 + }, + { + "epoch": 0.01579, + "grad_norm": 0.6179833768395876, + "learning_rate": 0.003, + "loss": 4.2911, + "step": 1579 + }, + { + "epoch": 0.0158, + "grad_norm": 0.6342638472068223, + "learning_rate": 0.003, + "loss": 4.299, + "step": 1580 + }, + { + "epoch": 0.01581, + "grad_norm": 0.5432322505669018, + "learning_rate": 0.003, + "loss": 4.3062, + "step": 1581 + }, + { + "epoch": 0.01582, + "grad_norm": 0.4889222240722665, + "learning_rate": 0.003, + "loss": 4.2846, + "step": 1582 + }, + { + "epoch": 0.01583, + "grad_norm": 0.5515618698965413, + "learning_rate": 0.003, + "loss": 4.274, + "step": 1583 + }, + { + "epoch": 0.01584, + "grad_norm": 0.549844036531772, + "learning_rate": 0.003, + "loss": 4.3079, + "step": 1584 + }, + { + "epoch": 0.01585, + "grad_norm": 0.48716680666102685, + "learning_rate": 0.003, + "loss": 4.2981, + "step": 1585 + }, + { + "epoch": 0.01586, + "grad_norm": 0.623103221118658, + "learning_rate": 0.003, + "loss": 4.2774, + "step": 1586 + }, + { + "epoch": 0.01587, + "grad_norm": 0.7984823726854448, + "learning_rate": 0.003, + "loss": 4.3077, + "step": 1587 + }, + { + "epoch": 0.01588, + "grad_norm": 0.7627654453676783, + "learning_rate": 0.003, + "loss": 4.3371, + "step": 1588 + }, + { + "epoch": 0.01589, + "grad_norm": 0.7599095947865857, + "learning_rate": 0.003, + "loss": 4.3312, + "step": 1589 + }, + { + "epoch": 0.0159, + "grad_norm": 0.8159908089496294, + "learning_rate": 0.003, + "loss": 4.33, + "step": 1590 + }, + { + "epoch": 0.01591, + "grad_norm": 0.8803872581529493, + "learning_rate": 0.003, + "loss": 4.3476, + "step": 1591 + }, + { + "epoch": 0.01592, + "grad_norm": 0.9820230872746389, + "learning_rate": 0.003, + "loss": 4.349, + "step": 1592 + }, + { + "epoch": 0.01593, + "grad_norm": 0.9673129749259365, + "learning_rate": 0.003, + "loss": 4.352, + "step": 1593 + }, + { + "epoch": 0.01594, + "grad_norm": 0.9293561890551637, + "learning_rate": 0.003, + "loss": 4.3487, + "step": 1594 + }, + { + "epoch": 0.01595, + "grad_norm": 0.9561454087951702, + "learning_rate": 0.003, + "loss": 4.3811, + "step": 1595 + }, + { + "epoch": 0.01596, + "grad_norm": 1.0372746249351796, + "learning_rate": 0.003, + "loss": 4.3847, + "step": 1596 + }, + { + "epoch": 0.01597, + "grad_norm": 0.9040299607313536, + "learning_rate": 0.003, + "loss": 4.3427, + "step": 1597 + }, + { + "epoch": 0.01598, + "grad_norm": 1.1389328260746334, + "learning_rate": 0.003, + "loss": 4.3536, + "step": 1598 + }, + { + "epoch": 0.01599, + "grad_norm": 1.097139764937509, + "learning_rate": 0.003, + "loss": 4.3856, + "step": 1599 + }, + { + "epoch": 0.016, + "grad_norm": 0.9605869668937348, + "learning_rate": 0.003, + "loss": 4.3534, + "step": 1600 + }, + { + "epoch": 0.01601, + "grad_norm": 0.7696982537009223, + "learning_rate": 0.003, + "loss": 4.3899, + "step": 1601 + }, + { + "epoch": 0.01602, + "grad_norm": 0.7083103903580285, + "learning_rate": 0.003, + "loss": 4.3647, + "step": 1602 + }, + { + "epoch": 0.01603, + "grad_norm": 0.5787396432841793, + "learning_rate": 0.003, + "loss": 4.3713, + "step": 1603 + }, + { + "epoch": 0.01604, + "grad_norm": 0.5028516500547362, + "learning_rate": 0.003, + "loss": 4.3577, + "step": 1604 + }, + { + "epoch": 0.01605, + "grad_norm": 0.4938813453051197, + "learning_rate": 0.003, + "loss": 4.3306, + "step": 1605 + }, + { + "epoch": 0.01606, + "grad_norm": 0.5541639051688618, + "learning_rate": 0.003, + "loss": 4.3313, + "step": 1606 + }, + { + "epoch": 0.01607, + "grad_norm": 0.5139661852261925, + "learning_rate": 0.003, + "loss": 4.3576, + "step": 1607 + }, + { + "epoch": 0.01608, + "grad_norm": 0.48348767192426967, + "learning_rate": 0.003, + "loss": 4.3253, + "step": 1608 + }, + { + "epoch": 0.01609, + "grad_norm": 0.40136757384139854, + "learning_rate": 0.003, + "loss": 4.3283, + "step": 1609 + }, + { + "epoch": 0.0161, + "grad_norm": 0.4218183451690125, + "learning_rate": 0.003, + "loss": 4.2908, + "step": 1610 + }, + { + "epoch": 0.01611, + "grad_norm": 0.460848545000983, + "learning_rate": 0.003, + "loss": 4.31, + "step": 1611 + }, + { + "epoch": 0.01612, + "grad_norm": 0.4659444292137467, + "learning_rate": 0.003, + "loss": 4.3003, + "step": 1612 + }, + { + "epoch": 0.01613, + "grad_norm": 0.4112363124807694, + "learning_rate": 0.003, + "loss": 4.3411, + "step": 1613 + }, + { + "epoch": 0.01614, + "grad_norm": 0.3883756371913985, + "learning_rate": 0.003, + "loss": 4.3624, + "step": 1614 + }, + { + "epoch": 0.01615, + "grad_norm": 0.3536906211225033, + "learning_rate": 0.003, + "loss": 4.3264, + "step": 1615 + }, + { + "epoch": 0.01616, + "grad_norm": 0.31345780077250956, + "learning_rate": 0.003, + "loss": 4.3029, + "step": 1616 + }, + { + "epoch": 0.01617, + "grad_norm": 0.30058841204286124, + "learning_rate": 0.003, + "loss": 4.2916, + "step": 1617 + }, + { + "epoch": 0.01618, + "grad_norm": 0.27199761658041965, + "learning_rate": 0.003, + "loss": 4.2925, + "step": 1618 + }, + { + "epoch": 0.01619, + "grad_norm": 0.24962840642226738, + "learning_rate": 0.003, + "loss": 4.2798, + "step": 1619 + }, + { + "epoch": 0.0162, + "grad_norm": 0.2624146116891587, + "learning_rate": 0.003, + "loss": 4.2863, + "step": 1620 + }, + { + "epoch": 0.01621, + "grad_norm": 0.2537767483196262, + "learning_rate": 0.003, + "loss": 4.2811, + "step": 1621 + }, + { + "epoch": 0.01622, + "grad_norm": 0.3339010418458089, + "learning_rate": 0.003, + "loss": 4.3143, + "step": 1622 + }, + { + "epoch": 0.01623, + "grad_norm": 0.4481356309307089, + "learning_rate": 0.003, + "loss": 4.2931, + "step": 1623 + }, + { + "epoch": 0.01624, + "grad_norm": 0.6923232940858078, + "learning_rate": 0.003, + "loss": 4.3139, + "step": 1624 + }, + { + "epoch": 0.01625, + "grad_norm": 0.796128597644668, + "learning_rate": 0.003, + "loss": 4.3039, + "step": 1625 + }, + { + "epoch": 0.01626, + "grad_norm": 0.4972515352836982, + "learning_rate": 0.003, + "loss": 4.3127, + "step": 1626 + }, + { + "epoch": 0.01627, + "grad_norm": 0.5373908693508559, + "learning_rate": 0.003, + "loss": 4.2845, + "step": 1627 + }, + { + "epoch": 0.01628, + "grad_norm": 0.576906019908009, + "learning_rate": 0.003, + "loss": 4.2936, + "step": 1628 + }, + { + "epoch": 0.01629, + "grad_norm": 0.4857532494288208, + "learning_rate": 0.003, + "loss": 4.2886, + "step": 1629 + }, + { + "epoch": 0.0163, + "grad_norm": 0.6536351383709068, + "learning_rate": 0.003, + "loss": 4.284, + "step": 1630 + }, + { + "epoch": 0.01631, + "grad_norm": 0.6136531914830949, + "learning_rate": 0.003, + "loss": 4.3036, + "step": 1631 + }, + { + "epoch": 0.01632, + "grad_norm": 0.5441543421309533, + "learning_rate": 0.003, + "loss": 4.3133, + "step": 1632 + }, + { + "epoch": 0.01633, + "grad_norm": 0.5546151460088474, + "learning_rate": 0.003, + "loss": 4.2852, + "step": 1633 + }, + { + "epoch": 0.01634, + "grad_norm": 0.5283961307973114, + "learning_rate": 0.003, + "loss": 4.3028, + "step": 1634 + }, + { + "epoch": 0.01635, + "grad_norm": 0.4867354243481517, + "learning_rate": 0.003, + "loss": 4.2796, + "step": 1635 + }, + { + "epoch": 0.01636, + "grad_norm": 0.565294038636392, + "learning_rate": 0.003, + "loss": 4.2994, + "step": 1636 + }, + { + "epoch": 0.01637, + "grad_norm": 0.571317155010679, + "learning_rate": 0.003, + "loss": 4.2844, + "step": 1637 + }, + { + "epoch": 0.01638, + "grad_norm": 0.5573027415974858, + "learning_rate": 0.003, + "loss": 4.2921, + "step": 1638 + }, + { + "epoch": 0.01639, + "grad_norm": 0.6223176506662222, + "learning_rate": 0.003, + "loss": 4.3165, + "step": 1639 + }, + { + "epoch": 0.0164, + "grad_norm": 0.692259125142237, + "learning_rate": 0.003, + "loss": 4.3033, + "step": 1640 + }, + { + "epoch": 0.01641, + "grad_norm": 0.5775190344518869, + "learning_rate": 0.003, + "loss": 4.2697, + "step": 1641 + }, + { + "epoch": 0.01642, + "grad_norm": 0.5727565418624034, + "learning_rate": 0.003, + "loss": 4.316, + "step": 1642 + }, + { + "epoch": 0.01643, + "grad_norm": 0.49414286165639626, + "learning_rate": 0.003, + "loss": 4.2959, + "step": 1643 + }, + { + "epoch": 0.01644, + "grad_norm": 0.4716768356076658, + "learning_rate": 0.003, + "loss": 4.315, + "step": 1644 + }, + { + "epoch": 0.01645, + "grad_norm": 0.4203296262888758, + "learning_rate": 0.003, + "loss": 4.2571, + "step": 1645 + }, + { + "epoch": 0.01646, + "grad_norm": 0.42896262234184174, + "learning_rate": 0.003, + "loss": 4.2952, + "step": 1646 + }, + { + "epoch": 0.01647, + "grad_norm": 0.4170543756256854, + "learning_rate": 0.003, + "loss": 4.2886, + "step": 1647 + }, + { + "epoch": 0.01648, + "grad_norm": 0.42426821016871286, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 1648 + }, + { + "epoch": 0.01649, + "grad_norm": 0.4471925912241752, + "learning_rate": 0.003, + "loss": 4.3027, + "step": 1649 + }, + { + "epoch": 0.0165, + "grad_norm": 0.512725534748134, + "learning_rate": 0.003, + "loss": 4.2806, + "step": 1650 + }, + { + "epoch": 0.01651, + "grad_norm": 0.7005737333281123, + "learning_rate": 0.003, + "loss": 4.2999, + "step": 1651 + }, + { + "epoch": 0.01652, + "grad_norm": 0.7594561646979391, + "learning_rate": 0.003, + "loss": 4.2643, + "step": 1652 + }, + { + "epoch": 0.01653, + "grad_norm": 0.5870095922616082, + "learning_rate": 0.003, + "loss": 4.2994, + "step": 1653 + }, + { + "epoch": 0.01654, + "grad_norm": 0.6476208006318457, + "learning_rate": 0.003, + "loss": 4.289, + "step": 1654 + }, + { + "epoch": 0.01655, + "grad_norm": 0.6928825876927465, + "learning_rate": 0.003, + "loss": 4.3102, + "step": 1655 + }, + { + "epoch": 0.01656, + "grad_norm": 0.6117569915496687, + "learning_rate": 0.003, + "loss": 4.2725, + "step": 1656 + }, + { + "epoch": 0.01657, + "grad_norm": 0.614527863212692, + "learning_rate": 0.003, + "loss": 4.2915, + "step": 1657 + }, + { + "epoch": 0.01658, + "grad_norm": 0.5818340523746026, + "learning_rate": 0.003, + "loss": 4.2809, + "step": 1658 + }, + { + "epoch": 0.01659, + "grad_norm": 0.5122965623311254, + "learning_rate": 0.003, + "loss": 4.2829, + "step": 1659 + }, + { + "epoch": 0.0166, + "grad_norm": 0.41721830635550766, + "learning_rate": 0.003, + "loss": 4.2624, + "step": 1660 + }, + { + "epoch": 0.01661, + "grad_norm": 0.3974257488587888, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 1661 + }, + { + "epoch": 0.01662, + "grad_norm": 0.4134218787914048, + "learning_rate": 0.003, + "loss": 4.2699, + "step": 1662 + }, + { + "epoch": 0.01663, + "grad_norm": 0.4716687664436207, + "learning_rate": 0.003, + "loss": 4.2937, + "step": 1663 + }, + { + "epoch": 0.01664, + "grad_norm": 0.5068611880377488, + "learning_rate": 0.003, + "loss": 4.31, + "step": 1664 + }, + { + "epoch": 0.01665, + "grad_norm": 0.49884944088963573, + "learning_rate": 0.003, + "loss": 4.2734, + "step": 1665 + }, + { + "epoch": 0.01666, + "grad_norm": 0.5134497113162678, + "learning_rate": 0.003, + "loss": 4.277, + "step": 1666 + }, + { + "epoch": 0.01667, + "grad_norm": 0.4951307012977702, + "learning_rate": 0.003, + "loss": 4.2916, + "step": 1667 + }, + { + "epoch": 0.01668, + "grad_norm": 0.45857349182650015, + "learning_rate": 0.003, + "loss": 4.293, + "step": 1668 + }, + { + "epoch": 0.01669, + "grad_norm": 0.49049575707127974, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 1669 + }, + { + "epoch": 0.0167, + "grad_norm": 0.4594967732301719, + "learning_rate": 0.003, + "loss": 4.2692, + "step": 1670 + }, + { + "epoch": 0.01671, + "grad_norm": 0.5516090968921173, + "learning_rate": 0.003, + "loss": 4.2597, + "step": 1671 + }, + { + "epoch": 0.01672, + "grad_norm": 0.6238914771966855, + "learning_rate": 0.003, + "loss": 4.2771, + "step": 1672 + }, + { + "epoch": 0.01673, + "grad_norm": 0.6444039300913702, + "learning_rate": 0.003, + "loss": 4.2759, + "step": 1673 + }, + { + "epoch": 0.01674, + "grad_norm": 0.6140826000846973, + "learning_rate": 0.003, + "loss": 4.2936, + "step": 1674 + }, + { + "epoch": 0.01675, + "grad_norm": 0.5952627086225425, + "learning_rate": 0.003, + "loss": 4.2797, + "step": 1675 + }, + { + "epoch": 0.01676, + "grad_norm": 0.7397767991708332, + "learning_rate": 0.003, + "loss": 4.295, + "step": 1676 + }, + { + "epoch": 0.01677, + "grad_norm": 0.6917176398613138, + "learning_rate": 0.003, + "loss": 4.287, + "step": 1677 + }, + { + "epoch": 0.01678, + "grad_norm": 0.6139182789896808, + "learning_rate": 0.003, + "loss": 4.2961, + "step": 1678 + }, + { + "epoch": 0.01679, + "grad_norm": 0.5929821413447228, + "learning_rate": 0.003, + "loss": 4.28, + "step": 1679 + }, + { + "epoch": 0.0168, + "grad_norm": 0.5995626344940229, + "learning_rate": 0.003, + "loss": 4.2962, + "step": 1680 + }, + { + "epoch": 0.01681, + "grad_norm": 0.5718502872460155, + "learning_rate": 0.003, + "loss": 4.2946, + "step": 1681 + }, + { + "epoch": 0.01682, + "grad_norm": 0.5792048131668902, + "learning_rate": 0.003, + "loss": 4.2929, + "step": 1682 + }, + { + "epoch": 0.01683, + "grad_norm": 0.5439228911454255, + "learning_rate": 0.003, + "loss": 4.3021, + "step": 1683 + }, + { + "epoch": 0.01684, + "grad_norm": 0.5499788418796372, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 1684 + }, + { + "epoch": 0.01685, + "grad_norm": 0.5923021072734571, + "learning_rate": 0.003, + "loss": 4.2948, + "step": 1685 + }, + { + "epoch": 0.01686, + "grad_norm": 0.5945832006013596, + "learning_rate": 0.003, + "loss": 4.2875, + "step": 1686 + }, + { + "epoch": 0.01687, + "grad_norm": 0.5137142109366883, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 1687 + }, + { + "epoch": 0.01688, + "grad_norm": 0.5492618386501558, + "learning_rate": 0.003, + "loss": 4.2695, + "step": 1688 + }, + { + "epoch": 0.01689, + "grad_norm": 0.5905725806017027, + "learning_rate": 0.003, + "loss": 4.2834, + "step": 1689 + }, + { + "epoch": 0.0169, + "grad_norm": 0.6017735937539882, + "learning_rate": 0.003, + "loss": 4.2739, + "step": 1690 + }, + { + "epoch": 0.01691, + "grad_norm": 0.5609944056824007, + "learning_rate": 0.003, + "loss": 4.2833, + "step": 1691 + }, + { + "epoch": 0.01692, + "grad_norm": 0.640275407175671, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1692 + }, + { + "epoch": 0.01693, + "grad_norm": 0.6644174640253917, + "learning_rate": 0.003, + "loss": 4.3205, + "step": 1693 + }, + { + "epoch": 0.01694, + "grad_norm": 0.6415163103575349, + "learning_rate": 0.003, + "loss": 4.2685, + "step": 1694 + }, + { + "epoch": 0.01695, + "grad_norm": 0.5843260495529953, + "learning_rate": 0.003, + "loss": 4.2981, + "step": 1695 + }, + { + "epoch": 0.01696, + "grad_norm": 0.6242095105916499, + "learning_rate": 0.003, + "loss": 4.2919, + "step": 1696 + }, + { + "epoch": 0.01697, + "grad_norm": 0.6266335557548692, + "learning_rate": 0.003, + "loss": 4.2683, + "step": 1697 + }, + { + "epoch": 0.01698, + "grad_norm": 0.5465027100918087, + "learning_rate": 0.003, + "loss": 4.2606, + "step": 1698 + }, + { + "epoch": 0.01699, + "grad_norm": 0.4757955199873524, + "learning_rate": 0.003, + "loss": 4.2671, + "step": 1699 + }, + { + "epoch": 0.017, + "grad_norm": 0.5193155427226998, + "learning_rate": 0.003, + "loss": 4.2877, + "step": 1700 + }, + { + "epoch": 0.01701, + "grad_norm": 0.5016711628234212, + "learning_rate": 0.003, + "loss": 4.2685, + "step": 1701 + }, + { + "epoch": 0.01702, + "grad_norm": 0.5283542213739725, + "learning_rate": 0.003, + "loss": 4.2934, + "step": 1702 + }, + { + "epoch": 0.01703, + "grad_norm": 0.5693915553392532, + "learning_rate": 0.003, + "loss": 4.2816, + "step": 1703 + }, + { + "epoch": 0.01704, + "grad_norm": 0.5510842689706329, + "learning_rate": 0.003, + "loss": 4.2655, + "step": 1704 + }, + { + "epoch": 0.01705, + "grad_norm": 0.5433284015620915, + "learning_rate": 0.003, + "loss": 4.2713, + "step": 1705 + }, + { + "epoch": 0.01706, + "grad_norm": 0.5361651744968902, + "learning_rate": 0.003, + "loss": 4.2666, + "step": 1706 + }, + { + "epoch": 0.01707, + "grad_norm": 0.49807673030468796, + "learning_rate": 0.003, + "loss": 4.2867, + "step": 1707 + }, + { + "epoch": 0.01708, + "grad_norm": 0.47060326864649304, + "learning_rate": 0.003, + "loss": 4.2615, + "step": 1708 + }, + { + "epoch": 0.01709, + "grad_norm": 0.5858233174308028, + "learning_rate": 0.003, + "loss": 4.2895, + "step": 1709 + }, + { + "epoch": 0.0171, + "grad_norm": 0.6958457968419427, + "learning_rate": 0.003, + "loss": 4.2423, + "step": 1710 + }, + { + "epoch": 0.01711, + "grad_norm": 0.6480228055861614, + "learning_rate": 0.003, + "loss": 4.2726, + "step": 1711 + }, + { + "epoch": 0.01712, + "grad_norm": 0.5133833526312795, + "learning_rate": 0.003, + "loss": 4.2785, + "step": 1712 + }, + { + "epoch": 0.01713, + "grad_norm": 0.6473419774088255, + "learning_rate": 0.003, + "loss": 4.2578, + "step": 1713 + }, + { + "epoch": 0.01714, + "grad_norm": 0.6885593409140894, + "learning_rate": 0.003, + "loss": 4.3021, + "step": 1714 + }, + { + "epoch": 0.01715, + "grad_norm": 0.7110176805262337, + "learning_rate": 0.003, + "loss": 4.3086, + "step": 1715 + }, + { + "epoch": 0.01716, + "grad_norm": 0.6383663373581888, + "learning_rate": 0.003, + "loss": 4.284, + "step": 1716 + }, + { + "epoch": 0.01717, + "grad_norm": 0.5609012358245258, + "learning_rate": 0.003, + "loss": 4.3151, + "step": 1717 + }, + { + "epoch": 0.01718, + "grad_norm": 0.4893869305325425, + "learning_rate": 0.003, + "loss": 4.279, + "step": 1718 + }, + { + "epoch": 0.01719, + "grad_norm": 0.4683591562635787, + "learning_rate": 0.003, + "loss": 4.3002, + "step": 1719 + }, + { + "epoch": 0.0172, + "grad_norm": 0.43007495593928735, + "learning_rate": 0.003, + "loss": 4.2576, + "step": 1720 + }, + { + "epoch": 0.01721, + "grad_norm": 0.4793855122225898, + "learning_rate": 0.003, + "loss": 4.2731, + "step": 1721 + }, + { + "epoch": 0.01722, + "grad_norm": 0.5257529092437175, + "learning_rate": 0.003, + "loss": 4.2675, + "step": 1722 + }, + { + "epoch": 0.01723, + "grad_norm": 0.6135435997236554, + "learning_rate": 0.003, + "loss": 4.2811, + "step": 1723 + }, + { + "epoch": 0.01724, + "grad_norm": 0.6429236565141022, + "learning_rate": 0.003, + "loss": 4.2853, + "step": 1724 + }, + { + "epoch": 0.01725, + "grad_norm": 0.6238007075890811, + "learning_rate": 0.003, + "loss": 4.27, + "step": 1725 + }, + { + "epoch": 0.01726, + "grad_norm": 0.5743130238790891, + "learning_rate": 0.003, + "loss": 4.2614, + "step": 1726 + }, + { + "epoch": 0.01727, + "grad_norm": 0.5512027746636241, + "learning_rate": 0.003, + "loss": 4.266, + "step": 1727 + }, + { + "epoch": 0.01728, + "grad_norm": 0.5565544152920284, + "learning_rate": 0.003, + "loss": 4.2687, + "step": 1728 + }, + { + "epoch": 0.01729, + "grad_norm": 0.6053220105417224, + "learning_rate": 0.003, + "loss": 4.2796, + "step": 1729 + }, + { + "epoch": 0.0173, + "grad_norm": 0.7004864603830978, + "learning_rate": 0.003, + "loss": 4.256, + "step": 1730 + }, + { + "epoch": 0.01731, + "grad_norm": 0.6677625209808086, + "learning_rate": 0.003, + "loss": 4.2797, + "step": 1731 + }, + { + "epoch": 0.01732, + "grad_norm": 0.6458934000853458, + "learning_rate": 0.003, + "loss": 4.2877, + "step": 1732 + }, + { + "epoch": 0.01733, + "grad_norm": 0.7455706085018527, + "learning_rate": 0.003, + "loss": 4.2732, + "step": 1733 + }, + { + "epoch": 0.01734, + "grad_norm": 0.6265592537450486, + "learning_rate": 0.003, + "loss": 4.2823, + "step": 1734 + }, + { + "epoch": 0.01735, + "grad_norm": 0.5392857728589878, + "learning_rate": 0.003, + "loss": 4.2489, + "step": 1735 + }, + { + "epoch": 0.01736, + "grad_norm": 0.5707588496610313, + "learning_rate": 0.003, + "loss": 4.2503, + "step": 1736 + }, + { + "epoch": 0.01737, + "grad_norm": 0.5560424700678082, + "learning_rate": 0.003, + "loss": 4.2812, + "step": 1737 + }, + { + "epoch": 0.01738, + "grad_norm": 0.5730683002180033, + "learning_rate": 0.003, + "loss": 4.2686, + "step": 1738 + }, + { + "epoch": 0.01739, + "grad_norm": 0.6224581195400443, + "learning_rate": 0.003, + "loss": 4.2874, + "step": 1739 + }, + { + "epoch": 0.0174, + "grad_norm": 0.6572820925374313, + "learning_rate": 0.003, + "loss": 4.2884, + "step": 1740 + }, + { + "epoch": 0.01741, + "grad_norm": 0.7158790913353127, + "learning_rate": 0.003, + "loss": 4.3015, + "step": 1741 + }, + { + "epoch": 0.01742, + "grad_norm": 0.7666622653552904, + "learning_rate": 0.003, + "loss": 4.2911, + "step": 1742 + }, + { + "epoch": 0.01743, + "grad_norm": 0.7426449847041513, + "learning_rate": 0.003, + "loss": 4.3102, + "step": 1743 + }, + { + "epoch": 0.01744, + "grad_norm": 0.7275834660740842, + "learning_rate": 0.003, + "loss": 4.2916, + "step": 1744 + }, + { + "epoch": 0.01745, + "grad_norm": 0.6002910482818848, + "learning_rate": 0.003, + "loss": 4.2539, + "step": 1745 + }, + { + "epoch": 0.01746, + "grad_norm": 0.5594118226409661, + "learning_rate": 0.003, + "loss": 4.2741, + "step": 1746 + }, + { + "epoch": 0.01747, + "grad_norm": 0.5290431452891758, + "learning_rate": 0.003, + "loss": 4.2708, + "step": 1747 + }, + { + "epoch": 0.01748, + "grad_norm": 0.5055142153030036, + "learning_rate": 0.003, + "loss": 4.263, + "step": 1748 + }, + { + "epoch": 0.01749, + "grad_norm": 0.5966872692997083, + "learning_rate": 0.003, + "loss": 4.2613, + "step": 1749 + }, + { + "epoch": 0.0175, + "grad_norm": 0.5900848154260279, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 1750 + }, + { + "epoch": 0.01751, + "grad_norm": 0.6245345037465497, + "learning_rate": 0.003, + "loss": 4.2737, + "step": 1751 + }, + { + "epoch": 0.01752, + "grad_norm": 0.5951652358284407, + "learning_rate": 0.003, + "loss": 4.3127, + "step": 1752 + }, + { + "epoch": 0.01753, + "grad_norm": 0.5691433797032257, + "learning_rate": 0.003, + "loss": 4.2613, + "step": 1753 + }, + { + "epoch": 0.01754, + "grad_norm": 0.5243977651759597, + "learning_rate": 0.003, + "loss": 4.2904, + "step": 1754 + }, + { + "epoch": 0.01755, + "grad_norm": 0.5047009634705759, + "learning_rate": 0.003, + "loss": 4.23, + "step": 1755 + }, + { + "epoch": 0.01756, + "grad_norm": 0.5542549196861234, + "learning_rate": 0.003, + "loss": 4.3093, + "step": 1756 + }, + { + "epoch": 0.01757, + "grad_norm": 0.5493798919225148, + "learning_rate": 0.003, + "loss": 4.2887, + "step": 1757 + }, + { + "epoch": 0.01758, + "grad_norm": 0.5731223181742938, + "learning_rate": 0.003, + "loss": 4.2949, + "step": 1758 + }, + { + "epoch": 0.01759, + "grad_norm": 0.5640596577781694, + "learning_rate": 0.003, + "loss": 4.2619, + "step": 1759 + }, + { + "epoch": 0.0176, + "grad_norm": 0.49712782984125864, + "learning_rate": 0.003, + "loss": 4.2816, + "step": 1760 + }, + { + "epoch": 0.01761, + "grad_norm": 0.49486559076430775, + "learning_rate": 0.003, + "loss": 4.2648, + "step": 1761 + }, + { + "epoch": 0.01762, + "grad_norm": 0.5110004857223106, + "learning_rate": 0.003, + "loss": 4.2854, + "step": 1762 + }, + { + "epoch": 0.01763, + "grad_norm": 0.5633857856922079, + "learning_rate": 0.003, + "loss": 4.2792, + "step": 1763 + }, + { + "epoch": 0.01764, + "grad_norm": 0.7662844615211925, + "learning_rate": 0.003, + "loss": 4.255, + "step": 1764 + }, + { + "epoch": 0.01765, + "grad_norm": 0.8231706617421852, + "learning_rate": 0.003, + "loss": 4.2957, + "step": 1765 + }, + { + "epoch": 0.01766, + "grad_norm": 0.6112884625612803, + "learning_rate": 0.003, + "loss": 4.2946, + "step": 1766 + }, + { + "epoch": 0.01767, + "grad_norm": 0.5304115562781407, + "learning_rate": 0.003, + "loss": 4.2562, + "step": 1767 + }, + { + "epoch": 0.01768, + "grad_norm": 0.5693392116057188, + "learning_rate": 0.003, + "loss": 4.2663, + "step": 1768 + }, + { + "epoch": 0.01769, + "grad_norm": 0.5007419067309367, + "learning_rate": 0.003, + "loss": 4.2627, + "step": 1769 + }, + { + "epoch": 0.0177, + "grad_norm": 0.5337006106644646, + "learning_rate": 0.003, + "loss": 4.2481, + "step": 1770 + }, + { + "epoch": 0.01771, + "grad_norm": 0.4993647653872291, + "learning_rate": 0.003, + "loss": 4.278, + "step": 1771 + }, + { + "epoch": 0.01772, + "grad_norm": 0.49378669154496685, + "learning_rate": 0.003, + "loss": 4.2492, + "step": 1772 + }, + { + "epoch": 0.01773, + "grad_norm": 0.49668963256381116, + "learning_rate": 0.003, + "loss": 4.2435, + "step": 1773 + }, + { + "epoch": 0.01774, + "grad_norm": 0.4890453055402567, + "learning_rate": 0.003, + "loss": 4.2647, + "step": 1774 + }, + { + "epoch": 0.01775, + "grad_norm": 0.4981133550102386, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 1775 + }, + { + "epoch": 0.01776, + "grad_norm": 0.5331679722135182, + "learning_rate": 0.003, + "loss": 4.2445, + "step": 1776 + }, + { + "epoch": 0.01777, + "grad_norm": 0.5614293963448983, + "learning_rate": 0.003, + "loss": 4.269, + "step": 1777 + }, + { + "epoch": 0.01778, + "grad_norm": 0.5337530525849652, + "learning_rate": 0.003, + "loss": 4.2591, + "step": 1778 + }, + { + "epoch": 0.01779, + "grad_norm": 0.5219382960786771, + "learning_rate": 0.003, + "loss": 4.277, + "step": 1779 + }, + { + "epoch": 0.0178, + "grad_norm": 0.4801993403704907, + "learning_rate": 0.003, + "loss": 4.2462, + "step": 1780 + }, + { + "epoch": 0.01781, + "grad_norm": 0.5353966732851616, + "learning_rate": 0.003, + "loss": 4.2633, + "step": 1781 + }, + { + "epoch": 0.01782, + "grad_norm": 0.5707424190875742, + "learning_rate": 0.003, + "loss": 4.2884, + "step": 1782 + }, + { + "epoch": 0.01783, + "grad_norm": 0.6083204008367944, + "learning_rate": 0.003, + "loss": 4.269, + "step": 1783 + }, + { + "epoch": 0.01784, + "grad_norm": 0.5414123353142472, + "learning_rate": 0.003, + "loss": 4.2666, + "step": 1784 + }, + { + "epoch": 0.01785, + "grad_norm": 0.6099552330414602, + "learning_rate": 0.003, + "loss": 4.2322, + "step": 1785 + }, + { + "epoch": 0.01786, + "grad_norm": 0.6938449080529763, + "learning_rate": 0.003, + "loss": 4.2624, + "step": 1786 + }, + { + "epoch": 0.01787, + "grad_norm": 0.7097606505143458, + "learning_rate": 0.003, + "loss": 4.2787, + "step": 1787 + }, + { + "epoch": 0.01788, + "grad_norm": 0.7987231782657276, + "learning_rate": 0.003, + "loss": 4.29, + "step": 1788 + }, + { + "epoch": 0.01789, + "grad_norm": 0.9170384713763422, + "learning_rate": 0.003, + "loss": 4.2752, + "step": 1789 + }, + { + "epoch": 0.0179, + "grad_norm": 0.8912339815181158, + "learning_rate": 0.003, + "loss": 4.2973, + "step": 1790 + }, + { + "epoch": 0.01791, + "grad_norm": 0.7681052193269913, + "learning_rate": 0.003, + "loss": 4.3056, + "step": 1791 + }, + { + "epoch": 0.01792, + "grad_norm": 0.6429477629622415, + "learning_rate": 0.003, + "loss": 4.3028, + "step": 1792 + }, + { + "epoch": 0.01793, + "grad_norm": 0.7186534811578004, + "learning_rate": 0.003, + "loss": 4.2729, + "step": 1793 + }, + { + "epoch": 0.01794, + "grad_norm": 0.5765860920967305, + "learning_rate": 0.003, + "loss": 4.2799, + "step": 1794 + }, + { + "epoch": 0.01795, + "grad_norm": 0.5000657697525336, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 1795 + }, + { + "epoch": 0.01796, + "grad_norm": 0.5320047446020336, + "learning_rate": 0.003, + "loss": 4.2904, + "step": 1796 + }, + { + "epoch": 0.01797, + "grad_norm": 0.4504365845865774, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 1797 + }, + { + "epoch": 0.01798, + "grad_norm": 0.40462213723419516, + "learning_rate": 0.003, + "loss": 4.2636, + "step": 1798 + }, + { + "epoch": 0.01799, + "grad_norm": 0.35282082316813834, + "learning_rate": 0.003, + "loss": 4.2763, + "step": 1799 + }, + { + "epoch": 0.018, + "grad_norm": 0.3411687376041599, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 1800 + }, + { + "epoch": 0.01801, + "grad_norm": 0.3497200911452386, + "learning_rate": 0.003, + "loss": 4.2568, + "step": 1801 + }, + { + "epoch": 0.01802, + "grad_norm": 0.35422809832207447, + "learning_rate": 0.003, + "loss": 4.2731, + "step": 1802 + }, + { + "epoch": 0.01803, + "grad_norm": 0.3279859543333952, + "learning_rate": 0.003, + "loss": 4.277, + "step": 1803 + }, + { + "epoch": 0.01804, + "grad_norm": 0.37278065596161997, + "learning_rate": 0.003, + "loss": 4.2479, + "step": 1804 + }, + { + "epoch": 0.01805, + "grad_norm": 0.4167024120969314, + "learning_rate": 0.003, + "loss": 4.268, + "step": 1805 + }, + { + "epoch": 0.01806, + "grad_norm": 0.45338026173808493, + "learning_rate": 0.003, + "loss": 4.228, + "step": 1806 + }, + { + "epoch": 0.01807, + "grad_norm": 0.4492923467683076, + "learning_rate": 0.003, + "loss": 4.2649, + "step": 1807 + }, + { + "epoch": 0.01808, + "grad_norm": 0.4519532920743694, + "learning_rate": 0.003, + "loss": 4.2552, + "step": 1808 + }, + { + "epoch": 0.01809, + "grad_norm": 0.4794685836261005, + "learning_rate": 0.003, + "loss": 4.2751, + "step": 1809 + }, + { + "epoch": 0.0181, + "grad_norm": 0.6301644557370442, + "learning_rate": 0.003, + "loss": 4.275, + "step": 1810 + }, + { + "epoch": 0.01811, + "grad_norm": 0.8128009946323582, + "learning_rate": 0.003, + "loss": 4.2718, + "step": 1811 + }, + { + "epoch": 0.01812, + "grad_norm": 0.8427115921852621, + "learning_rate": 0.003, + "loss": 4.2615, + "step": 1812 + }, + { + "epoch": 0.01813, + "grad_norm": 0.7199083817416421, + "learning_rate": 0.003, + "loss": 4.3052, + "step": 1813 + }, + { + "epoch": 0.01814, + "grad_norm": 0.7440856146842654, + "learning_rate": 0.003, + "loss": 4.2773, + "step": 1814 + }, + { + "epoch": 0.01815, + "grad_norm": 0.6604668614903264, + "learning_rate": 0.003, + "loss": 4.2603, + "step": 1815 + }, + { + "epoch": 0.01816, + "grad_norm": 0.7213183194060501, + "learning_rate": 0.003, + "loss": 4.289, + "step": 1816 + }, + { + "epoch": 0.01817, + "grad_norm": 0.707160301614863, + "learning_rate": 0.003, + "loss": 4.2802, + "step": 1817 + }, + { + "epoch": 0.01818, + "grad_norm": 0.636608110327377, + "learning_rate": 0.003, + "loss": 4.2785, + "step": 1818 + }, + { + "epoch": 0.01819, + "grad_norm": 0.5804906977024753, + "learning_rate": 0.003, + "loss": 4.2581, + "step": 1819 + }, + { + "epoch": 0.0182, + "grad_norm": 0.5372919614803818, + "learning_rate": 0.003, + "loss": 4.2768, + "step": 1820 + }, + { + "epoch": 0.01821, + "grad_norm": 0.5873367244339422, + "learning_rate": 0.003, + "loss": 4.2652, + "step": 1821 + }, + { + "epoch": 0.01822, + "grad_norm": 0.6367155622372229, + "learning_rate": 0.003, + "loss": 4.278, + "step": 1822 + }, + { + "epoch": 0.01823, + "grad_norm": 0.7098801106756836, + "learning_rate": 0.003, + "loss": 4.2989, + "step": 1823 + }, + { + "epoch": 0.01824, + "grad_norm": 0.6249291972074501, + "learning_rate": 0.003, + "loss": 4.2755, + "step": 1824 + }, + { + "epoch": 0.01825, + "grad_norm": 0.5242106659607212, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 1825 + }, + { + "epoch": 0.01826, + "grad_norm": 0.644099441571583, + "learning_rate": 0.003, + "loss": 4.2915, + "step": 1826 + }, + { + "epoch": 0.01827, + "grad_norm": 0.6224752218569206, + "learning_rate": 0.003, + "loss": 4.2591, + "step": 1827 + }, + { + "epoch": 0.01828, + "grad_norm": 0.543186641981227, + "learning_rate": 0.003, + "loss": 4.2695, + "step": 1828 + }, + { + "epoch": 0.01829, + "grad_norm": 0.6845506683310987, + "learning_rate": 0.003, + "loss": 4.2898, + "step": 1829 + }, + { + "epoch": 0.0183, + "grad_norm": 0.879601600440537, + "learning_rate": 0.003, + "loss": 4.2972, + "step": 1830 + }, + { + "epoch": 0.01831, + "grad_norm": 0.9844682114288815, + "learning_rate": 0.003, + "loss": 4.2992, + "step": 1831 + }, + { + "epoch": 0.01832, + "grad_norm": 0.8821724273705098, + "learning_rate": 0.003, + "loss": 4.2978, + "step": 1832 + }, + { + "epoch": 0.01833, + "grad_norm": 0.7330000924703792, + "learning_rate": 0.003, + "loss": 4.3267, + "step": 1833 + }, + { + "epoch": 0.01834, + "grad_norm": 0.68997205114043, + "learning_rate": 0.003, + "loss": 4.2918, + "step": 1834 + }, + { + "epoch": 0.01835, + "grad_norm": 0.7006645684897048, + "learning_rate": 0.003, + "loss": 4.2921, + "step": 1835 + }, + { + "epoch": 0.01836, + "grad_norm": 0.7370828132611139, + "learning_rate": 0.003, + "loss": 4.2868, + "step": 1836 + }, + { + "epoch": 0.01837, + "grad_norm": 0.8137814608022381, + "learning_rate": 0.003, + "loss": 4.2934, + "step": 1837 + }, + { + "epoch": 0.01838, + "grad_norm": 0.6630708544495384, + "learning_rate": 0.003, + "loss": 4.2935, + "step": 1838 + }, + { + "epoch": 0.01839, + "grad_norm": 0.6770506893324072, + "learning_rate": 0.003, + "loss": 4.3163, + "step": 1839 + }, + { + "epoch": 0.0184, + "grad_norm": 0.772243829580562, + "learning_rate": 0.003, + "loss": 4.315, + "step": 1840 + }, + { + "epoch": 0.01841, + "grad_norm": 0.7260652894274943, + "learning_rate": 0.003, + "loss": 4.2507, + "step": 1841 + }, + { + "epoch": 0.01842, + "grad_norm": 0.7455618963662838, + "learning_rate": 0.003, + "loss": 4.2889, + "step": 1842 + }, + { + "epoch": 0.01843, + "grad_norm": 0.5629263660723788, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 1843 + }, + { + "epoch": 0.01844, + "grad_norm": 0.44782374021160304, + "learning_rate": 0.003, + "loss": 4.2542, + "step": 1844 + }, + { + "epoch": 0.01845, + "grad_norm": 0.44338335058672285, + "learning_rate": 0.003, + "loss": 4.2569, + "step": 1845 + }, + { + "epoch": 0.01846, + "grad_norm": 0.3533576160559802, + "learning_rate": 0.003, + "loss": 4.2775, + "step": 1846 + }, + { + "epoch": 0.01847, + "grad_norm": 0.36624369667968887, + "learning_rate": 0.003, + "loss": 4.2733, + "step": 1847 + }, + { + "epoch": 0.01848, + "grad_norm": 0.3515311739049859, + "learning_rate": 0.003, + "loss": 4.2847, + "step": 1848 + }, + { + "epoch": 0.01849, + "grad_norm": 0.31316963159896893, + "learning_rate": 0.003, + "loss": 4.2827, + "step": 1849 + }, + { + "epoch": 0.0185, + "grad_norm": 0.3416820848274596, + "learning_rate": 0.003, + "loss": 4.2703, + "step": 1850 + }, + { + "epoch": 0.01851, + "grad_norm": 0.3489689830102001, + "learning_rate": 0.003, + "loss": 4.2512, + "step": 1851 + }, + { + "epoch": 0.01852, + "grad_norm": 0.3703418463232587, + "learning_rate": 0.003, + "loss": 4.2517, + "step": 1852 + }, + { + "epoch": 0.01853, + "grad_norm": 0.41368285825954554, + "learning_rate": 0.003, + "loss": 4.2519, + "step": 1853 + }, + { + "epoch": 0.01854, + "grad_norm": 0.43320899613747116, + "learning_rate": 0.003, + "loss": 4.2834, + "step": 1854 + }, + { + "epoch": 0.01855, + "grad_norm": 0.5253237000651575, + "learning_rate": 0.003, + "loss": 4.2689, + "step": 1855 + }, + { + "epoch": 0.01856, + "grad_norm": 0.5779002753152843, + "learning_rate": 0.003, + "loss": 4.2553, + "step": 1856 + }, + { + "epoch": 0.01857, + "grad_norm": 0.593959858084494, + "learning_rate": 0.003, + "loss": 4.241, + "step": 1857 + }, + { + "epoch": 0.01858, + "grad_norm": 0.5126648296135959, + "learning_rate": 0.003, + "loss": 4.2479, + "step": 1858 + }, + { + "epoch": 0.01859, + "grad_norm": 0.45559130929525077, + "learning_rate": 0.003, + "loss": 4.2412, + "step": 1859 + }, + { + "epoch": 0.0186, + "grad_norm": 0.4806055065781757, + "learning_rate": 0.003, + "loss": 4.2254, + "step": 1860 + }, + { + "epoch": 0.01861, + "grad_norm": 0.48294706675854066, + "learning_rate": 0.003, + "loss": 4.2694, + "step": 1861 + }, + { + "epoch": 0.01862, + "grad_norm": 0.4664947919884159, + "learning_rate": 0.003, + "loss": 4.2335, + "step": 1862 + }, + { + "epoch": 0.01863, + "grad_norm": 0.4819198640587951, + "learning_rate": 0.003, + "loss": 4.2428, + "step": 1863 + }, + { + "epoch": 0.01864, + "grad_norm": 0.5946115673610074, + "learning_rate": 0.003, + "loss": 4.2797, + "step": 1864 + }, + { + "epoch": 0.01865, + "grad_norm": 0.7290398384671329, + "learning_rate": 0.003, + "loss": 4.2637, + "step": 1865 + }, + { + "epoch": 0.01866, + "grad_norm": 0.8160446708452547, + "learning_rate": 0.003, + "loss": 4.2688, + "step": 1866 + }, + { + "epoch": 0.01867, + "grad_norm": 0.7596737913536388, + "learning_rate": 0.003, + "loss": 4.2479, + "step": 1867 + }, + { + "epoch": 0.01868, + "grad_norm": 0.7512897467429733, + "learning_rate": 0.003, + "loss": 4.265, + "step": 1868 + }, + { + "epoch": 0.01869, + "grad_norm": 0.899865336798572, + "learning_rate": 0.003, + "loss": 4.2896, + "step": 1869 + }, + { + "epoch": 0.0187, + "grad_norm": 0.8087963711485776, + "learning_rate": 0.003, + "loss": 4.2851, + "step": 1870 + }, + { + "epoch": 0.01871, + "grad_norm": 0.6481673959520611, + "learning_rate": 0.003, + "loss": 4.2524, + "step": 1871 + }, + { + "epoch": 0.01872, + "grad_norm": 0.5923778344000376, + "learning_rate": 0.003, + "loss": 4.2969, + "step": 1872 + }, + { + "epoch": 0.01873, + "grad_norm": 0.5580453248479986, + "learning_rate": 0.003, + "loss": 4.2558, + "step": 1873 + }, + { + "epoch": 0.01874, + "grad_norm": 0.6129883764486191, + "learning_rate": 0.003, + "loss": 4.2619, + "step": 1874 + }, + { + "epoch": 0.01875, + "grad_norm": 0.5660814639175555, + "learning_rate": 0.003, + "loss": 4.2767, + "step": 1875 + }, + { + "epoch": 0.01876, + "grad_norm": 0.5484550288366968, + "learning_rate": 0.003, + "loss": 4.2804, + "step": 1876 + }, + { + "epoch": 0.01877, + "grad_norm": 0.5413620281670742, + "learning_rate": 0.003, + "loss": 4.2852, + "step": 1877 + }, + { + "epoch": 0.01878, + "grad_norm": 0.5151059596185084, + "learning_rate": 0.003, + "loss": 4.2658, + "step": 1878 + }, + { + "epoch": 0.01879, + "grad_norm": 0.5108820927917921, + "learning_rate": 0.003, + "loss": 4.2428, + "step": 1879 + }, + { + "epoch": 0.0188, + "grad_norm": 0.480687658081291, + "learning_rate": 0.003, + "loss": 4.253, + "step": 1880 + }, + { + "epoch": 0.01881, + "grad_norm": 0.4006710963490709, + "learning_rate": 0.003, + "loss": 4.2442, + "step": 1881 + }, + { + "epoch": 0.01882, + "grad_norm": 0.4041555926045392, + "learning_rate": 0.003, + "loss": 4.2713, + "step": 1882 + }, + { + "epoch": 0.01883, + "grad_norm": 0.35889370988759256, + "learning_rate": 0.003, + "loss": 4.2426, + "step": 1883 + }, + { + "epoch": 0.01884, + "grad_norm": 0.3548857794793813, + "learning_rate": 0.003, + "loss": 4.2417, + "step": 1884 + }, + { + "epoch": 0.01885, + "grad_norm": 0.35029061193490524, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 1885 + }, + { + "epoch": 0.01886, + "grad_norm": 0.44501447232013214, + "learning_rate": 0.003, + "loss": 4.2578, + "step": 1886 + }, + { + "epoch": 0.01887, + "grad_norm": 0.5229549741250542, + "learning_rate": 0.003, + "loss": 4.2541, + "step": 1887 + }, + { + "epoch": 0.01888, + "grad_norm": 0.6745619001945896, + "learning_rate": 0.003, + "loss": 4.2454, + "step": 1888 + }, + { + "epoch": 0.01889, + "grad_norm": 0.6923119713487843, + "learning_rate": 0.003, + "loss": 4.265, + "step": 1889 + }, + { + "epoch": 0.0189, + "grad_norm": 0.58679702829643, + "learning_rate": 0.003, + "loss": 4.2409, + "step": 1890 + }, + { + "epoch": 0.01891, + "grad_norm": 0.646800864954939, + "learning_rate": 0.003, + "loss": 4.2919, + "step": 1891 + }, + { + "epoch": 0.01892, + "grad_norm": 0.6502623001706804, + "learning_rate": 0.003, + "loss": 4.2707, + "step": 1892 + }, + { + "epoch": 0.01893, + "grad_norm": 0.5615534003702374, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 1893 + }, + { + "epoch": 0.01894, + "grad_norm": 0.5991987877315249, + "learning_rate": 0.003, + "loss": 4.254, + "step": 1894 + }, + { + "epoch": 0.01895, + "grad_norm": 0.5685602484818931, + "learning_rate": 0.003, + "loss": 4.2409, + "step": 1895 + }, + { + "epoch": 0.01896, + "grad_norm": 0.5273740611308925, + "learning_rate": 0.003, + "loss": 4.2621, + "step": 1896 + }, + { + "epoch": 0.01897, + "grad_norm": 0.4634723554903588, + "learning_rate": 0.003, + "loss": 4.2587, + "step": 1897 + }, + { + "epoch": 0.01898, + "grad_norm": 0.49012578983884775, + "learning_rate": 0.003, + "loss": 4.2271, + "step": 1898 + }, + { + "epoch": 0.01899, + "grad_norm": 0.5391967127125681, + "learning_rate": 0.003, + "loss": 4.2462, + "step": 1899 + }, + { + "epoch": 0.019, + "grad_norm": 0.5314902668241436, + "learning_rate": 0.003, + "loss": 4.2384, + "step": 1900 + }, + { + "epoch": 0.01901, + "grad_norm": 0.5694638994637579, + "learning_rate": 0.003, + "loss": 4.2871, + "step": 1901 + }, + { + "epoch": 0.01902, + "grad_norm": 0.5590323874513298, + "learning_rate": 0.003, + "loss": 4.2641, + "step": 1902 + }, + { + "epoch": 0.01903, + "grad_norm": 0.5475545528889447, + "learning_rate": 0.003, + "loss": 4.2196, + "step": 1903 + }, + { + "epoch": 0.01904, + "grad_norm": 0.6218453695681301, + "learning_rate": 0.003, + "loss": 4.2791, + "step": 1904 + }, + { + "epoch": 0.01905, + "grad_norm": 0.591162548550656, + "learning_rate": 0.003, + "loss": 4.2534, + "step": 1905 + }, + { + "epoch": 0.01906, + "grad_norm": 0.5128889662513827, + "learning_rate": 0.003, + "loss": 4.2676, + "step": 1906 + }, + { + "epoch": 0.01907, + "grad_norm": 0.5047288261347648, + "learning_rate": 0.003, + "loss": 4.2599, + "step": 1907 + }, + { + "epoch": 0.01908, + "grad_norm": 0.4667283203300201, + "learning_rate": 0.003, + "loss": 4.2726, + "step": 1908 + }, + { + "epoch": 0.01909, + "grad_norm": 0.5022149031562609, + "learning_rate": 0.003, + "loss": 4.2382, + "step": 1909 + }, + { + "epoch": 0.0191, + "grad_norm": 0.5432392826551734, + "learning_rate": 0.003, + "loss": 4.2518, + "step": 1910 + }, + { + "epoch": 0.01911, + "grad_norm": 0.49523295523680083, + "learning_rate": 0.003, + "loss": 4.2241, + "step": 1911 + }, + { + "epoch": 0.01912, + "grad_norm": 0.5206775073204631, + "learning_rate": 0.003, + "loss": 4.2363, + "step": 1912 + }, + { + "epoch": 0.01913, + "grad_norm": 0.523325426364381, + "learning_rate": 0.003, + "loss": 4.2544, + "step": 1913 + }, + { + "epoch": 0.01914, + "grad_norm": 0.5843680788676122, + "learning_rate": 0.003, + "loss": 4.2743, + "step": 1914 + }, + { + "epoch": 0.01915, + "grad_norm": 0.6905336390991205, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 1915 + }, + { + "epoch": 0.01916, + "grad_norm": 0.8314031188331161, + "learning_rate": 0.003, + "loss": 4.2512, + "step": 1916 + }, + { + "epoch": 0.01917, + "grad_norm": 0.9750007322129252, + "learning_rate": 0.003, + "loss": 4.2904, + "step": 1917 + }, + { + "epoch": 0.01918, + "grad_norm": 0.9797925708116749, + "learning_rate": 0.003, + "loss": 4.2842, + "step": 1918 + }, + { + "epoch": 0.01919, + "grad_norm": 0.7747220152275403, + "learning_rate": 0.003, + "loss": 4.2834, + "step": 1919 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7189857079350012, + "learning_rate": 0.003, + "loss": 4.2819, + "step": 1920 + }, + { + "epoch": 0.01921, + "grad_norm": 0.7219748871978388, + "learning_rate": 0.003, + "loss": 4.2705, + "step": 1921 + }, + { + "epoch": 0.01922, + "grad_norm": 0.7146468956621638, + "learning_rate": 0.003, + "loss": 4.2862, + "step": 1922 + }, + { + "epoch": 0.01923, + "grad_norm": 0.6697224209681402, + "learning_rate": 0.003, + "loss": 4.2923, + "step": 1923 + }, + { + "epoch": 0.01924, + "grad_norm": 0.7063194554901026, + "learning_rate": 0.003, + "loss": 4.2924, + "step": 1924 + }, + { + "epoch": 0.01925, + "grad_norm": 0.6608146496442958, + "learning_rate": 0.003, + "loss": 4.2782, + "step": 1925 + }, + { + "epoch": 0.01926, + "grad_norm": 0.6058286924144092, + "learning_rate": 0.003, + "loss": 4.2871, + "step": 1926 + }, + { + "epoch": 0.01927, + "grad_norm": 0.5582247211706641, + "learning_rate": 0.003, + "loss": 4.2731, + "step": 1927 + }, + { + "epoch": 0.01928, + "grad_norm": 0.5497301592853885, + "learning_rate": 0.003, + "loss": 4.2438, + "step": 1928 + }, + { + "epoch": 0.01929, + "grad_norm": 0.5697976403038864, + "learning_rate": 0.003, + "loss": 4.2661, + "step": 1929 + }, + { + "epoch": 0.0193, + "grad_norm": 0.5669233105595727, + "learning_rate": 0.003, + "loss": 4.253, + "step": 1930 + }, + { + "epoch": 0.01931, + "grad_norm": 0.4328335869920709, + "learning_rate": 0.003, + "loss": 4.2488, + "step": 1931 + }, + { + "epoch": 0.01932, + "grad_norm": 0.37950505638182785, + "learning_rate": 0.003, + "loss": 4.2566, + "step": 1932 + }, + { + "epoch": 0.01933, + "grad_norm": 0.3504964342922692, + "learning_rate": 0.003, + "loss": 4.2408, + "step": 1933 + }, + { + "epoch": 0.01934, + "grad_norm": 0.3616114126450499, + "learning_rate": 0.003, + "loss": 4.2472, + "step": 1934 + }, + { + "epoch": 0.01935, + "grad_norm": 0.3586196056492206, + "learning_rate": 0.003, + "loss": 4.259, + "step": 1935 + }, + { + "epoch": 0.01936, + "grad_norm": 0.4036363099668179, + "learning_rate": 0.003, + "loss": 4.2404, + "step": 1936 + }, + { + "epoch": 0.01937, + "grad_norm": 0.41909297435676146, + "learning_rate": 0.003, + "loss": 4.2527, + "step": 1937 + }, + { + "epoch": 0.01938, + "grad_norm": 0.439340208584005, + "learning_rate": 0.003, + "loss": 4.2528, + "step": 1938 + }, + { + "epoch": 0.01939, + "grad_norm": 0.4942036615177103, + "learning_rate": 0.003, + "loss": 4.2748, + "step": 1939 + }, + { + "epoch": 0.0194, + "grad_norm": 0.5753579784669907, + "learning_rate": 0.003, + "loss": 4.2537, + "step": 1940 + }, + { + "epoch": 0.01941, + "grad_norm": 0.7525206839134082, + "learning_rate": 0.003, + "loss": 4.2429, + "step": 1941 + }, + { + "epoch": 0.01942, + "grad_norm": 0.8127296854745015, + "learning_rate": 0.003, + "loss": 4.2788, + "step": 1942 + }, + { + "epoch": 0.01943, + "grad_norm": 0.7505954536075328, + "learning_rate": 0.003, + "loss": 4.2474, + "step": 1943 + }, + { + "epoch": 0.01944, + "grad_norm": 0.6966879842895444, + "learning_rate": 0.003, + "loss": 4.292, + "step": 1944 + }, + { + "epoch": 0.01945, + "grad_norm": 0.5317620511293283, + "learning_rate": 0.003, + "loss": 4.2513, + "step": 1945 + }, + { + "epoch": 0.01946, + "grad_norm": 0.5396271588936495, + "learning_rate": 0.003, + "loss": 4.2483, + "step": 1946 + }, + { + "epoch": 0.01947, + "grad_norm": 0.4953395916242608, + "learning_rate": 0.003, + "loss": 4.2539, + "step": 1947 + }, + { + "epoch": 0.01948, + "grad_norm": 0.43095282528265144, + "learning_rate": 0.003, + "loss": 4.2724, + "step": 1948 + }, + { + "epoch": 0.01949, + "grad_norm": 0.4581100416309599, + "learning_rate": 0.003, + "loss": 4.2647, + "step": 1949 + }, + { + "epoch": 0.0195, + "grad_norm": 0.4138069104419874, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 1950 + }, + { + "epoch": 0.01951, + "grad_norm": 0.453383618119804, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 1951 + }, + { + "epoch": 0.01952, + "grad_norm": 0.5050716895331453, + "learning_rate": 0.003, + "loss": 4.2541, + "step": 1952 + }, + { + "epoch": 0.01953, + "grad_norm": 0.5444826544170946, + "learning_rate": 0.003, + "loss": 4.2612, + "step": 1953 + }, + { + "epoch": 0.01954, + "grad_norm": 0.5817368201949349, + "learning_rate": 0.003, + "loss": 4.2642, + "step": 1954 + }, + { + "epoch": 0.01955, + "grad_norm": 0.5828427095685109, + "learning_rate": 0.003, + "loss": 4.257, + "step": 1955 + }, + { + "epoch": 0.01956, + "grad_norm": 0.6033336279941961, + "learning_rate": 0.003, + "loss": 4.2559, + "step": 1956 + }, + { + "epoch": 0.01957, + "grad_norm": 0.4961523479078479, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 1957 + }, + { + "epoch": 0.01958, + "grad_norm": 0.44186549075636594, + "learning_rate": 0.003, + "loss": 4.2115, + "step": 1958 + }, + { + "epoch": 0.01959, + "grad_norm": 0.5087237373575997, + "learning_rate": 0.003, + "loss": 4.2394, + "step": 1959 + }, + { + "epoch": 0.0196, + "grad_norm": 0.5262591797086301, + "learning_rate": 0.003, + "loss": 4.2322, + "step": 1960 + }, + { + "epoch": 0.01961, + "grad_norm": 0.68395452085867, + "learning_rate": 0.003, + "loss": 4.2394, + "step": 1961 + }, + { + "epoch": 0.01962, + "grad_norm": 0.8016832589931029, + "learning_rate": 0.003, + "loss": 4.2481, + "step": 1962 + }, + { + "epoch": 0.01963, + "grad_norm": 0.75220900113814, + "learning_rate": 0.003, + "loss": 4.2388, + "step": 1963 + }, + { + "epoch": 0.01964, + "grad_norm": 0.6357936481488424, + "learning_rate": 0.003, + "loss": 4.2611, + "step": 1964 + }, + { + "epoch": 0.01965, + "grad_norm": 0.5451687729946599, + "learning_rate": 0.003, + "loss": 4.2493, + "step": 1965 + }, + { + "epoch": 0.01966, + "grad_norm": 0.5793603889291398, + "learning_rate": 0.003, + "loss": 4.2538, + "step": 1966 + }, + { + "epoch": 0.01967, + "grad_norm": 0.5157681292282557, + "learning_rate": 0.003, + "loss": 4.2253, + "step": 1967 + }, + { + "epoch": 0.01968, + "grad_norm": 0.5440002063072249, + "learning_rate": 0.003, + "loss": 4.2502, + "step": 1968 + }, + { + "epoch": 0.01969, + "grad_norm": 0.5300695855996388, + "learning_rate": 0.003, + "loss": 4.2446, + "step": 1969 + }, + { + "epoch": 0.0197, + "grad_norm": 0.47595129333192576, + "learning_rate": 0.003, + "loss": 4.2426, + "step": 1970 + }, + { + "epoch": 0.01971, + "grad_norm": 0.5285948280548484, + "learning_rate": 0.003, + "loss": 4.2314, + "step": 1971 + }, + { + "epoch": 0.01972, + "grad_norm": 0.5935023814870325, + "learning_rate": 0.003, + "loss": 4.2369, + "step": 1972 + }, + { + "epoch": 0.01973, + "grad_norm": 0.6689295429120606, + "learning_rate": 0.003, + "loss": 4.2228, + "step": 1973 + }, + { + "epoch": 0.01974, + "grad_norm": 0.6805455931144389, + "learning_rate": 0.003, + "loss": 4.2627, + "step": 1974 + }, + { + "epoch": 0.01975, + "grad_norm": 0.6851649356245609, + "learning_rate": 0.003, + "loss": 4.2574, + "step": 1975 + }, + { + "epoch": 0.01976, + "grad_norm": 0.6359068486728622, + "learning_rate": 0.003, + "loss": 4.2445, + "step": 1976 + }, + { + "epoch": 0.01977, + "grad_norm": 0.5752983961700786, + "learning_rate": 0.003, + "loss": 4.2775, + "step": 1977 + }, + { + "epoch": 0.01978, + "grad_norm": 0.5885328424294697, + "learning_rate": 0.003, + "loss": 4.2773, + "step": 1978 + }, + { + "epoch": 0.01979, + "grad_norm": 0.6114820249853535, + "learning_rate": 0.003, + "loss": 4.266, + "step": 1979 + }, + { + "epoch": 0.0198, + "grad_norm": 0.6725244919020471, + "learning_rate": 0.003, + "loss": 4.2495, + "step": 1980 + }, + { + "epoch": 0.01981, + "grad_norm": 0.6899410283984171, + "learning_rate": 0.003, + "loss": 4.2347, + "step": 1981 + }, + { + "epoch": 0.01982, + "grad_norm": 0.7680678867968328, + "learning_rate": 0.003, + "loss": 4.2688, + "step": 1982 + }, + { + "epoch": 0.01983, + "grad_norm": 0.8666192600294306, + "learning_rate": 0.003, + "loss": 4.2665, + "step": 1983 + }, + { + "epoch": 0.01984, + "grad_norm": 0.7590492652886811, + "learning_rate": 0.003, + "loss": 4.2796, + "step": 1984 + }, + { + "epoch": 0.01985, + "grad_norm": 0.6539060914690586, + "learning_rate": 0.003, + "loss": 4.2645, + "step": 1985 + }, + { + "epoch": 0.01986, + "grad_norm": 0.715503037740265, + "learning_rate": 0.003, + "loss": 4.2958, + "step": 1986 + }, + { + "epoch": 0.01987, + "grad_norm": 0.7397978022844682, + "learning_rate": 0.003, + "loss": 4.254, + "step": 1987 + }, + { + "epoch": 0.01988, + "grad_norm": 0.7632034859099814, + "learning_rate": 0.003, + "loss": 4.2717, + "step": 1988 + }, + { + "epoch": 0.01989, + "grad_norm": 0.7294621095424992, + "learning_rate": 0.003, + "loss": 4.2599, + "step": 1989 + }, + { + "epoch": 0.0199, + "grad_norm": 0.7424352842692145, + "learning_rate": 0.003, + "loss": 4.274, + "step": 1990 + }, + { + "epoch": 0.01991, + "grad_norm": 0.6805432395640066, + "learning_rate": 0.003, + "loss": 4.2481, + "step": 1991 + }, + { + "epoch": 0.01992, + "grad_norm": 0.6518823852851582, + "learning_rate": 0.003, + "loss": 4.2906, + "step": 1992 + }, + { + "epoch": 0.01993, + "grad_norm": 0.5385914746778909, + "learning_rate": 0.003, + "loss": 4.2663, + "step": 1993 + }, + { + "epoch": 0.01994, + "grad_norm": 0.5600374553588182, + "learning_rate": 0.003, + "loss": 4.2843, + "step": 1994 + }, + { + "epoch": 0.01995, + "grad_norm": 0.5777853734403589, + "learning_rate": 0.003, + "loss": 4.2772, + "step": 1995 + }, + { + "epoch": 0.01996, + "grad_norm": 0.5901601073427315, + "learning_rate": 0.003, + "loss": 4.2466, + "step": 1996 + }, + { + "epoch": 0.01997, + "grad_norm": 0.5583731211765088, + "learning_rate": 0.003, + "loss": 4.2745, + "step": 1997 + }, + { + "epoch": 0.01998, + "grad_norm": 0.5499515734950207, + "learning_rate": 0.003, + "loss": 4.2592, + "step": 1998 + }, + { + "epoch": 0.01999, + "grad_norm": 0.5390040915979398, + "learning_rate": 0.003, + "loss": 4.2488, + "step": 1999 + }, + { + "epoch": 0.02, + "grad_norm": 0.6085613854495133, + "learning_rate": 0.003, + "loss": 4.2357, + "step": 2000 + }, + { + "epoch": 0.02001, + "grad_norm": 0.6642300277845461, + "learning_rate": 0.003, + "loss": 4.2761, + "step": 2001 + }, + { + "epoch": 0.02002, + "grad_norm": 0.6718563957297456, + "learning_rate": 0.003, + "loss": 4.2714, + "step": 2002 + }, + { + "epoch": 0.02003, + "grad_norm": 0.5656785141879644, + "learning_rate": 0.003, + "loss": 4.2645, + "step": 2003 + }, + { + "epoch": 0.02004, + "grad_norm": 0.5079141091367905, + "learning_rate": 0.003, + "loss": 4.2507, + "step": 2004 + }, + { + "epoch": 0.02005, + "grad_norm": 0.5217306980533027, + "learning_rate": 0.003, + "loss": 4.254, + "step": 2005 + }, + { + "epoch": 0.02006, + "grad_norm": 0.5701023982360506, + "learning_rate": 0.003, + "loss": 4.2527, + "step": 2006 + }, + { + "epoch": 0.02007, + "grad_norm": 0.5456537194359483, + "learning_rate": 0.003, + "loss": 4.2671, + "step": 2007 + }, + { + "epoch": 0.02008, + "grad_norm": 0.4424819860056267, + "learning_rate": 0.003, + "loss": 4.2538, + "step": 2008 + }, + { + "epoch": 0.02009, + "grad_norm": 0.4167499120193345, + "learning_rate": 0.003, + "loss": 4.2487, + "step": 2009 + }, + { + "epoch": 0.0201, + "grad_norm": 0.4192128686531188, + "learning_rate": 0.003, + "loss": 4.259, + "step": 2010 + }, + { + "epoch": 0.02011, + "grad_norm": 0.3848165653499132, + "learning_rate": 0.003, + "loss": 4.2447, + "step": 2011 + }, + { + "epoch": 0.02012, + "grad_norm": 0.433602941803945, + "learning_rate": 0.003, + "loss": 4.2428, + "step": 2012 + }, + { + "epoch": 0.02013, + "grad_norm": 0.4888730590149885, + "learning_rate": 0.003, + "loss": 4.2535, + "step": 2013 + }, + { + "epoch": 0.02014, + "grad_norm": 0.5673912976929151, + "learning_rate": 0.003, + "loss": 4.2387, + "step": 2014 + }, + { + "epoch": 0.02015, + "grad_norm": 0.5541499948622309, + "learning_rate": 0.003, + "loss": 4.2361, + "step": 2015 + }, + { + "epoch": 0.02016, + "grad_norm": 0.5126326745775238, + "learning_rate": 0.003, + "loss": 4.2417, + "step": 2016 + }, + { + "epoch": 0.02017, + "grad_norm": 0.4179261623064346, + "learning_rate": 0.003, + "loss": 4.2316, + "step": 2017 + }, + { + "epoch": 0.02018, + "grad_norm": 0.3898056312044461, + "learning_rate": 0.003, + "loss": 4.2341, + "step": 2018 + }, + { + "epoch": 0.02019, + "grad_norm": 0.3954814595294976, + "learning_rate": 0.003, + "loss": 4.2373, + "step": 2019 + }, + { + "epoch": 0.0202, + "grad_norm": 0.4213180029076697, + "learning_rate": 0.003, + "loss": 4.2455, + "step": 2020 + }, + { + "epoch": 0.02021, + "grad_norm": 0.46415695965874665, + "learning_rate": 0.003, + "loss": 4.2601, + "step": 2021 + }, + { + "epoch": 0.02022, + "grad_norm": 0.5255015046967251, + "learning_rate": 0.003, + "loss": 4.2204, + "step": 2022 + }, + { + "epoch": 0.02023, + "grad_norm": 0.494786978730351, + "learning_rate": 0.003, + "loss": 4.1939, + "step": 2023 + }, + { + "epoch": 0.02024, + "grad_norm": 0.4706750110587683, + "learning_rate": 0.003, + "loss": 4.2183, + "step": 2024 + }, + { + "epoch": 0.02025, + "grad_norm": 0.4845475406638408, + "learning_rate": 0.003, + "loss": 4.2698, + "step": 2025 + }, + { + "epoch": 0.02026, + "grad_norm": 0.5870205306653891, + "learning_rate": 0.003, + "loss": 4.2539, + "step": 2026 + }, + { + "epoch": 0.02027, + "grad_norm": 0.5994657871324464, + "learning_rate": 0.003, + "loss": 4.2502, + "step": 2027 + }, + { + "epoch": 0.02028, + "grad_norm": 0.5969136086797949, + "learning_rate": 0.003, + "loss": 4.2336, + "step": 2028 + }, + { + "epoch": 0.02029, + "grad_norm": 0.6010561087024289, + "learning_rate": 0.003, + "loss": 4.2419, + "step": 2029 + }, + { + "epoch": 0.0203, + "grad_norm": 0.6795949256132597, + "learning_rate": 0.003, + "loss": 4.2385, + "step": 2030 + }, + { + "epoch": 0.02031, + "grad_norm": 0.8760173401758673, + "learning_rate": 0.003, + "loss": 4.2538, + "step": 2031 + }, + { + "epoch": 0.02032, + "grad_norm": 0.9553317966985612, + "learning_rate": 0.003, + "loss": 4.2369, + "step": 2032 + }, + { + "epoch": 0.02033, + "grad_norm": 0.8062059561231115, + "learning_rate": 0.003, + "loss": 4.2804, + "step": 2033 + }, + { + "epoch": 0.02034, + "grad_norm": 0.6544594771524366, + "learning_rate": 0.003, + "loss": 4.2639, + "step": 2034 + }, + { + "epoch": 0.02035, + "grad_norm": 0.5631710363420367, + "learning_rate": 0.003, + "loss": 4.2451, + "step": 2035 + }, + { + "epoch": 0.02036, + "grad_norm": 0.5975164123168384, + "learning_rate": 0.003, + "loss": 4.2483, + "step": 2036 + }, + { + "epoch": 0.02037, + "grad_norm": 0.5354587611248767, + "learning_rate": 0.003, + "loss": 4.2582, + "step": 2037 + }, + { + "epoch": 0.02038, + "grad_norm": 0.421237418399427, + "learning_rate": 0.003, + "loss": 4.2353, + "step": 2038 + }, + { + "epoch": 0.02039, + "grad_norm": 0.4274320650935603, + "learning_rate": 0.003, + "loss": 4.2464, + "step": 2039 + }, + { + "epoch": 0.0204, + "grad_norm": 0.4164176001256727, + "learning_rate": 0.003, + "loss": 4.2496, + "step": 2040 + }, + { + "epoch": 0.02041, + "grad_norm": 0.4311810995214273, + "learning_rate": 0.003, + "loss": 4.2107, + "step": 2041 + }, + { + "epoch": 0.02042, + "grad_norm": 0.47918244910234625, + "learning_rate": 0.003, + "loss": 4.2125, + "step": 2042 + }, + { + "epoch": 0.02043, + "grad_norm": 0.5311186564484092, + "learning_rate": 0.003, + "loss": 4.2587, + "step": 2043 + }, + { + "epoch": 0.02044, + "grad_norm": 0.5775700762927194, + "learning_rate": 0.003, + "loss": 4.2349, + "step": 2044 + }, + { + "epoch": 0.02045, + "grad_norm": 0.64581468780677, + "learning_rate": 0.003, + "loss": 4.2514, + "step": 2045 + }, + { + "epoch": 0.02046, + "grad_norm": 0.6099280998923168, + "learning_rate": 0.003, + "loss": 4.2295, + "step": 2046 + }, + { + "epoch": 0.02047, + "grad_norm": 0.49206984816447724, + "learning_rate": 0.003, + "loss": 4.2619, + "step": 2047 + }, + { + "epoch": 0.02048, + "grad_norm": 0.6576416076375511, + "learning_rate": 0.003, + "loss": 4.2493, + "step": 2048 + }, + { + "epoch": 0.02049, + "grad_norm": 0.720911622296069, + "learning_rate": 0.003, + "loss": 4.2776, + "step": 2049 + }, + { + "epoch": 0.0205, + "grad_norm": 0.6221494156113543, + "learning_rate": 0.003, + "loss": 4.2218, + "step": 2050 + }, + { + "epoch": 0.02051, + "grad_norm": 0.5686539010207549, + "learning_rate": 0.003, + "loss": 4.2223, + "step": 2051 + }, + { + "epoch": 0.02052, + "grad_norm": 0.5121857318353746, + "learning_rate": 0.003, + "loss": 4.2222, + "step": 2052 + }, + { + "epoch": 0.02053, + "grad_norm": 0.4272252497200639, + "learning_rate": 0.003, + "loss": 4.2194, + "step": 2053 + }, + { + "epoch": 0.02054, + "grad_norm": 0.4939477792323304, + "learning_rate": 0.003, + "loss": 4.2591, + "step": 2054 + }, + { + "epoch": 0.02055, + "grad_norm": 0.5546078566627058, + "learning_rate": 0.003, + "loss": 4.2201, + "step": 2055 + }, + { + "epoch": 0.02056, + "grad_norm": 0.6366201965593732, + "learning_rate": 0.003, + "loss": 4.2231, + "step": 2056 + }, + { + "epoch": 0.02057, + "grad_norm": 0.6110086842051798, + "learning_rate": 0.003, + "loss": 4.2608, + "step": 2057 + }, + { + "epoch": 0.02058, + "grad_norm": 0.6217334659745377, + "learning_rate": 0.003, + "loss": 4.2367, + "step": 2058 + }, + { + "epoch": 0.02059, + "grad_norm": 0.651819516754762, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2059 + }, + { + "epoch": 0.0206, + "grad_norm": 0.5754276638558378, + "learning_rate": 0.003, + "loss": 4.2638, + "step": 2060 + }, + { + "epoch": 0.02061, + "grad_norm": 0.5646246506114238, + "learning_rate": 0.003, + "loss": 4.2372, + "step": 2061 + }, + { + "epoch": 0.02062, + "grad_norm": 0.5560224904172448, + "learning_rate": 0.003, + "loss": 4.2373, + "step": 2062 + }, + { + "epoch": 0.02063, + "grad_norm": 0.6303655478175813, + "learning_rate": 0.003, + "loss": 4.2352, + "step": 2063 + }, + { + "epoch": 0.02064, + "grad_norm": 0.7300283957670998, + "learning_rate": 0.003, + "loss": 4.2505, + "step": 2064 + }, + { + "epoch": 0.02065, + "grad_norm": 0.7574662177371257, + "learning_rate": 0.003, + "loss": 4.2359, + "step": 2065 + }, + { + "epoch": 0.02066, + "grad_norm": 0.7411303035814747, + "learning_rate": 0.003, + "loss": 4.2531, + "step": 2066 + }, + { + "epoch": 0.02067, + "grad_norm": 0.650869346843019, + "learning_rate": 0.003, + "loss": 4.2447, + "step": 2067 + }, + { + "epoch": 0.02068, + "grad_norm": 0.6552127487874416, + "learning_rate": 0.003, + "loss": 4.2306, + "step": 2068 + }, + { + "epoch": 0.02069, + "grad_norm": 0.592143853498531, + "learning_rate": 0.003, + "loss": 4.2516, + "step": 2069 + }, + { + "epoch": 0.0207, + "grad_norm": 0.6207350896242446, + "learning_rate": 0.003, + "loss": 4.2263, + "step": 2070 + }, + { + "epoch": 0.02071, + "grad_norm": 0.5661720697139818, + "learning_rate": 0.003, + "loss": 4.2543, + "step": 2071 + }, + { + "epoch": 0.02072, + "grad_norm": 0.5613150829964545, + "learning_rate": 0.003, + "loss": 4.2589, + "step": 2072 + }, + { + "epoch": 0.02073, + "grad_norm": 0.5913403223082936, + "learning_rate": 0.003, + "loss": 4.24, + "step": 2073 + }, + { + "epoch": 0.02074, + "grad_norm": 0.5851338988165525, + "learning_rate": 0.003, + "loss": 4.2458, + "step": 2074 + }, + { + "epoch": 0.02075, + "grad_norm": 0.6236167441400716, + "learning_rate": 0.003, + "loss": 4.2754, + "step": 2075 + }, + { + "epoch": 0.02076, + "grad_norm": 0.6671767173763447, + "learning_rate": 0.003, + "loss": 4.2696, + "step": 2076 + }, + { + "epoch": 0.02077, + "grad_norm": 0.7466794061160641, + "learning_rate": 0.003, + "loss": 4.2397, + "step": 2077 + }, + { + "epoch": 0.02078, + "grad_norm": 0.7293207627425712, + "learning_rate": 0.003, + "loss": 4.2415, + "step": 2078 + }, + { + "epoch": 0.02079, + "grad_norm": 0.7479234412446394, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 2079 + }, + { + "epoch": 0.0208, + "grad_norm": 0.8408782832743513, + "learning_rate": 0.003, + "loss": 4.2466, + "step": 2080 + }, + { + "epoch": 0.02081, + "grad_norm": 0.6858851728314246, + "learning_rate": 0.003, + "loss": 4.2727, + "step": 2081 + }, + { + "epoch": 0.02082, + "grad_norm": 0.5964291376338854, + "learning_rate": 0.003, + "loss": 4.2638, + "step": 2082 + }, + { + "epoch": 0.02083, + "grad_norm": 0.6428002164481011, + "learning_rate": 0.003, + "loss": 4.2628, + "step": 2083 + }, + { + "epoch": 0.02084, + "grad_norm": 0.5590580963015779, + "learning_rate": 0.003, + "loss": 4.2545, + "step": 2084 + }, + { + "epoch": 0.02085, + "grad_norm": 0.5709511558813687, + "learning_rate": 0.003, + "loss": 4.2275, + "step": 2085 + }, + { + "epoch": 0.02086, + "grad_norm": 0.49147509133747364, + "learning_rate": 0.003, + "loss": 4.2297, + "step": 2086 + }, + { + "epoch": 0.02087, + "grad_norm": 0.4526072737335104, + "learning_rate": 0.003, + "loss": 4.2294, + "step": 2087 + }, + { + "epoch": 0.02088, + "grad_norm": 0.4016974716447893, + "learning_rate": 0.003, + "loss": 4.2602, + "step": 2088 + }, + { + "epoch": 0.02089, + "grad_norm": 0.3822543479885703, + "learning_rate": 0.003, + "loss": 4.2249, + "step": 2089 + }, + { + "epoch": 0.0209, + "grad_norm": 0.36830581223819503, + "learning_rate": 0.003, + "loss": 4.2427, + "step": 2090 + }, + { + "epoch": 0.02091, + "grad_norm": 0.35577804274859626, + "learning_rate": 0.003, + "loss": 4.2285, + "step": 2091 + }, + { + "epoch": 0.02092, + "grad_norm": 0.4347176959080428, + "learning_rate": 0.003, + "loss": 4.2276, + "step": 2092 + }, + { + "epoch": 0.02093, + "grad_norm": 0.5304539612640176, + "learning_rate": 0.003, + "loss": 4.1974, + "step": 2093 + }, + { + "epoch": 0.02094, + "grad_norm": 0.7330731731573918, + "learning_rate": 0.003, + "loss": 4.2123, + "step": 2094 + }, + { + "epoch": 0.02095, + "grad_norm": 0.7847225937930644, + "learning_rate": 0.003, + "loss": 4.2645, + "step": 2095 + }, + { + "epoch": 0.02096, + "grad_norm": 0.6648124367791377, + "learning_rate": 0.003, + "loss": 4.2516, + "step": 2096 + }, + { + "epoch": 0.02097, + "grad_norm": 0.5924239660138091, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 2097 + }, + { + "epoch": 0.02098, + "grad_norm": 0.5602628841134751, + "learning_rate": 0.003, + "loss": 4.2465, + "step": 2098 + }, + { + "epoch": 0.02099, + "grad_norm": 0.46052580380046326, + "learning_rate": 0.003, + "loss": 4.2512, + "step": 2099 + }, + { + "epoch": 0.021, + "grad_norm": 0.4339841503760461, + "learning_rate": 0.003, + "loss": 4.263, + "step": 2100 + }, + { + "epoch": 0.02101, + "grad_norm": 0.3839438876111581, + "learning_rate": 0.003, + "loss": 4.2511, + "step": 2101 + }, + { + "epoch": 0.02102, + "grad_norm": 0.3754480167103612, + "learning_rate": 0.003, + "loss": 4.2531, + "step": 2102 + }, + { + "epoch": 0.02103, + "grad_norm": 0.4162207299096809, + "learning_rate": 0.003, + "loss": 4.2151, + "step": 2103 + }, + { + "epoch": 0.02104, + "grad_norm": 0.46199291536196674, + "learning_rate": 0.003, + "loss": 4.2197, + "step": 2104 + }, + { + "epoch": 0.02105, + "grad_norm": 0.5379532841395008, + "learning_rate": 0.003, + "loss": 4.2264, + "step": 2105 + }, + { + "epoch": 0.02106, + "grad_norm": 0.5254543741501657, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 2106 + }, + { + "epoch": 0.02107, + "grad_norm": 0.5366355457801288, + "learning_rate": 0.003, + "loss": 4.2328, + "step": 2107 + }, + { + "epoch": 0.02108, + "grad_norm": 0.6703887406069353, + "learning_rate": 0.003, + "loss": 4.2182, + "step": 2108 + }, + { + "epoch": 0.02109, + "grad_norm": 0.73797667148783, + "learning_rate": 0.003, + "loss": 4.2096, + "step": 2109 + }, + { + "epoch": 0.0211, + "grad_norm": 0.7306602624016222, + "learning_rate": 0.003, + "loss": 4.2289, + "step": 2110 + }, + { + "epoch": 0.02111, + "grad_norm": 0.7062809207206387, + "learning_rate": 0.003, + "loss": 4.2409, + "step": 2111 + }, + { + "epoch": 0.02112, + "grad_norm": 0.6626082871730016, + "learning_rate": 0.003, + "loss": 4.2378, + "step": 2112 + }, + { + "epoch": 0.02113, + "grad_norm": 0.7157807873847553, + "learning_rate": 0.003, + "loss": 4.238, + "step": 2113 + }, + { + "epoch": 0.02114, + "grad_norm": 0.7509167267520314, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2114 + }, + { + "epoch": 0.02115, + "grad_norm": 0.8631295846669226, + "learning_rate": 0.003, + "loss": 4.234, + "step": 2115 + }, + { + "epoch": 0.02116, + "grad_norm": 0.8818862519491879, + "learning_rate": 0.003, + "loss": 4.2687, + "step": 2116 + }, + { + "epoch": 0.02117, + "grad_norm": 0.9901874745713439, + "learning_rate": 0.003, + "loss": 4.2757, + "step": 2117 + }, + { + "epoch": 0.02118, + "grad_norm": 1.0129584193183192, + "learning_rate": 0.003, + "loss": 4.28, + "step": 2118 + }, + { + "epoch": 0.02119, + "grad_norm": 1.0428306836946426, + "learning_rate": 0.003, + "loss": 4.2999, + "step": 2119 + }, + { + "epoch": 0.0212, + "grad_norm": 0.8765906436767104, + "learning_rate": 0.003, + "loss": 4.2719, + "step": 2120 + }, + { + "epoch": 0.02121, + "grad_norm": 0.7948828861009111, + "learning_rate": 0.003, + "loss": 4.2627, + "step": 2121 + }, + { + "epoch": 0.02122, + "grad_norm": 0.7472865629342507, + "learning_rate": 0.003, + "loss": 4.2726, + "step": 2122 + }, + { + "epoch": 0.02123, + "grad_norm": 0.6385709948713884, + "learning_rate": 0.003, + "loss": 4.2438, + "step": 2123 + }, + { + "epoch": 0.02124, + "grad_norm": 0.557924731350531, + "learning_rate": 0.003, + "loss": 4.2611, + "step": 2124 + }, + { + "epoch": 0.02125, + "grad_norm": 0.5786579211625313, + "learning_rate": 0.003, + "loss": 4.2778, + "step": 2125 + }, + { + "epoch": 0.02126, + "grad_norm": 0.6027087958052927, + "learning_rate": 0.003, + "loss": 4.2718, + "step": 2126 + }, + { + "epoch": 0.02127, + "grad_norm": 0.5438113788648495, + "learning_rate": 0.003, + "loss": 4.2756, + "step": 2127 + }, + { + "epoch": 0.02128, + "grad_norm": 0.4557902210786888, + "learning_rate": 0.003, + "loss": 4.261, + "step": 2128 + }, + { + "epoch": 0.02129, + "grad_norm": 0.43397893502328583, + "learning_rate": 0.003, + "loss": 4.259, + "step": 2129 + }, + { + "epoch": 0.0213, + "grad_norm": 0.4555779798497891, + "learning_rate": 0.003, + "loss": 4.233, + "step": 2130 + }, + { + "epoch": 0.02131, + "grad_norm": 0.4779110278336917, + "learning_rate": 0.003, + "loss": 4.2569, + "step": 2131 + }, + { + "epoch": 0.02132, + "grad_norm": 0.5370327391448694, + "learning_rate": 0.003, + "loss": 4.2673, + "step": 2132 + }, + { + "epoch": 0.02133, + "grad_norm": 0.5952731573509253, + "learning_rate": 0.003, + "loss": 4.2603, + "step": 2133 + }, + { + "epoch": 0.02134, + "grad_norm": 0.570127740822763, + "learning_rate": 0.003, + "loss": 4.2602, + "step": 2134 + }, + { + "epoch": 0.02135, + "grad_norm": 0.4582370027457041, + "learning_rate": 0.003, + "loss": 4.2533, + "step": 2135 + }, + { + "epoch": 0.02136, + "grad_norm": 0.36641117706775855, + "learning_rate": 0.003, + "loss": 4.219, + "step": 2136 + }, + { + "epoch": 0.02137, + "grad_norm": 0.34800353584932325, + "learning_rate": 0.003, + "loss": 4.2578, + "step": 2137 + }, + { + "epoch": 0.02138, + "grad_norm": 0.30837616213325136, + "learning_rate": 0.003, + "loss": 4.2635, + "step": 2138 + }, + { + "epoch": 0.02139, + "grad_norm": 0.3564245359330032, + "learning_rate": 0.003, + "loss": 4.2543, + "step": 2139 + }, + { + "epoch": 0.0214, + "grad_norm": 0.35935426429256007, + "learning_rate": 0.003, + "loss": 4.2457, + "step": 2140 + }, + { + "epoch": 0.02141, + "grad_norm": 0.44450585444098134, + "learning_rate": 0.003, + "loss": 4.2278, + "step": 2141 + }, + { + "epoch": 0.02142, + "grad_norm": 0.5664462228803099, + "learning_rate": 0.003, + "loss": 4.2232, + "step": 2142 + }, + { + "epoch": 0.02143, + "grad_norm": 0.6979889941355174, + "learning_rate": 0.003, + "loss": 4.2444, + "step": 2143 + }, + { + "epoch": 0.02144, + "grad_norm": 0.6079500902139742, + "learning_rate": 0.003, + "loss": 4.2463, + "step": 2144 + }, + { + "epoch": 0.02145, + "grad_norm": 0.4705352279704145, + "learning_rate": 0.003, + "loss": 4.2234, + "step": 2145 + }, + { + "epoch": 0.02146, + "grad_norm": 0.6487033829979426, + "learning_rate": 0.003, + "loss": 4.242, + "step": 2146 + }, + { + "epoch": 0.02147, + "grad_norm": 0.646803097358189, + "learning_rate": 0.003, + "loss": 4.2643, + "step": 2147 + }, + { + "epoch": 0.02148, + "grad_norm": 0.5846288473404795, + "learning_rate": 0.003, + "loss": 4.2698, + "step": 2148 + }, + { + "epoch": 0.02149, + "grad_norm": 0.6324004546408943, + "learning_rate": 0.003, + "loss": 4.2592, + "step": 2149 + }, + { + "epoch": 0.0215, + "grad_norm": 0.6348968665767296, + "learning_rate": 0.003, + "loss": 4.2447, + "step": 2150 + }, + { + "epoch": 0.02151, + "grad_norm": 0.6038850039685164, + "learning_rate": 0.003, + "loss": 4.2496, + "step": 2151 + }, + { + "epoch": 0.02152, + "grad_norm": 0.6571231105206827, + "learning_rate": 0.003, + "loss": 4.229, + "step": 2152 + }, + { + "epoch": 0.02153, + "grad_norm": 0.5810533963397566, + "learning_rate": 0.003, + "loss": 4.2363, + "step": 2153 + }, + { + "epoch": 0.02154, + "grad_norm": 0.4956565749223054, + "learning_rate": 0.003, + "loss": 4.2226, + "step": 2154 + }, + { + "epoch": 0.02155, + "grad_norm": 0.437847806481028, + "learning_rate": 0.003, + "loss": 4.252, + "step": 2155 + }, + { + "epoch": 0.02156, + "grad_norm": 0.4180405852162412, + "learning_rate": 0.003, + "loss": 4.2371, + "step": 2156 + }, + { + "epoch": 0.02157, + "grad_norm": 0.3760963967155856, + "learning_rate": 0.003, + "loss": 4.2006, + "step": 2157 + }, + { + "epoch": 0.02158, + "grad_norm": 0.428171131535246, + "learning_rate": 0.003, + "loss": 4.2226, + "step": 2158 + }, + { + "epoch": 0.02159, + "grad_norm": 0.4821995287398208, + "learning_rate": 0.003, + "loss": 4.2389, + "step": 2159 + }, + { + "epoch": 0.0216, + "grad_norm": 0.6037348014921639, + "learning_rate": 0.003, + "loss": 4.2444, + "step": 2160 + }, + { + "epoch": 0.02161, + "grad_norm": 0.6609447933471442, + "learning_rate": 0.003, + "loss": 4.2362, + "step": 2161 + }, + { + "epoch": 0.02162, + "grad_norm": 0.6433252356071486, + "learning_rate": 0.003, + "loss": 4.2396, + "step": 2162 + }, + { + "epoch": 0.02163, + "grad_norm": 0.5709983086388406, + "learning_rate": 0.003, + "loss": 4.2528, + "step": 2163 + }, + { + "epoch": 0.02164, + "grad_norm": 0.6142803039373849, + "learning_rate": 0.003, + "loss": 4.2494, + "step": 2164 + }, + { + "epoch": 0.02165, + "grad_norm": 0.5872422750407742, + "learning_rate": 0.003, + "loss": 4.2104, + "step": 2165 + }, + { + "epoch": 0.02166, + "grad_norm": 0.5895046030386504, + "learning_rate": 0.003, + "loss": 4.2544, + "step": 2166 + }, + { + "epoch": 0.02167, + "grad_norm": 0.5695409441331826, + "learning_rate": 0.003, + "loss": 4.2313, + "step": 2167 + }, + { + "epoch": 0.02168, + "grad_norm": 0.5621779951550028, + "learning_rate": 0.003, + "loss": 4.2285, + "step": 2168 + }, + { + "epoch": 0.02169, + "grad_norm": 0.4890626633165933, + "learning_rate": 0.003, + "loss": 4.2251, + "step": 2169 + }, + { + "epoch": 0.0217, + "grad_norm": 0.5684142074498969, + "learning_rate": 0.003, + "loss": 4.2378, + "step": 2170 + }, + { + "epoch": 0.02171, + "grad_norm": 0.6068809167714128, + "learning_rate": 0.003, + "loss": 4.2505, + "step": 2171 + }, + { + "epoch": 0.02172, + "grad_norm": 0.5617111304615819, + "learning_rate": 0.003, + "loss": 4.2228, + "step": 2172 + }, + { + "epoch": 0.02173, + "grad_norm": 0.5652162783140336, + "learning_rate": 0.003, + "loss": 4.2246, + "step": 2173 + }, + { + "epoch": 0.02174, + "grad_norm": 0.6450142330375198, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 2174 + }, + { + "epoch": 0.02175, + "grad_norm": 0.6696169755702878, + "learning_rate": 0.003, + "loss": 4.2515, + "step": 2175 + }, + { + "epoch": 0.02176, + "grad_norm": 0.6614934153481808, + "learning_rate": 0.003, + "loss": 4.2347, + "step": 2176 + }, + { + "epoch": 0.02177, + "grad_norm": 0.7712758358266676, + "learning_rate": 0.003, + "loss": 4.2426, + "step": 2177 + }, + { + "epoch": 0.02178, + "grad_norm": 0.8052241964952243, + "learning_rate": 0.003, + "loss": 4.234, + "step": 2178 + }, + { + "epoch": 0.02179, + "grad_norm": 0.8190327773882939, + "learning_rate": 0.003, + "loss": 4.2474, + "step": 2179 + }, + { + "epoch": 0.0218, + "grad_norm": 0.7633412782608947, + "learning_rate": 0.003, + "loss": 4.2656, + "step": 2180 + }, + { + "epoch": 0.02181, + "grad_norm": 0.6487141963532598, + "learning_rate": 0.003, + "loss": 4.2624, + "step": 2181 + }, + { + "epoch": 0.02182, + "grad_norm": 0.5938256063465354, + "learning_rate": 0.003, + "loss": 4.2443, + "step": 2182 + }, + { + "epoch": 0.02183, + "grad_norm": 0.572941959470127, + "learning_rate": 0.003, + "loss": 4.2546, + "step": 2183 + }, + { + "epoch": 0.02184, + "grad_norm": 0.5472840158022778, + "learning_rate": 0.003, + "loss": 4.2258, + "step": 2184 + }, + { + "epoch": 0.02185, + "grad_norm": 0.48428509908133466, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 2185 + }, + { + "epoch": 0.02186, + "grad_norm": 0.41644305586561753, + "learning_rate": 0.003, + "loss": 4.2353, + "step": 2186 + }, + { + "epoch": 0.02187, + "grad_norm": 0.4220079190945348, + "learning_rate": 0.003, + "loss": 4.2314, + "step": 2187 + }, + { + "epoch": 0.02188, + "grad_norm": 0.4040099253154041, + "learning_rate": 0.003, + "loss": 4.2084, + "step": 2188 + }, + { + "epoch": 0.02189, + "grad_norm": 0.384791417229758, + "learning_rate": 0.003, + "loss": 4.2412, + "step": 2189 + }, + { + "epoch": 0.0219, + "grad_norm": 0.4200536352754627, + "learning_rate": 0.003, + "loss": 4.2271, + "step": 2190 + }, + { + "epoch": 0.02191, + "grad_norm": 0.4219256289842046, + "learning_rate": 0.003, + "loss": 4.2197, + "step": 2191 + }, + { + "epoch": 0.02192, + "grad_norm": 0.3796661113601938, + "learning_rate": 0.003, + "loss": 4.2468, + "step": 2192 + }, + { + "epoch": 0.02193, + "grad_norm": 0.3951332402646691, + "learning_rate": 0.003, + "loss": 4.2124, + "step": 2193 + }, + { + "epoch": 0.02194, + "grad_norm": 0.43785643693424503, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 2194 + }, + { + "epoch": 0.02195, + "grad_norm": 0.40101316917194174, + "learning_rate": 0.003, + "loss": 4.2286, + "step": 2195 + }, + { + "epoch": 0.02196, + "grad_norm": 0.3895866783766733, + "learning_rate": 0.003, + "loss": 4.2337, + "step": 2196 + }, + { + "epoch": 0.02197, + "grad_norm": 0.43978912346203514, + "learning_rate": 0.003, + "loss": 4.2283, + "step": 2197 + }, + { + "epoch": 0.02198, + "grad_norm": 0.5128410451451384, + "learning_rate": 0.003, + "loss": 4.2199, + "step": 2198 + }, + { + "epoch": 0.02199, + "grad_norm": 0.6337030003238012, + "learning_rate": 0.003, + "loss": 4.192, + "step": 2199 + }, + { + "epoch": 0.022, + "grad_norm": 0.8998218740769401, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2200 + }, + { + "epoch": 0.02201, + "grad_norm": 0.9431446506923681, + "learning_rate": 0.003, + "loss": 4.266, + "step": 2201 + }, + { + "epoch": 0.02202, + "grad_norm": 0.7612219282493126, + "learning_rate": 0.003, + "loss": 4.2433, + "step": 2202 + }, + { + "epoch": 0.02203, + "grad_norm": 0.6635684162527457, + "learning_rate": 0.003, + "loss": 4.2702, + "step": 2203 + }, + { + "epoch": 0.02204, + "grad_norm": 0.7621929732197255, + "learning_rate": 0.003, + "loss": 4.2688, + "step": 2204 + }, + { + "epoch": 0.02205, + "grad_norm": 0.8741388018917097, + "learning_rate": 0.003, + "loss": 4.2762, + "step": 2205 + }, + { + "epoch": 0.02206, + "grad_norm": 0.9703192513292367, + "learning_rate": 0.003, + "loss": 4.26, + "step": 2206 + }, + { + "epoch": 0.02207, + "grad_norm": 0.8841717513591386, + "learning_rate": 0.003, + "loss": 4.2677, + "step": 2207 + }, + { + "epoch": 0.02208, + "grad_norm": 0.988099891532076, + "learning_rate": 0.003, + "loss": 4.2632, + "step": 2208 + }, + { + "epoch": 0.02209, + "grad_norm": 1.1146750705911779, + "learning_rate": 0.003, + "loss": 4.3, + "step": 2209 + }, + { + "epoch": 0.0221, + "grad_norm": 0.9650886464683209, + "learning_rate": 0.003, + "loss": 4.2786, + "step": 2210 + }, + { + "epoch": 0.02211, + "grad_norm": 0.8938651751677317, + "learning_rate": 0.003, + "loss": 4.289, + "step": 2211 + }, + { + "epoch": 0.02212, + "grad_norm": 0.8205609056746809, + "learning_rate": 0.003, + "loss": 4.2861, + "step": 2212 + }, + { + "epoch": 0.02213, + "grad_norm": 0.8079312852613237, + "learning_rate": 0.003, + "loss": 4.2922, + "step": 2213 + }, + { + "epoch": 0.02214, + "grad_norm": 0.6624076409511721, + "learning_rate": 0.003, + "loss": 4.2812, + "step": 2214 + }, + { + "epoch": 0.02215, + "grad_norm": 0.6346284348339961, + "learning_rate": 0.003, + "loss": 4.2836, + "step": 2215 + }, + { + "epoch": 0.02216, + "grad_norm": 0.6751649712191042, + "learning_rate": 0.003, + "loss": 4.262, + "step": 2216 + }, + { + "epoch": 0.02217, + "grad_norm": 0.7610112582408145, + "learning_rate": 0.003, + "loss": 4.2609, + "step": 2217 + }, + { + "epoch": 0.02218, + "grad_norm": 0.7718759523371377, + "learning_rate": 0.003, + "loss": 4.2931, + "step": 2218 + }, + { + "epoch": 0.02219, + "grad_norm": 0.7213187726292779, + "learning_rate": 0.003, + "loss": 4.2813, + "step": 2219 + }, + { + "epoch": 0.0222, + "grad_norm": 0.6931461020270434, + "learning_rate": 0.003, + "loss": 4.2766, + "step": 2220 + }, + { + "epoch": 0.02221, + "grad_norm": 0.6335484790352024, + "learning_rate": 0.003, + "loss": 4.2612, + "step": 2221 + }, + { + "epoch": 0.02222, + "grad_norm": 0.5200624241333043, + "learning_rate": 0.003, + "loss": 4.2557, + "step": 2222 + }, + { + "epoch": 0.02223, + "grad_norm": 0.47510216931250304, + "learning_rate": 0.003, + "loss": 4.2633, + "step": 2223 + }, + { + "epoch": 0.02224, + "grad_norm": 0.4708153372606544, + "learning_rate": 0.003, + "loss": 4.2436, + "step": 2224 + }, + { + "epoch": 0.02225, + "grad_norm": 0.4713839636734649, + "learning_rate": 0.003, + "loss": 4.2455, + "step": 2225 + }, + { + "epoch": 0.02226, + "grad_norm": 0.45242642141513834, + "learning_rate": 0.003, + "loss": 4.2572, + "step": 2226 + }, + { + "epoch": 0.02227, + "grad_norm": 0.5403277615094669, + "learning_rate": 0.003, + "loss": 4.2506, + "step": 2227 + }, + { + "epoch": 0.02228, + "grad_norm": 0.5419608649795276, + "learning_rate": 0.003, + "loss": 4.2378, + "step": 2228 + }, + { + "epoch": 0.02229, + "grad_norm": 0.5074364347379132, + "learning_rate": 0.003, + "loss": 4.2163, + "step": 2229 + }, + { + "epoch": 0.0223, + "grad_norm": 0.44694138444366344, + "learning_rate": 0.003, + "loss": 4.2496, + "step": 2230 + }, + { + "epoch": 0.02231, + "grad_norm": 0.3954086546661809, + "learning_rate": 0.003, + "loss": 4.2523, + "step": 2231 + }, + { + "epoch": 0.02232, + "grad_norm": 0.38774210113182334, + "learning_rate": 0.003, + "loss": 4.2551, + "step": 2232 + }, + { + "epoch": 0.02233, + "grad_norm": 0.33836180367798796, + "learning_rate": 0.003, + "loss": 4.2207, + "step": 2233 + }, + { + "epoch": 0.02234, + "grad_norm": 0.4026067706239707, + "learning_rate": 0.003, + "loss": 4.2423, + "step": 2234 + }, + { + "epoch": 0.02235, + "grad_norm": 0.3714972499375364, + "learning_rate": 0.003, + "loss": 4.2633, + "step": 2235 + }, + { + "epoch": 0.02236, + "grad_norm": 0.3656117183652505, + "learning_rate": 0.003, + "loss": 4.2142, + "step": 2236 + }, + { + "epoch": 0.02237, + "grad_norm": 0.3860156575975231, + "learning_rate": 0.003, + "loss": 4.248, + "step": 2237 + }, + { + "epoch": 0.02238, + "grad_norm": 0.40106624326868706, + "learning_rate": 0.003, + "loss": 4.227, + "step": 2238 + }, + { + "epoch": 0.02239, + "grad_norm": 0.344415203397115, + "learning_rate": 0.003, + "loss": 4.2104, + "step": 2239 + }, + { + "epoch": 0.0224, + "grad_norm": 0.30398409326848724, + "learning_rate": 0.003, + "loss": 4.1926, + "step": 2240 + }, + { + "epoch": 0.02241, + "grad_norm": 0.29910610471963883, + "learning_rate": 0.003, + "loss": 4.2222, + "step": 2241 + }, + { + "epoch": 0.02242, + "grad_norm": 0.32765283542789003, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 2242 + }, + { + "epoch": 0.02243, + "grad_norm": 0.47068547305005354, + "learning_rate": 0.003, + "loss": 4.2397, + "step": 2243 + }, + { + "epoch": 0.02244, + "grad_norm": 0.619857801317153, + "learning_rate": 0.003, + "loss": 4.2209, + "step": 2244 + }, + { + "epoch": 0.02245, + "grad_norm": 0.7578137247550054, + "learning_rate": 0.003, + "loss": 4.214, + "step": 2245 + }, + { + "epoch": 0.02246, + "grad_norm": 0.7174200654723875, + "learning_rate": 0.003, + "loss": 4.2474, + "step": 2246 + }, + { + "epoch": 0.02247, + "grad_norm": 0.5627816069554801, + "learning_rate": 0.003, + "loss": 4.2147, + "step": 2247 + }, + { + "epoch": 0.02248, + "grad_norm": 0.645246341768817, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 2248 + }, + { + "epoch": 0.02249, + "grad_norm": 0.5894073242429686, + "learning_rate": 0.003, + "loss": 4.2178, + "step": 2249 + }, + { + "epoch": 0.0225, + "grad_norm": 0.4708614120277191, + "learning_rate": 0.003, + "loss": 4.2264, + "step": 2250 + }, + { + "epoch": 0.02251, + "grad_norm": 0.5732703006696422, + "learning_rate": 0.003, + "loss": 4.2516, + "step": 2251 + }, + { + "epoch": 0.02252, + "grad_norm": 0.4876319361460763, + "learning_rate": 0.003, + "loss": 4.2397, + "step": 2252 + }, + { + "epoch": 0.02253, + "grad_norm": 0.4518791398680973, + "learning_rate": 0.003, + "loss": 4.2182, + "step": 2253 + }, + { + "epoch": 0.02254, + "grad_norm": 0.4686604716989295, + "learning_rate": 0.003, + "loss": 4.2242, + "step": 2254 + }, + { + "epoch": 0.02255, + "grad_norm": 0.46315510081369105, + "learning_rate": 0.003, + "loss": 4.2076, + "step": 2255 + }, + { + "epoch": 0.02256, + "grad_norm": 0.4115888436676222, + "learning_rate": 0.003, + "loss": 4.196, + "step": 2256 + }, + { + "epoch": 0.02257, + "grad_norm": 0.4250881653478155, + "learning_rate": 0.003, + "loss": 4.213, + "step": 2257 + }, + { + "epoch": 0.02258, + "grad_norm": 0.41687848922253873, + "learning_rate": 0.003, + "loss": 4.2225, + "step": 2258 + }, + { + "epoch": 0.02259, + "grad_norm": 0.42540602594527765, + "learning_rate": 0.003, + "loss": 4.1835, + "step": 2259 + }, + { + "epoch": 0.0226, + "grad_norm": 0.4465875569855491, + "learning_rate": 0.003, + "loss": 4.2393, + "step": 2260 + }, + { + "epoch": 0.02261, + "grad_norm": 0.4515532217997496, + "learning_rate": 0.003, + "loss": 4.2163, + "step": 2261 + }, + { + "epoch": 0.02262, + "grad_norm": 0.47647870630071837, + "learning_rate": 0.003, + "loss": 4.2167, + "step": 2262 + }, + { + "epoch": 0.02263, + "grad_norm": 0.5731324666338504, + "learning_rate": 0.003, + "loss": 4.2339, + "step": 2263 + }, + { + "epoch": 0.02264, + "grad_norm": 0.5964954733768894, + "learning_rate": 0.003, + "loss": 4.2281, + "step": 2264 + }, + { + "epoch": 0.02265, + "grad_norm": 0.5712613063923222, + "learning_rate": 0.003, + "loss": 4.1855, + "step": 2265 + }, + { + "epoch": 0.02266, + "grad_norm": 0.5312684023721527, + "learning_rate": 0.003, + "loss": 4.2007, + "step": 2266 + }, + { + "epoch": 0.02267, + "grad_norm": 0.5398877397723375, + "learning_rate": 0.003, + "loss": 4.2294, + "step": 2267 + }, + { + "epoch": 0.02268, + "grad_norm": 0.6097072952396848, + "learning_rate": 0.003, + "loss": 4.2058, + "step": 2268 + }, + { + "epoch": 0.02269, + "grad_norm": 0.667945707294233, + "learning_rate": 0.003, + "loss": 4.2339, + "step": 2269 + }, + { + "epoch": 0.0227, + "grad_norm": 0.6780050185078733, + "learning_rate": 0.003, + "loss": 4.23, + "step": 2270 + }, + { + "epoch": 0.02271, + "grad_norm": 0.6140341865228114, + "learning_rate": 0.003, + "loss": 4.2328, + "step": 2271 + }, + { + "epoch": 0.02272, + "grad_norm": 0.6077740705952491, + "learning_rate": 0.003, + "loss": 4.2343, + "step": 2272 + }, + { + "epoch": 0.02273, + "grad_norm": 0.6228614859955112, + "learning_rate": 0.003, + "loss": 4.237, + "step": 2273 + }, + { + "epoch": 0.02274, + "grad_norm": 0.6497689324113504, + "learning_rate": 0.003, + "loss": 4.2221, + "step": 2274 + }, + { + "epoch": 0.02275, + "grad_norm": 0.6593608950407358, + "learning_rate": 0.003, + "loss": 4.2201, + "step": 2275 + }, + { + "epoch": 0.02276, + "grad_norm": 0.6819172040141601, + "learning_rate": 0.003, + "loss": 4.2138, + "step": 2276 + }, + { + "epoch": 0.02277, + "grad_norm": 0.7620205189580369, + "learning_rate": 0.003, + "loss": 4.2321, + "step": 2277 + }, + { + "epoch": 0.02278, + "grad_norm": 0.622995687715391, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2278 + }, + { + "epoch": 0.02279, + "grad_norm": 0.5747298625450237, + "learning_rate": 0.003, + "loss": 4.2411, + "step": 2279 + }, + { + "epoch": 0.0228, + "grad_norm": 0.5988670118192916, + "learning_rate": 0.003, + "loss": 4.2313, + "step": 2280 + }, + { + "epoch": 0.02281, + "grad_norm": 0.611381401213052, + "learning_rate": 0.003, + "loss": 4.2272, + "step": 2281 + }, + { + "epoch": 0.02282, + "grad_norm": 0.5846556638089334, + "learning_rate": 0.003, + "loss": 4.2409, + "step": 2282 + }, + { + "epoch": 0.02283, + "grad_norm": 0.5475554139402415, + "learning_rate": 0.003, + "loss": 4.2536, + "step": 2283 + }, + { + "epoch": 0.02284, + "grad_norm": 0.5238952449646747, + "learning_rate": 0.003, + "loss": 4.2482, + "step": 2284 + }, + { + "epoch": 0.02285, + "grad_norm": 0.5449063604054842, + "learning_rate": 0.003, + "loss": 4.1851, + "step": 2285 + }, + { + "epoch": 0.02286, + "grad_norm": 0.5626629932890803, + "learning_rate": 0.003, + "loss": 4.25, + "step": 2286 + }, + { + "epoch": 0.02287, + "grad_norm": 0.5354326845671384, + "learning_rate": 0.003, + "loss": 4.2276, + "step": 2287 + }, + { + "epoch": 0.02288, + "grad_norm": 0.5865237537888423, + "learning_rate": 0.003, + "loss": 4.2517, + "step": 2288 + }, + { + "epoch": 0.02289, + "grad_norm": 0.6587817447859279, + "learning_rate": 0.003, + "loss": 4.2552, + "step": 2289 + }, + { + "epoch": 0.0229, + "grad_norm": 0.763374859559503, + "learning_rate": 0.003, + "loss": 4.2369, + "step": 2290 + }, + { + "epoch": 0.02291, + "grad_norm": 0.9238217255206519, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2291 + }, + { + "epoch": 0.02292, + "grad_norm": 0.8375515673022389, + "learning_rate": 0.003, + "loss": 4.2276, + "step": 2292 + }, + { + "epoch": 0.02293, + "grad_norm": 0.8229318263274273, + "learning_rate": 0.003, + "loss": 4.233, + "step": 2293 + }, + { + "epoch": 0.02294, + "grad_norm": 0.767668173749955, + "learning_rate": 0.003, + "loss": 4.2363, + "step": 2294 + }, + { + "epoch": 0.02295, + "grad_norm": 0.728858935661346, + "learning_rate": 0.003, + "loss": 4.2296, + "step": 2295 + }, + { + "epoch": 0.02296, + "grad_norm": 0.74447993660391, + "learning_rate": 0.003, + "loss": 4.2662, + "step": 2296 + }, + { + "epoch": 0.02297, + "grad_norm": 0.6968342842354955, + "learning_rate": 0.003, + "loss": 4.2327, + "step": 2297 + }, + { + "epoch": 0.02298, + "grad_norm": 0.6411785233315757, + "learning_rate": 0.003, + "loss": 4.2441, + "step": 2298 + }, + { + "epoch": 0.02299, + "grad_norm": 0.6765897982497444, + "learning_rate": 0.003, + "loss": 4.2472, + "step": 2299 + }, + { + "epoch": 0.023, + "grad_norm": 0.7218316432570615, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 2300 + }, + { + "epoch": 0.02301, + "grad_norm": 0.6693972300748606, + "learning_rate": 0.003, + "loss": 4.2316, + "step": 2301 + }, + { + "epoch": 0.02302, + "grad_norm": 0.6521855282359511, + "learning_rate": 0.003, + "loss": 4.2347, + "step": 2302 + }, + { + "epoch": 0.02303, + "grad_norm": 0.5919587845688344, + "learning_rate": 0.003, + "loss": 4.224, + "step": 2303 + }, + { + "epoch": 0.02304, + "grad_norm": 0.602764977489394, + "learning_rate": 0.003, + "loss": 4.2562, + "step": 2304 + }, + { + "epoch": 0.02305, + "grad_norm": 0.5558668601822961, + "learning_rate": 0.003, + "loss": 4.2313, + "step": 2305 + }, + { + "epoch": 0.02306, + "grad_norm": 0.5156139404377212, + "learning_rate": 0.003, + "loss": 4.2058, + "step": 2306 + }, + { + "epoch": 0.02307, + "grad_norm": 0.5662595240344765, + "learning_rate": 0.003, + "loss": 4.2364, + "step": 2307 + }, + { + "epoch": 0.02308, + "grad_norm": 0.6173971988602593, + "learning_rate": 0.003, + "loss": 4.2276, + "step": 2308 + }, + { + "epoch": 0.02309, + "grad_norm": 0.7369887546423364, + "learning_rate": 0.003, + "loss": 4.261, + "step": 2309 + }, + { + "epoch": 0.0231, + "grad_norm": 0.7967301688365783, + "learning_rate": 0.003, + "loss": 4.2446, + "step": 2310 + }, + { + "epoch": 0.02311, + "grad_norm": 0.6089871818858962, + "learning_rate": 0.003, + "loss": 4.245, + "step": 2311 + }, + { + "epoch": 0.02312, + "grad_norm": 0.47586710546753136, + "learning_rate": 0.003, + "loss": 4.2111, + "step": 2312 + }, + { + "epoch": 0.02313, + "grad_norm": 0.4974927545735252, + "learning_rate": 0.003, + "loss": 4.2079, + "step": 2313 + }, + { + "epoch": 0.02314, + "grad_norm": 0.43769519573127175, + "learning_rate": 0.003, + "loss": 4.2204, + "step": 2314 + }, + { + "epoch": 0.02315, + "grad_norm": 0.401786726516975, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2315 + }, + { + "epoch": 0.02316, + "grad_norm": 0.4259802780968188, + "learning_rate": 0.003, + "loss": 4.2341, + "step": 2316 + }, + { + "epoch": 0.02317, + "grad_norm": 0.41182452876226056, + "learning_rate": 0.003, + "loss": 4.2152, + "step": 2317 + }, + { + "epoch": 0.02318, + "grad_norm": 0.4136233759664175, + "learning_rate": 0.003, + "loss": 4.2028, + "step": 2318 + }, + { + "epoch": 0.02319, + "grad_norm": 0.3949029424988165, + "learning_rate": 0.003, + "loss": 4.2247, + "step": 2319 + }, + { + "epoch": 0.0232, + "grad_norm": 0.35504551442283205, + "learning_rate": 0.003, + "loss": 4.2312, + "step": 2320 + }, + { + "epoch": 0.02321, + "grad_norm": 0.32986956749591967, + "learning_rate": 0.003, + "loss": 4.2084, + "step": 2321 + }, + { + "epoch": 0.02322, + "grad_norm": 0.2968681197139734, + "learning_rate": 0.003, + "loss": 4.2156, + "step": 2322 + }, + { + "epoch": 0.02323, + "grad_norm": 0.3207801190374275, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 2323 + }, + { + "epoch": 0.02324, + "grad_norm": 0.33142643285723733, + "learning_rate": 0.003, + "loss": 4.1895, + "step": 2324 + }, + { + "epoch": 0.02325, + "grad_norm": 0.3535725697371811, + "learning_rate": 0.003, + "loss": 4.215, + "step": 2325 + }, + { + "epoch": 0.02326, + "grad_norm": 0.3678851746936938, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 2326 + }, + { + "epoch": 0.02327, + "grad_norm": 0.4101658150492574, + "learning_rate": 0.003, + "loss": 4.2161, + "step": 2327 + }, + { + "epoch": 0.02328, + "grad_norm": 0.4519738717854165, + "learning_rate": 0.003, + "loss": 4.2155, + "step": 2328 + }, + { + "epoch": 0.02329, + "grad_norm": 0.5647347753978986, + "learning_rate": 0.003, + "loss": 4.2089, + "step": 2329 + }, + { + "epoch": 0.0233, + "grad_norm": 0.6839573886066393, + "learning_rate": 0.003, + "loss": 4.2105, + "step": 2330 + }, + { + "epoch": 0.02331, + "grad_norm": 0.7998107409261247, + "learning_rate": 0.003, + "loss": 4.2543, + "step": 2331 + }, + { + "epoch": 0.02332, + "grad_norm": 0.689815752941326, + "learning_rate": 0.003, + "loss": 4.2161, + "step": 2332 + }, + { + "epoch": 0.02333, + "grad_norm": 0.6555251301313495, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2333 + }, + { + "epoch": 0.02334, + "grad_norm": 0.6263058920174612, + "learning_rate": 0.003, + "loss": 4.1943, + "step": 2334 + }, + { + "epoch": 0.02335, + "grad_norm": 0.5873514688131092, + "learning_rate": 0.003, + "loss": 4.2307, + "step": 2335 + }, + { + "epoch": 0.02336, + "grad_norm": 0.6178829826634938, + "learning_rate": 0.003, + "loss": 4.2245, + "step": 2336 + }, + { + "epoch": 0.02337, + "grad_norm": 0.6776239992532147, + "learning_rate": 0.003, + "loss": 4.2338, + "step": 2337 + }, + { + "epoch": 0.02338, + "grad_norm": 0.6079153730498182, + "learning_rate": 0.003, + "loss": 4.2143, + "step": 2338 + }, + { + "epoch": 0.02339, + "grad_norm": 0.5540805388644168, + "learning_rate": 0.003, + "loss": 4.2097, + "step": 2339 + }, + { + "epoch": 0.0234, + "grad_norm": 0.4930522285796136, + "learning_rate": 0.003, + "loss": 4.2219, + "step": 2340 + }, + { + "epoch": 0.02341, + "grad_norm": 0.5114159866478617, + "learning_rate": 0.003, + "loss": 4.198, + "step": 2341 + }, + { + "epoch": 0.02342, + "grad_norm": 0.5088489673837915, + "learning_rate": 0.003, + "loss": 4.212, + "step": 2342 + }, + { + "epoch": 0.02343, + "grad_norm": 0.5701232150213591, + "learning_rate": 0.003, + "loss": 4.2136, + "step": 2343 + }, + { + "epoch": 0.02344, + "grad_norm": 0.5264496634765071, + "learning_rate": 0.003, + "loss": 4.2103, + "step": 2344 + }, + { + "epoch": 0.02345, + "grad_norm": 0.463092136026227, + "learning_rate": 0.003, + "loss": 4.2037, + "step": 2345 + }, + { + "epoch": 0.02346, + "grad_norm": 0.5119464591082599, + "learning_rate": 0.003, + "loss": 4.2041, + "step": 2346 + }, + { + "epoch": 0.02347, + "grad_norm": 0.5680858879896561, + "learning_rate": 0.003, + "loss": 4.2412, + "step": 2347 + }, + { + "epoch": 0.02348, + "grad_norm": 0.6575493478001748, + "learning_rate": 0.003, + "loss": 4.2098, + "step": 2348 + }, + { + "epoch": 0.02349, + "grad_norm": 0.7446709988793854, + "learning_rate": 0.003, + "loss": 4.2329, + "step": 2349 + }, + { + "epoch": 0.0235, + "grad_norm": 0.9616447862037343, + "learning_rate": 0.003, + "loss": 4.2417, + "step": 2350 + }, + { + "epoch": 0.02351, + "grad_norm": 0.892365430816041, + "learning_rate": 0.003, + "loss": 4.2567, + "step": 2351 + }, + { + "epoch": 0.02352, + "grad_norm": 0.7434115511547027, + "learning_rate": 0.003, + "loss": 4.2055, + "step": 2352 + }, + { + "epoch": 0.02353, + "grad_norm": 0.7750549493995498, + "learning_rate": 0.003, + "loss": 4.242, + "step": 2353 + }, + { + "epoch": 0.02354, + "grad_norm": 0.7464193045182832, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 2354 + }, + { + "epoch": 0.02355, + "grad_norm": 0.6497010807293944, + "learning_rate": 0.003, + "loss": 4.2393, + "step": 2355 + }, + { + "epoch": 0.02356, + "grad_norm": 0.6914197602121488, + "learning_rate": 0.003, + "loss": 4.2239, + "step": 2356 + }, + { + "epoch": 0.02357, + "grad_norm": 0.6846425351668688, + "learning_rate": 0.003, + "loss": 4.2295, + "step": 2357 + }, + { + "epoch": 0.02358, + "grad_norm": 0.6290127860120783, + "learning_rate": 0.003, + "loss": 4.2459, + "step": 2358 + }, + { + "epoch": 0.02359, + "grad_norm": 0.6298376689313879, + "learning_rate": 0.003, + "loss": 4.2228, + "step": 2359 + }, + { + "epoch": 0.0236, + "grad_norm": 0.5166523858154795, + "learning_rate": 0.003, + "loss": 4.256, + "step": 2360 + }, + { + "epoch": 0.02361, + "grad_norm": 0.5392240288462758, + "learning_rate": 0.003, + "loss": 4.2371, + "step": 2361 + }, + { + "epoch": 0.02362, + "grad_norm": 0.46488034590567795, + "learning_rate": 0.003, + "loss": 4.2246, + "step": 2362 + }, + { + "epoch": 0.02363, + "grad_norm": 0.45695605203462375, + "learning_rate": 0.003, + "loss": 4.1978, + "step": 2363 + }, + { + "epoch": 0.02364, + "grad_norm": 0.450417146973962, + "learning_rate": 0.003, + "loss": 4.2171, + "step": 2364 + }, + { + "epoch": 0.02365, + "grad_norm": 0.46003087118242425, + "learning_rate": 0.003, + "loss": 4.236, + "step": 2365 + }, + { + "epoch": 0.02366, + "grad_norm": 0.48662211661939403, + "learning_rate": 0.003, + "loss": 4.2096, + "step": 2366 + }, + { + "epoch": 0.02367, + "grad_norm": 0.5645419223124684, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2367 + }, + { + "epoch": 0.02368, + "grad_norm": 0.5895187996336625, + "learning_rate": 0.003, + "loss": 4.1956, + "step": 2368 + }, + { + "epoch": 0.02369, + "grad_norm": 0.6664022667321152, + "learning_rate": 0.003, + "loss": 4.211, + "step": 2369 + }, + { + "epoch": 0.0237, + "grad_norm": 0.6992460452789251, + "learning_rate": 0.003, + "loss": 4.241, + "step": 2370 + }, + { + "epoch": 0.02371, + "grad_norm": 0.6591985126074834, + "learning_rate": 0.003, + "loss": 4.223, + "step": 2371 + }, + { + "epoch": 0.02372, + "grad_norm": 0.5265059072138487, + "learning_rate": 0.003, + "loss": 4.2082, + "step": 2372 + }, + { + "epoch": 0.02373, + "grad_norm": 0.4951201634347875, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 2373 + }, + { + "epoch": 0.02374, + "grad_norm": 0.4876679210846313, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 2374 + }, + { + "epoch": 0.02375, + "grad_norm": 0.408164979920852, + "learning_rate": 0.003, + "loss": 4.2017, + "step": 2375 + }, + { + "epoch": 0.02376, + "grad_norm": 0.3895969391943079, + "learning_rate": 0.003, + "loss": 4.2257, + "step": 2376 + }, + { + "epoch": 0.02377, + "grad_norm": 0.4053428947086922, + "learning_rate": 0.003, + "loss": 4.1923, + "step": 2377 + }, + { + "epoch": 0.02378, + "grad_norm": 0.43012507081905327, + "learning_rate": 0.003, + "loss": 4.2019, + "step": 2378 + }, + { + "epoch": 0.02379, + "grad_norm": 0.4764061060377796, + "learning_rate": 0.003, + "loss": 4.1906, + "step": 2379 + }, + { + "epoch": 0.0238, + "grad_norm": 0.4854688158872303, + "learning_rate": 0.003, + "loss": 4.1642, + "step": 2380 + }, + { + "epoch": 0.02381, + "grad_norm": 0.4926226608113921, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 2381 + }, + { + "epoch": 0.02382, + "grad_norm": 0.5715519289577061, + "learning_rate": 0.003, + "loss": 4.2369, + "step": 2382 + }, + { + "epoch": 0.02383, + "grad_norm": 0.5852310729435649, + "learning_rate": 0.003, + "loss": 4.2016, + "step": 2383 + }, + { + "epoch": 0.02384, + "grad_norm": 0.5027933039192362, + "learning_rate": 0.003, + "loss": 4.186, + "step": 2384 + }, + { + "epoch": 0.02385, + "grad_norm": 0.5671992764095898, + "learning_rate": 0.003, + "loss": 4.2151, + "step": 2385 + }, + { + "epoch": 0.02386, + "grad_norm": 0.5716874891458956, + "learning_rate": 0.003, + "loss": 4.1952, + "step": 2386 + }, + { + "epoch": 0.02387, + "grad_norm": 0.5519898067526273, + "learning_rate": 0.003, + "loss": 4.1919, + "step": 2387 + }, + { + "epoch": 0.02388, + "grad_norm": 0.5749264518166138, + "learning_rate": 0.003, + "loss": 4.2477, + "step": 2388 + }, + { + "epoch": 0.02389, + "grad_norm": 0.6599062653773942, + "learning_rate": 0.003, + "loss": 4.2112, + "step": 2389 + }, + { + "epoch": 0.0239, + "grad_norm": 0.8007810046111682, + "learning_rate": 0.003, + "loss": 4.2104, + "step": 2390 + }, + { + "epoch": 0.02391, + "grad_norm": 0.7728699720546903, + "learning_rate": 0.003, + "loss": 4.2355, + "step": 2391 + }, + { + "epoch": 0.02392, + "grad_norm": 0.637372032061665, + "learning_rate": 0.003, + "loss": 4.2375, + "step": 2392 + }, + { + "epoch": 0.02393, + "grad_norm": 0.5856837925361781, + "learning_rate": 0.003, + "loss": 4.1967, + "step": 2393 + }, + { + "epoch": 0.02394, + "grad_norm": 0.5796441189535547, + "learning_rate": 0.003, + "loss": 4.2107, + "step": 2394 + }, + { + "epoch": 0.02395, + "grad_norm": 0.49276837104854343, + "learning_rate": 0.003, + "loss": 4.2217, + "step": 2395 + }, + { + "epoch": 0.02396, + "grad_norm": 0.4942734378430775, + "learning_rate": 0.003, + "loss": 4.2363, + "step": 2396 + }, + { + "epoch": 0.02397, + "grad_norm": 0.4893677735684375, + "learning_rate": 0.003, + "loss": 4.2162, + "step": 2397 + }, + { + "epoch": 0.02398, + "grad_norm": 0.5811783701919775, + "learning_rate": 0.003, + "loss": 4.2259, + "step": 2398 + }, + { + "epoch": 0.02399, + "grad_norm": 0.7848440120318722, + "learning_rate": 0.003, + "loss": 4.2123, + "step": 2399 + }, + { + "epoch": 0.024, + "grad_norm": 1.102610977529852, + "learning_rate": 0.003, + "loss": 4.2476, + "step": 2400 + }, + { + "epoch": 0.02401, + "grad_norm": 0.8135265365179677, + "learning_rate": 0.003, + "loss": 4.215, + "step": 2401 + }, + { + "epoch": 0.02402, + "grad_norm": 0.583172308220775, + "learning_rate": 0.003, + "loss": 4.24, + "step": 2402 + }, + { + "epoch": 0.02403, + "grad_norm": 0.5891421965222777, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 2403 + }, + { + "epoch": 0.02404, + "grad_norm": 0.5648336236466582, + "learning_rate": 0.003, + "loss": 4.2446, + "step": 2404 + }, + { + "epoch": 0.02405, + "grad_norm": 0.5745395036103889, + "learning_rate": 0.003, + "loss": 4.2523, + "step": 2405 + }, + { + "epoch": 0.02406, + "grad_norm": 0.5533135454038, + "learning_rate": 0.003, + "loss": 4.2331, + "step": 2406 + }, + { + "epoch": 0.02407, + "grad_norm": 0.49929019237957983, + "learning_rate": 0.003, + "loss": 4.2073, + "step": 2407 + }, + { + "epoch": 0.02408, + "grad_norm": 0.4943118983366084, + "learning_rate": 0.003, + "loss": 4.2144, + "step": 2408 + }, + { + "epoch": 0.02409, + "grad_norm": 0.5449234611486594, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2409 + }, + { + "epoch": 0.0241, + "grad_norm": 0.585465535756614, + "learning_rate": 0.003, + "loss": 4.2358, + "step": 2410 + }, + { + "epoch": 0.02411, + "grad_norm": 0.6698135788102235, + "learning_rate": 0.003, + "loss": 4.2088, + "step": 2411 + }, + { + "epoch": 0.02412, + "grad_norm": 0.7369339920597645, + "learning_rate": 0.003, + "loss": 4.217, + "step": 2412 + }, + { + "epoch": 0.02413, + "grad_norm": 0.7305355993897886, + "learning_rate": 0.003, + "loss": 4.2166, + "step": 2413 + }, + { + "epoch": 0.02414, + "grad_norm": 0.7556720083696307, + "learning_rate": 0.003, + "loss": 4.2325, + "step": 2414 + }, + { + "epoch": 0.02415, + "grad_norm": 0.6579993879979832, + "learning_rate": 0.003, + "loss": 4.2326, + "step": 2415 + }, + { + "epoch": 0.02416, + "grad_norm": 0.6108411981053206, + "learning_rate": 0.003, + "loss": 4.2107, + "step": 2416 + }, + { + "epoch": 0.02417, + "grad_norm": 0.5049292600507906, + "learning_rate": 0.003, + "loss": 4.2353, + "step": 2417 + }, + { + "epoch": 0.02418, + "grad_norm": 0.4900011615352856, + "learning_rate": 0.003, + "loss": 4.2259, + "step": 2418 + }, + { + "epoch": 0.02419, + "grad_norm": 0.4521560630393701, + "learning_rate": 0.003, + "loss": 4.2185, + "step": 2419 + }, + { + "epoch": 0.0242, + "grad_norm": 0.4620681001428492, + "learning_rate": 0.003, + "loss": 4.2321, + "step": 2420 + }, + { + "epoch": 0.02421, + "grad_norm": 0.4916389353511205, + "learning_rate": 0.003, + "loss": 4.1947, + "step": 2421 + }, + { + "epoch": 0.02422, + "grad_norm": 0.5425187775211936, + "learning_rate": 0.003, + "loss": 4.2503, + "step": 2422 + }, + { + "epoch": 0.02423, + "grad_norm": 0.6038360240873328, + "learning_rate": 0.003, + "loss": 4.2434, + "step": 2423 + }, + { + "epoch": 0.02424, + "grad_norm": 0.6325505331996517, + "learning_rate": 0.003, + "loss": 4.2388, + "step": 2424 + }, + { + "epoch": 0.02425, + "grad_norm": 0.5934465012445626, + "learning_rate": 0.003, + "loss": 4.2245, + "step": 2425 + }, + { + "epoch": 0.02426, + "grad_norm": 0.4852317425419543, + "learning_rate": 0.003, + "loss": 4.2259, + "step": 2426 + }, + { + "epoch": 0.02427, + "grad_norm": 0.43424836675618983, + "learning_rate": 0.003, + "loss": 4.2507, + "step": 2427 + }, + { + "epoch": 0.02428, + "grad_norm": 0.42558088050674436, + "learning_rate": 0.003, + "loss": 4.2206, + "step": 2428 + }, + { + "epoch": 0.02429, + "grad_norm": 0.4007132170531262, + "learning_rate": 0.003, + "loss": 4.2052, + "step": 2429 + }, + { + "epoch": 0.0243, + "grad_norm": 0.41461133903064823, + "learning_rate": 0.003, + "loss": 4.2277, + "step": 2430 + }, + { + "epoch": 0.02431, + "grad_norm": 0.42272490135521923, + "learning_rate": 0.003, + "loss": 4.2037, + "step": 2431 + }, + { + "epoch": 0.02432, + "grad_norm": 0.5028259204616563, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 2432 + }, + { + "epoch": 0.02433, + "grad_norm": 0.6593220419533437, + "learning_rate": 0.003, + "loss": 4.2479, + "step": 2433 + }, + { + "epoch": 0.02434, + "grad_norm": 0.8027427099690023, + "learning_rate": 0.003, + "loss": 4.2219, + "step": 2434 + }, + { + "epoch": 0.02435, + "grad_norm": 0.8024207699638355, + "learning_rate": 0.003, + "loss": 4.2281, + "step": 2435 + }, + { + "epoch": 0.02436, + "grad_norm": 0.7133615297883351, + "learning_rate": 0.003, + "loss": 4.2314, + "step": 2436 + }, + { + "epoch": 0.02437, + "grad_norm": 0.6439666201166401, + "learning_rate": 0.003, + "loss": 4.2142, + "step": 2437 + }, + { + "epoch": 0.02438, + "grad_norm": 0.5858625518676239, + "learning_rate": 0.003, + "loss": 4.23, + "step": 2438 + }, + { + "epoch": 0.02439, + "grad_norm": 0.5398100187318233, + "learning_rate": 0.003, + "loss": 4.1977, + "step": 2439 + }, + { + "epoch": 0.0244, + "grad_norm": 0.5287852378026328, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 2440 + }, + { + "epoch": 0.02441, + "grad_norm": 0.5446152594519079, + "learning_rate": 0.003, + "loss": 4.2079, + "step": 2441 + }, + { + "epoch": 0.02442, + "grad_norm": 0.5050019658896749, + "learning_rate": 0.003, + "loss": 4.2415, + "step": 2442 + }, + { + "epoch": 0.02443, + "grad_norm": 0.5178076569823932, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 2443 + }, + { + "epoch": 0.02444, + "grad_norm": 0.48889067635139094, + "learning_rate": 0.003, + "loss": 4.2045, + "step": 2444 + }, + { + "epoch": 0.02445, + "grad_norm": 0.5068159078989423, + "learning_rate": 0.003, + "loss": 4.2106, + "step": 2445 + }, + { + "epoch": 0.02446, + "grad_norm": 0.5126341533127841, + "learning_rate": 0.003, + "loss": 4.2121, + "step": 2446 + }, + { + "epoch": 0.02447, + "grad_norm": 0.4825218100648375, + "learning_rate": 0.003, + "loss": 4.2174, + "step": 2447 + }, + { + "epoch": 0.02448, + "grad_norm": 0.46377191023349634, + "learning_rate": 0.003, + "loss": 4.2169, + "step": 2448 + }, + { + "epoch": 0.02449, + "grad_norm": 0.5086550488721309, + "learning_rate": 0.003, + "loss": 4.2116, + "step": 2449 + }, + { + "epoch": 0.0245, + "grad_norm": 0.5740571687783603, + "learning_rate": 0.003, + "loss": 4.2159, + "step": 2450 + }, + { + "epoch": 0.02451, + "grad_norm": 0.5066757455693393, + "learning_rate": 0.003, + "loss": 4.1927, + "step": 2451 + }, + { + "epoch": 0.02452, + "grad_norm": 0.40863119709622003, + "learning_rate": 0.003, + "loss": 4.1968, + "step": 2452 + }, + { + "epoch": 0.02453, + "grad_norm": 0.44965288153972177, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 2453 + }, + { + "epoch": 0.02454, + "grad_norm": 0.5324108231499827, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 2454 + }, + { + "epoch": 0.02455, + "grad_norm": 0.6445398719560812, + "learning_rate": 0.003, + "loss": 4.1993, + "step": 2455 + }, + { + "epoch": 0.02456, + "grad_norm": 0.778972240900702, + "learning_rate": 0.003, + "loss": 4.1741, + "step": 2456 + }, + { + "epoch": 0.02457, + "grad_norm": 0.6895291525630388, + "learning_rate": 0.003, + "loss": 4.2281, + "step": 2457 + }, + { + "epoch": 0.02458, + "grad_norm": 0.5932547289808822, + "learning_rate": 0.003, + "loss": 4.1934, + "step": 2458 + }, + { + "epoch": 0.02459, + "grad_norm": 0.5873468472438081, + "learning_rate": 0.003, + "loss": 4.2234, + "step": 2459 + }, + { + "epoch": 0.0246, + "grad_norm": 0.6051334869486308, + "learning_rate": 0.003, + "loss": 4.2298, + "step": 2460 + }, + { + "epoch": 0.02461, + "grad_norm": 0.598960720110885, + "learning_rate": 0.003, + "loss": 4.2047, + "step": 2461 + }, + { + "epoch": 0.02462, + "grad_norm": 0.5336669037751343, + "learning_rate": 0.003, + "loss": 4.2313, + "step": 2462 + }, + { + "epoch": 0.02463, + "grad_norm": 0.5634439620834016, + "learning_rate": 0.003, + "loss": 4.2301, + "step": 2463 + }, + { + "epoch": 0.02464, + "grad_norm": 0.6026663982662849, + "learning_rate": 0.003, + "loss": 4.2123, + "step": 2464 + }, + { + "epoch": 0.02465, + "grad_norm": 0.7167643023421324, + "learning_rate": 0.003, + "loss": 4.2336, + "step": 2465 + }, + { + "epoch": 0.02466, + "grad_norm": 0.7713681863589867, + "learning_rate": 0.003, + "loss": 4.2172, + "step": 2466 + }, + { + "epoch": 0.02467, + "grad_norm": 0.6973139861610792, + "learning_rate": 0.003, + "loss": 4.1958, + "step": 2467 + }, + { + "epoch": 0.02468, + "grad_norm": 0.6102611547152099, + "learning_rate": 0.003, + "loss": 4.2374, + "step": 2468 + }, + { + "epoch": 0.02469, + "grad_norm": 0.6541217572054333, + "learning_rate": 0.003, + "loss": 4.2207, + "step": 2469 + }, + { + "epoch": 0.0247, + "grad_norm": 0.6679631943322341, + "learning_rate": 0.003, + "loss": 4.2165, + "step": 2470 + }, + { + "epoch": 0.02471, + "grad_norm": 0.7159740472340401, + "learning_rate": 0.003, + "loss": 4.2074, + "step": 2471 + }, + { + "epoch": 0.02472, + "grad_norm": 0.7905007318669144, + "learning_rate": 0.003, + "loss": 4.2152, + "step": 2472 + }, + { + "epoch": 0.02473, + "grad_norm": 0.8760585933429962, + "learning_rate": 0.003, + "loss": 4.2103, + "step": 2473 + }, + { + "epoch": 0.02474, + "grad_norm": 0.9178632648529088, + "learning_rate": 0.003, + "loss": 4.2472, + "step": 2474 + }, + { + "epoch": 0.02475, + "grad_norm": 0.8011380404965301, + "learning_rate": 0.003, + "loss": 4.2214, + "step": 2475 + }, + { + "epoch": 0.02476, + "grad_norm": 0.8709257402916254, + "learning_rate": 0.003, + "loss": 4.2334, + "step": 2476 + }, + { + "epoch": 0.02477, + "grad_norm": 0.8785057383582693, + "learning_rate": 0.003, + "loss": 4.2701, + "step": 2477 + }, + { + "epoch": 0.02478, + "grad_norm": 0.8265644129631496, + "learning_rate": 0.003, + "loss": 4.2383, + "step": 2478 + }, + { + "epoch": 0.02479, + "grad_norm": 0.7447630055654997, + "learning_rate": 0.003, + "loss": 4.228, + "step": 2479 + }, + { + "epoch": 0.0248, + "grad_norm": 0.7215953534640771, + "learning_rate": 0.003, + "loss": 4.2659, + "step": 2480 + }, + { + "epoch": 0.02481, + "grad_norm": 0.7223104530962532, + "learning_rate": 0.003, + "loss": 4.2494, + "step": 2481 + }, + { + "epoch": 0.02482, + "grad_norm": 0.7755415514978912, + "learning_rate": 0.003, + "loss": 4.2642, + "step": 2482 + }, + { + "epoch": 0.02483, + "grad_norm": 0.8000293423835445, + "learning_rate": 0.003, + "loss": 4.2156, + "step": 2483 + }, + { + "epoch": 0.02484, + "grad_norm": 0.6385982842425331, + "learning_rate": 0.003, + "loss": 4.2408, + "step": 2484 + }, + { + "epoch": 0.02485, + "grad_norm": 0.7165633164736369, + "learning_rate": 0.003, + "loss": 4.2628, + "step": 2485 + }, + { + "epoch": 0.02486, + "grad_norm": 0.666869104779761, + "learning_rate": 0.003, + "loss": 4.2845, + "step": 2486 + }, + { + "epoch": 0.02487, + "grad_norm": 0.6526919813636585, + "learning_rate": 0.003, + "loss": 4.2376, + "step": 2487 + }, + { + "epoch": 0.02488, + "grad_norm": 0.6689740712518896, + "learning_rate": 0.003, + "loss": 4.2153, + "step": 2488 + }, + { + "epoch": 0.02489, + "grad_norm": 0.6762316340846642, + "learning_rate": 0.003, + "loss": 4.2054, + "step": 2489 + }, + { + "epoch": 0.0249, + "grad_norm": 0.6381111552878137, + "learning_rate": 0.003, + "loss": 4.2081, + "step": 2490 + }, + { + "epoch": 0.02491, + "grad_norm": 0.6575656650220585, + "learning_rate": 0.003, + "loss": 4.234, + "step": 2491 + }, + { + "epoch": 0.02492, + "grad_norm": 0.6264756996157673, + "learning_rate": 0.003, + "loss": 4.243, + "step": 2492 + }, + { + "epoch": 0.02493, + "grad_norm": 0.5475484048874111, + "learning_rate": 0.003, + "loss": 4.2085, + "step": 2493 + }, + { + "epoch": 0.02494, + "grad_norm": 0.5832237103576846, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2494 + }, + { + "epoch": 0.02495, + "grad_norm": 0.49782209661562854, + "learning_rate": 0.003, + "loss": 4.2274, + "step": 2495 + }, + { + "epoch": 0.02496, + "grad_norm": 0.403753171755924, + "learning_rate": 0.003, + "loss": 4.2141, + "step": 2496 + }, + { + "epoch": 0.02497, + "grad_norm": 0.36378014443485196, + "learning_rate": 0.003, + "loss": 4.2165, + "step": 2497 + }, + { + "epoch": 0.02498, + "grad_norm": 0.317230162546613, + "learning_rate": 0.003, + "loss": 4.2025, + "step": 2498 + }, + { + "epoch": 0.02499, + "grad_norm": 0.3277491553637038, + "learning_rate": 0.003, + "loss": 4.2117, + "step": 2499 + }, + { + "epoch": 0.025, + "grad_norm": 0.2882430663823555, + "learning_rate": 0.003, + "loss": 4.1956, + "step": 2500 + }, + { + "epoch": 0.02501, + "grad_norm": 0.27906079169809134, + "learning_rate": 0.003, + "loss": 4.2251, + "step": 2501 + }, + { + "epoch": 0.02502, + "grad_norm": 0.2871738503175601, + "learning_rate": 0.003, + "loss": 4.2056, + "step": 2502 + }, + { + "epoch": 0.02503, + "grad_norm": 0.3203204666233, + "learning_rate": 0.003, + "loss": 4.2125, + "step": 2503 + }, + { + "epoch": 0.02504, + "grad_norm": 0.38939921573272746, + "learning_rate": 0.003, + "loss": 4.1919, + "step": 2504 + }, + { + "epoch": 0.02505, + "grad_norm": 0.5688786331558378, + "learning_rate": 0.003, + "loss": 4.1964, + "step": 2505 + }, + { + "epoch": 0.02506, + "grad_norm": 0.7962360843283562, + "learning_rate": 0.003, + "loss": 4.2243, + "step": 2506 + }, + { + "epoch": 0.02507, + "grad_norm": 0.8008164951690123, + "learning_rate": 0.003, + "loss": 4.2174, + "step": 2507 + }, + { + "epoch": 0.02508, + "grad_norm": 0.5143170677845642, + "learning_rate": 0.003, + "loss": 4.2001, + "step": 2508 + }, + { + "epoch": 0.02509, + "grad_norm": 0.6566394093706333, + "learning_rate": 0.003, + "loss": 4.2119, + "step": 2509 + }, + { + "epoch": 0.0251, + "grad_norm": 0.7710410640765211, + "learning_rate": 0.003, + "loss": 4.2285, + "step": 2510 + }, + { + "epoch": 0.02511, + "grad_norm": 0.6799202006125559, + "learning_rate": 0.003, + "loss": 4.2382, + "step": 2511 + }, + { + "epoch": 0.02512, + "grad_norm": 0.6083471860257056, + "learning_rate": 0.003, + "loss": 4.2155, + "step": 2512 + }, + { + "epoch": 0.02513, + "grad_norm": 0.6298329580111398, + "learning_rate": 0.003, + "loss": 4.2139, + "step": 2513 + }, + { + "epoch": 0.02514, + "grad_norm": 0.5557311367958748, + "learning_rate": 0.003, + "loss": 4.1968, + "step": 2514 + }, + { + "epoch": 0.02515, + "grad_norm": 0.5172156322515098, + "learning_rate": 0.003, + "loss": 4.1861, + "step": 2515 + }, + { + "epoch": 0.02516, + "grad_norm": 0.458365154058261, + "learning_rate": 0.003, + "loss": 4.2042, + "step": 2516 + }, + { + "epoch": 0.02517, + "grad_norm": 0.47236102620897663, + "learning_rate": 0.003, + "loss": 4.2043, + "step": 2517 + }, + { + "epoch": 0.02518, + "grad_norm": 0.3986825790599497, + "learning_rate": 0.003, + "loss": 4.2114, + "step": 2518 + }, + { + "epoch": 0.02519, + "grad_norm": 0.402047193379362, + "learning_rate": 0.003, + "loss": 4.19, + "step": 2519 + }, + { + "epoch": 0.0252, + "grad_norm": 0.3916406367114395, + "learning_rate": 0.003, + "loss": 4.1997, + "step": 2520 + }, + { + "epoch": 0.02521, + "grad_norm": 0.41614255260417965, + "learning_rate": 0.003, + "loss": 4.196, + "step": 2521 + }, + { + "epoch": 0.02522, + "grad_norm": 0.39122808601170617, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 2522 + }, + { + "epoch": 0.02523, + "grad_norm": 0.32265648253713153, + "learning_rate": 0.003, + "loss": 4.1909, + "step": 2523 + }, + { + "epoch": 0.02524, + "grad_norm": 0.36648634510918254, + "learning_rate": 0.003, + "loss": 4.2042, + "step": 2524 + }, + { + "epoch": 0.02525, + "grad_norm": 0.36515875028981976, + "learning_rate": 0.003, + "loss": 4.2171, + "step": 2525 + }, + { + "epoch": 0.02526, + "grad_norm": 0.3879027742116084, + "learning_rate": 0.003, + "loss": 4.1612, + "step": 2526 + }, + { + "epoch": 0.02527, + "grad_norm": 0.39717009730244557, + "learning_rate": 0.003, + "loss": 4.1965, + "step": 2527 + }, + { + "epoch": 0.02528, + "grad_norm": 0.4631846026712896, + "learning_rate": 0.003, + "loss": 4.2219, + "step": 2528 + }, + { + "epoch": 0.02529, + "grad_norm": 0.5301783594747518, + "learning_rate": 0.003, + "loss": 4.1932, + "step": 2529 + }, + { + "epoch": 0.0253, + "grad_norm": 0.5779986723238283, + "learning_rate": 0.003, + "loss": 4.2154, + "step": 2530 + }, + { + "epoch": 0.02531, + "grad_norm": 0.5503995974353456, + "learning_rate": 0.003, + "loss": 4.2161, + "step": 2531 + }, + { + "epoch": 0.02532, + "grad_norm": 0.5718882840372597, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 2532 + }, + { + "epoch": 0.02533, + "grad_norm": 0.6687455514677941, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 2533 + }, + { + "epoch": 0.02534, + "grad_norm": 0.6203399011184582, + "learning_rate": 0.003, + "loss": 4.2081, + "step": 2534 + }, + { + "epoch": 0.02535, + "grad_norm": 0.7566323425219436, + "learning_rate": 0.003, + "loss": 4.205, + "step": 2535 + }, + { + "epoch": 0.02536, + "grad_norm": 1.072654720911659, + "learning_rate": 0.003, + "loss": 4.2219, + "step": 2536 + }, + { + "epoch": 0.02537, + "grad_norm": 1.0281329924391467, + "learning_rate": 0.003, + "loss": 4.2253, + "step": 2537 + }, + { + "epoch": 0.02538, + "grad_norm": 0.9199583773276817, + "learning_rate": 0.003, + "loss": 4.2009, + "step": 2538 + }, + { + "epoch": 0.02539, + "grad_norm": 0.8211807842712615, + "learning_rate": 0.003, + "loss": 4.2563, + "step": 2539 + }, + { + "epoch": 0.0254, + "grad_norm": 0.8400545711442372, + "learning_rate": 0.003, + "loss": 4.2509, + "step": 2540 + }, + { + "epoch": 0.02541, + "grad_norm": 0.820158631703877, + "learning_rate": 0.003, + "loss": 4.2399, + "step": 2541 + }, + { + "epoch": 0.02542, + "grad_norm": 0.7974385353772132, + "learning_rate": 0.003, + "loss": 4.2171, + "step": 2542 + }, + { + "epoch": 0.02543, + "grad_norm": 0.8594194387358546, + "learning_rate": 0.003, + "loss": 4.2551, + "step": 2543 + }, + { + "epoch": 0.02544, + "grad_norm": 0.7910126968684995, + "learning_rate": 0.003, + "loss": 4.244, + "step": 2544 + }, + { + "epoch": 0.02545, + "grad_norm": 0.6829853653468553, + "learning_rate": 0.003, + "loss": 4.216, + "step": 2545 + }, + { + "epoch": 0.02546, + "grad_norm": 0.7120189223135196, + "learning_rate": 0.003, + "loss": 4.2304, + "step": 2546 + }, + { + "epoch": 0.02547, + "grad_norm": 0.6042723068123944, + "learning_rate": 0.003, + "loss": 4.2606, + "step": 2547 + }, + { + "epoch": 0.02548, + "grad_norm": 0.6298594843243801, + "learning_rate": 0.003, + "loss": 4.2602, + "step": 2548 + }, + { + "epoch": 0.02549, + "grad_norm": 0.5999904649113489, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 2549 + }, + { + "epoch": 0.0255, + "grad_norm": 0.6271074405015903, + "learning_rate": 0.003, + "loss": 4.2107, + "step": 2550 + }, + { + "epoch": 0.02551, + "grad_norm": 0.6199575999348788, + "learning_rate": 0.003, + "loss": 4.2396, + "step": 2551 + }, + { + "epoch": 0.02552, + "grad_norm": 0.6195762543397098, + "learning_rate": 0.003, + "loss": 4.2076, + "step": 2552 + }, + { + "epoch": 0.02553, + "grad_norm": 0.6695668966322667, + "learning_rate": 0.003, + "loss": 4.2746, + "step": 2553 + }, + { + "epoch": 0.02554, + "grad_norm": 0.7359151892369767, + "learning_rate": 0.003, + "loss": 4.2245, + "step": 2554 + }, + { + "epoch": 0.02555, + "grad_norm": 0.7444694590769985, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 2555 + }, + { + "epoch": 0.02556, + "grad_norm": 0.5563951149157678, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2556 + }, + { + "epoch": 0.02557, + "grad_norm": 0.463326630335006, + "learning_rate": 0.003, + "loss": 4.2204, + "step": 2557 + }, + { + "epoch": 0.02558, + "grad_norm": 0.4775061885903974, + "learning_rate": 0.003, + "loss": 4.2352, + "step": 2558 + }, + { + "epoch": 0.02559, + "grad_norm": 0.45507178814749144, + "learning_rate": 0.003, + "loss": 4.229, + "step": 2559 + }, + { + "epoch": 0.0256, + "grad_norm": 0.44496507541292724, + "learning_rate": 0.003, + "loss": 4.202, + "step": 2560 + }, + { + "epoch": 0.02561, + "grad_norm": 0.3670426288233485, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 2561 + }, + { + "epoch": 0.02562, + "grad_norm": 0.38316899243773206, + "learning_rate": 0.003, + "loss": 4.2137, + "step": 2562 + }, + { + "epoch": 0.02563, + "grad_norm": 0.3516799307048104, + "learning_rate": 0.003, + "loss": 4.1944, + "step": 2563 + }, + { + "epoch": 0.02564, + "grad_norm": 0.3439157566474706, + "learning_rate": 0.003, + "loss": 4.2139, + "step": 2564 + }, + { + "epoch": 0.02565, + "grad_norm": 0.3530781028336598, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 2565 + }, + { + "epoch": 0.02566, + "grad_norm": 0.3535153102961858, + "learning_rate": 0.003, + "loss": 4.2125, + "step": 2566 + }, + { + "epoch": 0.02567, + "grad_norm": 0.41816993390160107, + "learning_rate": 0.003, + "loss": 4.2115, + "step": 2567 + }, + { + "epoch": 0.02568, + "grad_norm": 0.4566608866652098, + "learning_rate": 0.003, + "loss": 4.2058, + "step": 2568 + }, + { + "epoch": 0.02569, + "grad_norm": 0.5020275746642583, + "learning_rate": 0.003, + "loss": 4.1895, + "step": 2569 + }, + { + "epoch": 0.0257, + "grad_norm": 0.45491911821241204, + "learning_rate": 0.003, + "loss": 4.1923, + "step": 2570 + }, + { + "epoch": 0.02571, + "grad_norm": 0.4133535089644209, + "learning_rate": 0.003, + "loss": 4.2113, + "step": 2571 + }, + { + "epoch": 0.02572, + "grad_norm": 0.4042168971597752, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 2572 + }, + { + "epoch": 0.02573, + "grad_norm": 0.3823197589786481, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 2573 + }, + { + "epoch": 0.02574, + "grad_norm": 0.39218587765755697, + "learning_rate": 0.003, + "loss": 4.2238, + "step": 2574 + }, + { + "epoch": 0.02575, + "grad_norm": 0.3438035712553242, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 2575 + }, + { + "epoch": 0.02576, + "grad_norm": 0.3842191419529875, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 2576 + }, + { + "epoch": 0.02577, + "grad_norm": 0.42583222296628487, + "learning_rate": 0.003, + "loss": 4.1677, + "step": 2577 + }, + { + "epoch": 0.02578, + "grad_norm": 0.42273441184052274, + "learning_rate": 0.003, + "loss": 4.1902, + "step": 2578 + }, + { + "epoch": 0.02579, + "grad_norm": 0.46892691434613215, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 2579 + }, + { + "epoch": 0.0258, + "grad_norm": 0.5709292370942015, + "learning_rate": 0.003, + "loss": 4.199, + "step": 2580 + }, + { + "epoch": 0.02581, + "grad_norm": 0.7127132853972009, + "learning_rate": 0.003, + "loss": 4.2278, + "step": 2581 + }, + { + "epoch": 0.02582, + "grad_norm": 0.8754198005529624, + "learning_rate": 0.003, + "loss": 4.2358, + "step": 2582 + }, + { + "epoch": 0.02583, + "grad_norm": 0.8240175300819178, + "learning_rate": 0.003, + "loss": 4.2122, + "step": 2583 + }, + { + "epoch": 0.02584, + "grad_norm": 0.7842058343487789, + "learning_rate": 0.003, + "loss": 4.2267, + "step": 2584 + }, + { + "epoch": 0.02585, + "grad_norm": 0.8326334504065149, + "learning_rate": 0.003, + "loss": 4.2137, + "step": 2585 + }, + { + "epoch": 0.02586, + "grad_norm": 0.7754616837582553, + "learning_rate": 0.003, + "loss": 4.2322, + "step": 2586 + }, + { + "epoch": 0.02587, + "grad_norm": 0.6511240448731711, + "learning_rate": 0.003, + "loss": 4.2206, + "step": 2587 + }, + { + "epoch": 0.02588, + "grad_norm": 0.6418349994998516, + "learning_rate": 0.003, + "loss": 4.2155, + "step": 2588 + }, + { + "epoch": 0.02589, + "grad_norm": 0.6950697428484538, + "learning_rate": 0.003, + "loss": 4.2254, + "step": 2589 + }, + { + "epoch": 0.0259, + "grad_norm": 0.7453523656170742, + "learning_rate": 0.003, + "loss": 4.2359, + "step": 2590 + }, + { + "epoch": 0.02591, + "grad_norm": 0.6556652428839654, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 2591 + }, + { + "epoch": 0.02592, + "grad_norm": 0.5716997523979154, + "learning_rate": 0.003, + "loss": 4.2113, + "step": 2592 + }, + { + "epoch": 0.02593, + "grad_norm": 0.5895740446059741, + "learning_rate": 0.003, + "loss": 4.2131, + "step": 2593 + }, + { + "epoch": 0.02594, + "grad_norm": 0.6861270884763415, + "learning_rate": 0.003, + "loss": 4.2385, + "step": 2594 + }, + { + "epoch": 0.02595, + "grad_norm": 0.6039685752573255, + "learning_rate": 0.003, + "loss": 4.1847, + "step": 2595 + }, + { + "epoch": 0.02596, + "grad_norm": 0.5737417856631055, + "learning_rate": 0.003, + "loss": 4.2245, + "step": 2596 + }, + { + "epoch": 0.02597, + "grad_norm": 0.5807882188149026, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 2597 + }, + { + "epoch": 0.02598, + "grad_norm": 0.5478759076616263, + "learning_rate": 0.003, + "loss": 4.2296, + "step": 2598 + }, + { + "epoch": 0.02599, + "grad_norm": 0.5274069809990996, + "learning_rate": 0.003, + "loss": 4.2218, + "step": 2599 + }, + { + "epoch": 0.026, + "grad_norm": 0.5489291707939286, + "learning_rate": 0.003, + "loss": 4.2225, + "step": 2600 + }, + { + "epoch": 0.02601, + "grad_norm": 0.5075235418141223, + "learning_rate": 0.003, + "loss": 4.2266, + "step": 2601 + }, + { + "epoch": 0.02602, + "grad_norm": 0.5092543256680011, + "learning_rate": 0.003, + "loss": 4.1993, + "step": 2602 + }, + { + "epoch": 0.02603, + "grad_norm": 0.48420449669893695, + "learning_rate": 0.003, + "loss": 4.1912, + "step": 2603 + }, + { + "epoch": 0.02604, + "grad_norm": 0.4845552481378335, + "learning_rate": 0.003, + "loss": 4.181, + "step": 2604 + }, + { + "epoch": 0.02605, + "grad_norm": 0.522772142438526, + "learning_rate": 0.003, + "loss": 4.1993, + "step": 2605 + }, + { + "epoch": 0.02606, + "grad_norm": 0.49986531502998316, + "learning_rate": 0.003, + "loss": 4.2158, + "step": 2606 + }, + { + "epoch": 0.02607, + "grad_norm": 0.4992114205626333, + "learning_rate": 0.003, + "loss": 4.2071, + "step": 2607 + }, + { + "epoch": 0.02608, + "grad_norm": 0.516906776877324, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 2608 + }, + { + "epoch": 0.02609, + "grad_norm": 0.5823377881437489, + "learning_rate": 0.003, + "loss": 4.2336, + "step": 2609 + }, + { + "epoch": 0.0261, + "grad_norm": 0.6432677415233223, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 2610 + }, + { + "epoch": 0.02611, + "grad_norm": 0.6818797064218945, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 2611 + }, + { + "epoch": 0.02612, + "grad_norm": 0.6622646028172395, + "learning_rate": 0.003, + "loss": 4.2188, + "step": 2612 + }, + { + "epoch": 0.02613, + "grad_norm": 0.6111717401030455, + "learning_rate": 0.003, + "loss": 4.2101, + "step": 2613 + }, + { + "epoch": 0.02614, + "grad_norm": 0.5350928499489108, + "learning_rate": 0.003, + "loss": 4.1863, + "step": 2614 + }, + { + "epoch": 0.02615, + "grad_norm": 0.4811589002895373, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 2615 + }, + { + "epoch": 0.02616, + "grad_norm": 0.4870958333875311, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 2616 + }, + { + "epoch": 0.02617, + "grad_norm": 0.48293626841319026, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 2617 + }, + { + "epoch": 0.02618, + "grad_norm": 0.5217244185547227, + "learning_rate": 0.003, + "loss": 4.2041, + "step": 2618 + }, + { + "epoch": 0.02619, + "grad_norm": 0.5234547384625777, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 2619 + }, + { + "epoch": 0.0262, + "grad_norm": 0.4734488648814552, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 2620 + }, + { + "epoch": 0.02621, + "grad_norm": 0.4072710001568894, + "learning_rate": 0.003, + "loss": 4.2023, + "step": 2621 + }, + { + "epoch": 0.02622, + "grad_norm": 0.3670914624450667, + "learning_rate": 0.003, + "loss": 4.1997, + "step": 2622 + }, + { + "epoch": 0.02623, + "grad_norm": 0.3907209927814348, + "learning_rate": 0.003, + "loss": 4.2044, + "step": 2623 + }, + { + "epoch": 0.02624, + "grad_norm": 0.4112397678843122, + "learning_rate": 0.003, + "loss": 4.2395, + "step": 2624 + }, + { + "epoch": 0.02625, + "grad_norm": 0.4373318792363482, + "learning_rate": 0.003, + "loss": 4.2339, + "step": 2625 + }, + { + "epoch": 0.02626, + "grad_norm": 0.43765342876243357, + "learning_rate": 0.003, + "loss": 4.1682, + "step": 2626 + }, + { + "epoch": 0.02627, + "grad_norm": 0.44112134115596174, + "learning_rate": 0.003, + "loss": 4.1845, + "step": 2627 + }, + { + "epoch": 0.02628, + "grad_norm": 0.5820511792087758, + "learning_rate": 0.003, + "loss": 4.2196, + "step": 2628 + }, + { + "epoch": 0.02629, + "grad_norm": 0.6987525856490595, + "learning_rate": 0.003, + "loss": 4.1665, + "step": 2629 + }, + { + "epoch": 0.0263, + "grad_norm": 0.8227125293487279, + "learning_rate": 0.003, + "loss": 4.2044, + "step": 2630 + }, + { + "epoch": 0.02631, + "grad_norm": 0.8558111239802438, + "learning_rate": 0.003, + "loss": 4.2316, + "step": 2631 + }, + { + "epoch": 0.02632, + "grad_norm": 0.7416342423752187, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 2632 + }, + { + "epoch": 0.02633, + "grad_norm": 0.7123350797241652, + "learning_rate": 0.003, + "loss": 4.2064, + "step": 2633 + }, + { + "epoch": 0.02634, + "grad_norm": 0.7492282422336083, + "learning_rate": 0.003, + "loss": 4.2291, + "step": 2634 + }, + { + "epoch": 0.02635, + "grad_norm": 0.9073437715289041, + "learning_rate": 0.003, + "loss": 4.2382, + "step": 2635 + }, + { + "epoch": 0.02636, + "grad_norm": 0.8986238889024204, + "learning_rate": 0.003, + "loss": 4.2298, + "step": 2636 + }, + { + "epoch": 0.02637, + "grad_norm": 0.803485832317335, + "learning_rate": 0.003, + "loss": 4.2558, + "step": 2637 + }, + { + "epoch": 0.02638, + "grad_norm": 0.790197716438678, + "learning_rate": 0.003, + "loss": 4.2252, + "step": 2638 + }, + { + "epoch": 0.02639, + "grad_norm": 0.6490246383904035, + "learning_rate": 0.003, + "loss": 4.2065, + "step": 2639 + }, + { + "epoch": 0.0264, + "grad_norm": 0.6271932479026731, + "learning_rate": 0.003, + "loss": 4.2032, + "step": 2640 + }, + { + "epoch": 0.02641, + "grad_norm": 0.5216470184091793, + "learning_rate": 0.003, + "loss": 4.212, + "step": 2641 + }, + { + "epoch": 0.02642, + "grad_norm": 0.5340137157215237, + "learning_rate": 0.003, + "loss": 4.2208, + "step": 2642 + }, + { + "epoch": 0.02643, + "grad_norm": 0.48600536528948673, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2643 + }, + { + "epoch": 0.02644, + "grad_norm": 0.4913843517241883, + "learning_rate": 0.003, + "loss": 4.2177, + "step": 2644 + }, + { + "epoch": 0.02645, + "grad_norm": 0.5723820549842618, + "learning_rate": 0.003, + "loss": 4.2298, + "step": 2645 + }, + { + "epoch": 0.02646, + "grad_norm": 0.6080093721456342, + "learning_rate": 0.003, + "loss": 4.1979, + "step": 2646 + }, + { + "epoch": 0.02647, + "grad_norm": 0.6222106205330706, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2647 + }, + { + "epoch": 0.02648, + "grad_norm": 0.5217769121416035, + "learning_rate": 0.003, + "loss": 4.2349, + "step": 2648 + }, + { + "epoch": 0.02649, + "grad_norm": 0.4227997429869736, + "learning_rate": 0.003, + "loss": 4.2183, + "step": 2649 + }, + { + "epoch": 0.0265, + "grad_norm": 0.47327414101860077, + "learning_rate": 0.003, + "loss": 4.178, + "step": 2650 + }, + { + "epoch": 0.02651, + "grad_norm": 0.5099068125596607, + "learning_rate": 0.003, + "loss": 4.1959, + "step": 2651 + }, + { + "epoch": 0.02652, + "grad_norm": 0.60276863782341, + "learning_rate": 0.003, + "loss": 4.2278, + "step": 2652 + }, + { + "epoch": 0.02653, + "grad_norm": 0.6740196425733591, + "learning_rate": 0.003, + "loss": 4.1869, + "step": 2653 + }, + { + "epoch": 0.02654, + "grad_norm": 0.6718723086436392, + "learning_rate": 0.003, + "loss": 4.2008, + "step": 2654 + }, + { + "epoch": 0.02655, + "grad_norm": 0.6985692134702927, + "learning_rate": 0.003, + "loss": 4.21, + "step": 2655 + }, + { + "epoch": 0.02656, + "grad_norm": 0.7105425653160966, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 2656 + }, + { + "epoch": 0.02657, + "grad_norm": 0.6933600976197403, + "learning_rate": 0.003, + "loss": 4.2156, + "step": 2657 + }, + { + "epoch": 0.02658, + "grad_norm": 0.6544303516003507, + "learning_rate": 0.003, + "loss": 4.2197, + "step": 2658 + }, + { + "epoch": 0.02659, + "grad_norm": 0.5910001636604436, + "learning_rate": 0.003, + "loss": 4.2113, + "step": 2659 + }, + { + "epoch": 0.0266, + "grad_norm": 0.5163250162883284, + "learning_rate": 0.003, + "loss": 4.1915, + "step": 2660 + }, + { + "epoch": 0.02661, + "grad_norm": 0.5302434720729938, + "learning_rate": 0.003, + "loss": 4.2129, + "step": 2661 + }, + { + "epoch": 0.02662, + "grad_norm": 0.4750913186060552, + "learning_rate": 0.003, + "loss": 4.1991, + "step": 2662 + }, + { + "epoch": 0.02663, + "grad_norm": 0.4491722077606405, + "learning_rate": 0.003, + "loss": 4.1805, + "step": 2663 + }, + { + "epoch": 0.02664, + "grad_norm": 0.42258758049488826, + "learning_rate": 0.003, + "loss": 4.1803, + "step": 2664 + }, + { + "epoch": 0.02665, + "grad_norm": 0.41774121759742056, + "learning_rate": 0.003, + "loss": 4.2287, + "step": 2665 + }, + { + "epoch": 0.02666, + "grad_norm": 0.43325477990837064, + "learning_rate": 0.003, + "loss": 4.2184, + "step": 2666 + }, + { + "epoch": 0.02667, + "grad_norm": 0.4508456123093455, + "learning_rate": 0.003, + "loss": 4.1974, + "step": 2667 + }, + { + "epoch": 0.02668, + "grad_norm": 0.46262752082913233, + "learning_rate": 0.003, + "loss": 4.1674, + "step": 2668 + }, + { + "epoch": 0.02669, + "grad_norm": 0.42577584585471717, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 2669 + }, + { + "epoch": 0.0267, + "grad_norm": 0.404720023333166, + "learning_rate": 0.003, + "loss": 4.1856, + "step": 2670 + }, + { + "epoch": 0.02671, + "grad_norm": 0.3942334538580407, + "learning_rate": 0.003, + "loss": 4.2006, + "step": 2671 + }, + { + "epoch": 0.02672, + "grad_norm": 0.40993974344347783, + "learning_rate": 0.003, + "loss": 4.1947, + "step": 2672 + }, + { + "epoch": 0.02673, + "grad_norm": 0.4616934932283336, + "learning_rate": 0.003, + "loss": 4.1918, + "step": 2673 + }, + { + "epoch": 0.02674, + "grad_norm": 0.5514329672225634, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 2674 + }, + { + "epoch": 0.02675, + "grad_norm": 0.5466818103576505, + "learning_rate": 0.003, + "loss": 4.1643, + "step": 2675 + }, + { + "epoch": 0.02676, + "grad_norm": 0.6935187296359256, + "learning_rate": 0.003, + "loss": 4.1899, + "step": 2676 + }, + { + "epoch": 0.02677, + "grad_norm": 0.8598433315495629, + "learning_rate": 0.003, + "loss": 4.231, + "step": 2677 + }, + { + "epoch": 0.02678, + "grad_norm": 0.8883830728999605, + "learning_rate": 0.003, + "loss": 4.2344, + "step": 2678 + }, + { + "epoch": 0.02679, + "grad_norm": 0.6405381161924845, + "learning_rate": 0.003, + "loss": 4.1941, + "step": 2679 + }, + { + "epoch": 0.0268, + "grad_norm": 0.6915639860426047, + "learning_rate": 0.003, + "loss": 4.2139, + "step": 2680 + }, + { + "epoch": 0.02681, + "grad_norm": 0.5927165760939286, + "learning_rate": 0.003, + "loss": 4.2278, + "step": 2681 + }, + { + "epoch": 0.02682, + "grad_norm": 0.5745893208237367, + "learning_rate": 0.003, + "loss": 4.2044, + "step": 2682 + }, + { + "epoch": 0.02683, + "grad_norm": 0.5931786049326642, + "learning_rate": 0.003, + "loss": 4.2269, + "step": 2683 + }, + { + "epoch": 0.02684, + "grad_norm": 0.5656789702546796, + "learning_rate": 0.003, + "loss": 4.1996, + "step": 2684 + }, + { + "epoch": 0.02685, + "grad_norm": 0.6294556799351446, + "learning_rate": 0.003, + "loss": 4.211, + "step": 2685 + }, + { + "epoch": 0.02686, + "grad_norm": 0.5832268033312072, + "learning_rate": 0.003, + "loss": 4.1905, + "step": 2686 + }, + { + "epoch": 0.02687, + "grad_norm": 0.570290226028237, + "learning_rate": 0.003, + "loss": 4.1996, + "step": 2687 + }, + { + "epoch": 0.02688, + "grad_norm": 0.5455818208368779, + "learning_rate": 0.003, + "loss": 4.1744, + "step": 2688 + }, + { + "epoch": 0.02689, + "grad_norm": 0.5599812876010125, + "learning_rate": 0.003, + "loss": 4.2088, + "step": 2689 + }, + { + "epoch": 0.0269, + "grad_norm": 0.5474516018001153, + "learning_rate": 0.003, + "loss": 4.2129, + "step": 2690 + }, + { + "epoch": 0.02691, + "grad_norm": 0.5566067071138427, + "learning_rate": 0.003, + "loss": 4.2043, + "step": 2691 + }, + { + "epoch": 0.02692, + "grad_norm": 0.526509280614571, + "learning_rate": 0.003, + "loss": 4.2054, + "step": 2692 + }, + { + "epoch": 0.02693, + "grad_norm": 0.4827516603970495, + "learning_rate": 0.003, + "loss": 4.218, + "step": 2693 + }, + { + "epoch": 0.02694, + "grad_norm": 0.46801137768639334, + "learning_rate": 0.003, + "loss": 4.1783, + "step": 2694 + }, + { + "epoch": 0.02695, + "grad_norm": 0.4878539547841592, + "learning_rate": 0.003, + "loss": 4.1935, + "step": 2695 + }, + { + "epoch": 0.02696, + "grad_norm": 0.47854515525549607, + "learning_rate": 0.003, + "loss": 4.1948, + "step": 2696 + }, + { + "epoch": 0.02697, + "grad_norm": 0.5065481594378214, + "learning_rate": 0.003, + "loss": 4.2065, + "step": 2697 + }, + { + "epoch": 0.02698, + "grad_norm": 0.5248263427239105, + "learning_rate": 0.003, + "loss": 4.1898, + "step": 2698 + }, + { + "epoch": 0.02699, + "grad_norm": 0.5277451425908661, + "learning_rate": 0.003, + "loss": 4.2105, + "step": 2699 + }, + { + "epoch": 0.027, + "grad_norm": 0.5538062675395776, + "learning_rate": 0.003, + "loss": 4.1806, + "step": 2700 + }, + { + "epoch": 0.02701, + "grad_norm": 0.6185989339314542, + "learning_rate": 0.003, + "loss": 4.173, + "step": 2701 + }, + { + "epoch": 0.02702, + "grad_norm": 0.5874971422385314, + "learning_rate": 0.003, + "loss": 4.225, + "step": 2702 + }, + { + "epoch": 0.02703, + "grad_norm": 0.5463427926178012, + "learning_rate": 0.003, + "loss": 4.2202, + "step": 2703 + }, + { + "epoch": 0.02704, + "grad_norm": 0.5672786289651166, + "learning_rate": 0.003, + "loss": 4.1641, + "step": 2704 + }, + { + "epoch": 0.02705, + "grad_norm": 0.6256411921927717, + "learning_rate": 0.003, + "loss": 4.2194, + "step": 2705 + }, + { + "epoch": 0.02706, + "grad_norm": 0.652684226752671, + "learning_rate": 0.003, + "loss": 4.1841, + "step": 2706 + }, + { + "epoch": 0.02707, + "grad_norm": 0.6029115328911401, + "learning_rate": 0.003, + "loss": 4.1842, + "step": 2707 + }, + { + "epoch": 0.02708, + "grad_norm": 0.5287197350990817, + "learning_rate": 0.003, + "loss": 4.2087, + "step": 2708 + }, + { + "epoch": 0.02709, + "grad_norm": 0.6492025140814462, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 2709 + }, + { + "epoch": 0.0271, + "grad_norm": 0.6051042816144401, + "learning_rate": 0.003, + "loss": 4.1894, + "step": 2710 + }, + { + "epoch": 0.02711, + "grad_norm": 0.5747478778857192, + "learning_rate": 0.003, + "loss": 4.1754, + "step": 2711 + }, + { + "epoch": 0.02712, + "grad_norm": 0.5615755138550911, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 2712 + }, + { + "epoch": 0.02713, + "grad_norm": 0.5892906666649981, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 2713 + }, + { + "epoch": 0.02714, + "grad_norm": 0.6013481891384199, + "learning_rate": 0.003, + "loss": 4.1837, + "step": 2714 + }, + { + "epoch": 0.02715, + "grad_norm": 0.6656155771795913, + "learning_rate": 0.003, + "loss": 4.1807, + "step": 2715 + }, + { + "epoch": 0.02716, + "grad_norm": 0.8931240767996229, + "learning_rate": 0.003, + "loss": 4.2057, + "step": 2716 + }, + { + "epoch": 0.02717, + "grad_norm": 0.9885029624018516, + "learning_rate": 0.003, + "loss": 4.2196, + "step": 2717 + }, + { + "epoch": 0.02718, + "grad_norm": 0.8754295759055237, + "learning_rate": 0.003, + "loss": 4.2405, + "step": 2718 + }, + { + "epoch": 0.02719, + "grad_norm": 0.7774225786110023, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 2719 + }, + { + "epoch": 0.0272, + "grad_norm": 0.6491659668608393, + "learning_rate": 0.003, + "loss": 4.2007, + "step": 2720 + }, + { + "epoch": 0.02721, + "grad_norm": 0.6343965078219608, + "learning_rate": 0.003, + "loss": 4.2145, + "step": 2721 + }, + { + "epoch": 0.02722, + "grad_norm": 0.6251153466031798, + "learning_rate": 0.003, + "loss": 4.2101, + "step": 2722 + }, + { + "epoch": 0.02723, + "grad_norm": 0.6748255813851252, + "learning_rate": 0.003, + "loss": 4.2215, + "step": 2723 + }, + { + "epoch": 0.02724, + "grad_norm": 0.6864377875600585, + "learning_rate": 0.003, + "loss": 4.2222, + "step": 2724 + }, + { + "epoch": 0.02725, + "grad_norm": 0.657540480175467, + "learning_rate": 0.003, + "loss": 4.2162, + "step": 2725 + }, + { + "epoch": 0.02726, + "grad_norm": 0.67217283621503, + "learning_rate": 0.003, + "loss": 4.1914, + "step": 2726 + }, + { + "epoch": 0.02727, + "grad_norm": 0.7144546093311819, + "learning_rate": 0.003, + "loss": 4.2253, + "step": 2727 + }, + { + "epoch": 0.02728, + "grad_norm": 0.6685628291749064, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 2728 + }, + { + "epoch": 0.02729, + "grad_norm": 0.6958710627910398, + "learning_rate": 0.003, + "loss": 4.2431, + "step": 2729 + }, + { + "epoch": 0.0273, + "grad_norm": 0.7029911290820989, + "learning_rate": 0.003, + "loss": 4.2118, + "step": 2730 + }, + { + "epoch": 0.02731, + "grad_norm": 0.7388699066611659, + "learning_rate": 0.003, + "loss": 4.2131, + "step": 2731 + }, + { + "epoch": 0.02732, + "grad_norm": 0.6457150813790012, + "learning_rate": 0.003, + "loss": 4.2055, + "step": 2732 + }, + { + "epoch": 0.02733, + "grad_norm": 0.42245973261823044, + "learning_rate": 0.003, + "loss": 4.225, + "step": 2733 + }, + { + "epoch": 0.02734, + "grad_norm": 0.4568223524843039, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 2734 + }, + { + "epoch": 0.02735, + "grad_norm": 0.46161311769792607, + "learning_rate": 0.003, + "loss": 4.2213, + "step": 2735 + }, + { + "epoch": 0.02736, + "grad_norm": 0.43828909949858114, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 2736 + }, + { + "epoch": 0.02737, + "grad_norm": 0.44381003049579976, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 2737 + }, + { + "epoch": 0.02738, + "grad_norm": 0.40770566309449235, + "learning_rate": 0.003, + "loss": 4.207, + "step": 2738 + }, + { + "epoch": 0.02739, + "grad_norm": 0.3980139287869944, + "learning_rate": 0.003, + "loss": 4.1947, + "step": 2739 + }, + { + "epoch": 0.0274, + "grad_norm": 0.4103938581926708, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 2740 + }, + { + "epoch": 0.02741, + "grad_norm": 0.3977644284371821, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 2741 + }, + { + "epoch": 0.02742, + "grad_norm": 0.3910236660359437, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 2742 + }, + { + "epoch": 0.02743, + "grad_norm": 0.37057712717675134, + "learning_rate": 0.003, + "loss": 4.2028, + "step": 2743 + }, + { + "epoch": 0.02744, + "grad_norm": 0.39171990335728823, + "learning_rate": 0.003, + "loss": 4.1862, + "step": 2744 + }, + { + "epoch": 0.02745, + "grad_norm": 0.3931440185763024, + "learning_rate": 0.003, + "loss": 4.1978, + "step": 2745 + }, + { + "epoch": 0.02746, + "grad_norm": 0.4493443882352147, + "learning_rate": 0.003, + "loss": 4.1722, + "step": 2746 + }, + { + "epoch": 0.02747, + "grad_norm": 0.5239427386961047, + "learning_rate": 0.003, + "loss": 4.2084, + "step": 2747 + }, + { + "epoch": 0.02748, + "grad_norm": 0.48560097013750286, + "learning_rate": 0.003, + "loss": 4.16, + "step": 2748 + }, + { + "epoch": 0.02749, + "grad_norm": 0.4388250470896872, + "learning_rate": 0.003, + "loss": 4.1614, + "step": 2749 + }, + { + "epoch": 0.0275, + "grad_norm": 0.425974848346712, + "learning_rate": 0.003, + "loss": 4.1976, + "step": 2750 + }, + { + "epoch": 0.02751, + "grad_norm": 0.4487191491618812, + "learning_rate": 0.003, + "loss": 4.2102, + "step": 2751 + }, + { + "epoch": 0.02752, + "grad_norm": 0.5262082176933003, + "learning_rate": 0.003, + "loss": 4.1993, + "step": 2752 + }, + { + "epoch": 0.02753, + "grad_norm": 0.5624164107816553, + "learning_rate": 0.003, + "loss": 4.1806, + "step": 2753 + }, + { + "epoch": 0.02754, + "grad_norm": 0.6290188699871871, + "learning_rate": 0.003, + "loss": 4.1674, + "step": 2754 + }, + { + "epoch": 0.02755, + "grad_norm": 0.6718718115523771, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 2755 + }, + { + "epoch": 0.02756, + "grad_norm": 0.689725298181599, + "learning_rate": 0.003, + "loss": 4.2042, + "step": 2756 + }, + { + "epoch": 0.02757, + "grad_norm": 0.7456661709777471, + "learning_rate": 0.003, + "loss": 4.2182, + "step": 2757 + }, + { + "epoch": 0.02758, + "grad_norm": 0.7434755613308037, + "learning_rate": 0.003, + "loss": 4.175, + "step": 2758 + }, + { + "epoch": 0.02759, + "grad_norm": 0.6285033589958848, + "learning_rate": 0.003, + "loss": 4.1982, + "step": 2759 + }, + { + "epoch": 0.0276, + "grad_norm": 0.6775730198112929, + "learning_rate": 0.003, + "loss": 4.2094, + "step": 2760 + }, + { + "epoch": 0.02761, + "grad_norm": 0.6910341516486844, + "learning_rate": 0.003, + "loss": 4.2268, + "step": 2761 + }, + { + "epoch": 0.02762, + "grad_norm": 0.6650852460123947, + "learning_rate": 0.003, + "loss": 4.1938, + "step": 2762 + }, + { + "epoch": 0.02763, + "grad_norm": 0.6011271334026552, + "learning_rate": 0.003, + "loss": 4.2029, + "step": 2763 + }, + { + "epoch": 0.02764, + "grad_norm": 0.5886973411048619, + "learning_rate": 0.003, + "loss": 4.1962, + "step": 2764 + }, + { + "epoch": 0.02765, + "grad_norm": 0.6682064585302908, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2765 + }, + { + "epoch": 0.02766, + "grad_norm": 0.7004868212188018, + "learning_rate": 0.003, + "loss": 4.1783, + "step": 2766 + }, + { + "epoch": 0.02767, + "grad_norm": 0.6391373162549133, + "learning_rate": 0.003, + "loss": 4.2317, + "step": 2767 + }, + { + "epoch": 0.02768, + "grad_norm": 0.5711886608834656, + "learning_rate": 0.003, + "loss": 4.1613, + "step": 2768 + }, + { + "epoch": 0.02769, + "grad_norm": 0.6028604983542873, + "learning_rate": 0.003, + "loss": 4.2212, + "step": 2769 + }, + { + "epoch": 0.0277, + "grad_norm": 0.5836613470106289, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 2770 + }, + { + "epoch": 0.02771, + "grad_norm": 0.5619610082417591, + "learning_rate": 0.003, + "loss": 4.174, + "step": 2771 + }, + { + "epoch": 0.02772, + "grad_norm": 0.55339038708748, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 2772 + }, + { + "epoch": 0.02773, + "grad_norm": 0.5034136342234373, + "learning_rate": 0.003, + "loss": 4.2003, + "step": 2773 + }, + { + "epoch": 0.02774, + "grad_norm": 0.5193760267646912, + "learning_rate": 0.003, + "loss": 4.2225, + "step": 2774 + }, + { + "epoch": 0.02775, + "grad_norm": 0.5211288768219952, + "learning_rate": 0.003, + "loss": 4.1899, + "step": 2775 + }, + { + "epoch": 0.02776, + "grad_norm": 0.4784203907496932, + "learning_rate": 0.003, + "loss": 4.1844, + "step": 2776 + }, + { + "epoch": 0.02777, + "grad_norm": 0.48277039376268643, + "learning_rate": 0.003, + "loss": 4.1625, + "step": 2777 + }, + { + "epoch": 0.02778, + "grad_norm": 0.5520225977650862, + "learning_rate": 0.003, + "loss": 4.2068, + "step": 2778 + }, + { + "epoch": 0.02779, + "grad_norm": 0.676790078006391, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 2779 + }, + { + "epoch": 0.0278, + "grad_norm": 0.7848726112221234, + "learning_rate": 0.003, + "loss": 4.1832, + "step": 2780 + }, + { + "epoch": 0.02781, + "grad_norm": 0.7347482785122091, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 2781 + }, + { + "epoch": 0.02782, + "grad_norm": 0.6054417831566177, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 2782 + }, + { + "epoch": 0.02783, + "grad_norm": 0.5849343545068603, + "learning_rate": 0.003, + "loss": 4.1925, + "step": 2783 + }, + { + "epoch": 0.02784, + "grad_norm": 0.6041156935596753, + "learning_rate": 0.003, + "loss": 4.1877, + "step": 2784 + }, + { + "epoch": 0.02785, + "grad_norm": 0.595867835436581, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 2785 + }, + { + "epoch": 0.02786, + "grad_norm": 0.6928930555141227, + "learning_rate": 0.003, + "loss": 4.2052, + "step": 2786 + }, + { + "epoch": 0.02787, + "grad_norm": 0.7406479460311315, + "learning_rate": 0.003, + "loss": 4.2115, + "step": 2787 + }, + { + "epoch": 0.02788, + "grad_norm": 0.6940976950244777, + "learning_rate": 0.003, + "loss": 4.1982, + "step": 2788 + }, + { + "epoch": 0.02789, + "grad_norm": 0.6273541949163579, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 2789 + }, + { + "epoch": 0.0279, + "grad_norm": 0.5979480136249415, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 2790 + }, + { + "epoch": 0.02791, + "grad_norm": 0.5594196614238619, + "learning_rate": 0.003, + "loss": 4.1864, + "step": 2791 + }, + { + "epoch": 0.02792, + "grad_norm": 0.5206581117733323, + "learning_rate": 0.003, + "loss": 4.2009, + "step": 2792 + }, + { + "epoch": 0.02793, + "grad_norm": 0.5275957045486279, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 2793 + }, + { + "epoch": 0.02794, + "grad_norm": 0.524155250681782, + "learning_rate": 0.003, + "loss": 4.194, + "step": 2794 + }, + { + "epoch": 0.02795, + "grad_norm": 0.5721629122704371, + "learning_rate": 0.003, + "loss": 4.2082, + "step": 2795 + }, + { + "epoch": 0.02796, + "grad_norm": 0.6009605504824468, + "learning_rate": 0.003, + "loss": 4.2021, + "step": 2796 + }, + { + "epoch": 0.02797, + "grad_norm": 0.5762272128929422, + "learning_rate": 0.003, + "loss": 4.2131, + "step": 2797 + }, + { + "epoch": 0.02798, + "grad_norm": 0.5678013400030233, + "learning_rate": 0.003, + "loss": 4.2087, + "step": 2798 + }, + { + "epoch": 0.02799, + "grad_norm": 0.5466966091962687, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 2799 + }, + { + "epoch": 0.028, + "grad_norm": 0.5426896151230969, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2800 + }, + { + "epoch": 0.02801, + "grad_norm": 0.5309488273760117, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 2801 + }, + { + "epoch": 0.02802, + "grad_norm": 0.5742220089169363, + "learning_rate": 0.003, + "loss": 4.1824, + "step": 2802 + }, + { + "epoch": 0.02803, + "grad_norm": 0.7893252753789815, + "learning_rate": 0.003, + "loss": 4.2101, + "step": 2803 + }, + { + "epoch": 0.02804, + "grad_norm": 1.052487841789281, + "learning_rate": 0.003, + "loss": 4.2001, + "step": 2804 + }, + { + "epoch": 0.02805, + "grad_norm": 0.8655535125529751, + "learning_rate": 0.003, + "loss": 4.2346, + "step": 2805 + }, + { + "epoch": 0.02806, + "grad_norm": 0.6475434652146125, + "learning_rate": 0.003, + "loss": 4.202, + "step": 2806 + }, + { + "epoch": 0.02807, + "grad_norm": 0.5991110542030375, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 2807 + }, + { + "epoch": 0.02808, + "grad_norm": 0.5007983686795169, + "learning_rate": 0.003, + "loss": 4.1818, + "step": 2808 + }, + { + "epoch": 0.02809, + "grad_norm": 0.49095808154292164, + "learning_rate": 0.003, + "loss": 4.2217, + "step": 2809 + }, + { + "epoch": 0.0281, + "grad_norm": 0.4819118161545627, + "learning_rate": 0.003, + "loss": 4.1742, + "step": 2810 + }, + { + "epoch": 0.02811, + "grad_norm": 0.44009677007510295, + "learning_rate": 0.003, + "loss": 4.1812, + "step": 2811 + }, + { + "epoch": 0.02812, + "grad_norm": 0.4080215487522985, + "learning_rate": 0.003, + "loss": 4.1953, + "step": 2812 + }, + { + "epoch": 0.02813, + "grad_norm": 0.3937509668583135, + "learning_rate": 0.003, + "loss": 4.1839, + "step": 2813 + }, + { + "epoch": 0.02814, + "grad_norm": 0.34050782014520087, + "learning_rate": 0.003, + "loss": 4.1814, + "step": 2814 + }, + { + "epoch": 0.02815, + "grad_norm": 0.32268087418962427, + "learning_rate": 0.003, + "loss": 4.2181, + "step": 2815 + }, + { + "epoch": 0.02816, + "grad_norm": 0.3255642062500674, + "learning_rate": 0.003, + "loss": 4.1944, + "step": 2816 + }, + { + "epoch": 0.02817, + "grad_norm": 0.3381409922347926, + "learning_rate": 0.003, + "loss": 4.1508, + "step": 2817 + }, + { + "epoch": 0.02818, + "grad_norm": 0.38168796766705765, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 2818 + }, + { + "epoch": 0.02819, + "grad_norm": 0.4325061454583649, + "learning_rate": 0.003, + "loss": 4.1913, + "step": 2819 + }, + { + "epoch": 0.0282, + "grad_norm": 0.4532493695937539, + "learning_rate": 0.003, + "loss": 4.1852, + "step": 2820 + }, + { + "epoch": 0.02821, + "grad_norm": 0.47151197099161557, + "learning_rate": 0.003, + "loss": 4.2029, + "step": 2821 + }, + { + "epoch": 0.02822, + "grad_norm": 0.550967405378687, + "learning_rate": 0.003, + "loss": 4.1762, + "step": 2822 + }, + { + "epoch": 0.02823, + "grad_norm": 0.5888233523434347, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2823 + }, + { + "epoch": 0.02824, + "grad_norm": 0.5616734570991271, + "learning_rate": 0.003, + "loss": 4.1907, + "step": 2824 + }, + { + "epoch": 0.02825, + "grad_norm": 0.4685382550291958, + "learning_rate": 0.003, + "loss": 4.1898, + "step": 2825 + }, + { + "epoch": 0.02826, + "grad_norm": 0.3984745923734355, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 2826 + }, + { + "epoch": 0.02827, + "grad_norm": 0.4466687377830192, + "learning_rate": 0.003, + "loss": 4.153, + "step": 2827 + }, + { + "epoch": 0.02828, + "grad_norm": 0.48090906385724086, + "learning_rate": 0.003, + "loss": 4.1939, + "step": 2828 + }, + { + "epoch": 0.02829, + "grad_norm": 0.4887994839877233, + "learning_rate": 0.003, + "loss": 4.2162, + "step": 2829 + }, + { + "epoch": 0.0283, + "grad_norm": 0.5659608141025803, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 2830 + }, + { + "epoch": 0.02831, + "grad_norm": 0.6332182296297022, + "learning_rate": 0.003, + "loss": 4.1886, + "step": 2831 + }, + { + "epoch": 0.02832, + "grad_norm": 0.7157897208149597, + "learning_rate": 0.003, + "loss": 4.2049, + "step": 2832 + }, + { + "epoch": 0.02833, + "grad_norm": 0.7488210186844626, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 2833 + }, + { + "epoch": 0.02834, + "grad_norm": 0.721287579139196, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 2834 + }, + { + "epoch": 0.02835, + "grad_norm": 0.7107105075536025, + "learning_rate": 0.003, + "loss": 4.1873, + "step": 2835 + }, + { + "epoch": 0.02836, + "grad_norm": 0.7041301874694449, + "learning_rate": 0.003, + "loss": 4.2059, + "step": 2836 + }, + { + "epoch": 0.02837, + "grad_norm": 0.7504397142936561, + "learning_rate": 0.003, + "loss": 4.182, + "step": 2837 + }, + { + "epoch": 0.02838, + "grad_norm": 0.7162503617742563, + "learning_rate": 0.003, + "loss": 4.1502, + "step": 2838 + }, + { + "epoch": 0.02839, + "grad_norm": 0.6629058755892886, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 2839 + }, + { + "epoch": 0.0284, + "grad_norm": 0.8045463625357235, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 2840 + }, + { + "epoch": 0.02841, + "grad_norm": 0.8128195799944198, + "learning_rate": 0.003, + "loss": 4.2167, + "step": 2841 + }, + { + "epoch": 0.02842, + "grad_norm": 0.9021793870071328, + "learning_rate": 0.003, + "loss": 4.201, + "step": 2842 + }, + { + "epoch": 0.02843, + "grad_norm": 0.9569629998728558, + "learning_rate": 0.003, + "loss": 4.2417, + "step": 2843 + }, + { + "epoch": 0.02844, + "grad_norm": 0.9483564800535025, + "learning_rate": 0.003, + "loss": 4.2425, + "step": 2844 + }, + { + "epoch": 0.02845, + "grad_norm": 0.9913358728662397, + "learning_rate": 0.003, + "loss": 4.2312, + "step": 2845 + }, + { + "epoch": 0.02846, + "grad_norm": 1.063054241841927, + "learning_rate": 0.003, + "loss": 4.2341, + "step": 2846 + }, + { + "epoch": 0.02847, + "grad_norm": 1.0645706257254368, + "learning_rate": 0.003, + "loss": 4.2982, + "step": 2847 + }, + { + "epoch": 0.02848, + "grad_norm": 0.8386178601836739, + "learning_rate": 0.003, + "loss": 4.248, + "step": 2848 + }, + { + "epoch": 0.02849, + "grad_norm": 0.764216257611563, + "learning_rate": 0.003, + "loss": 4.2302, + "step": 2849 + }, + { + "epoch": 0.0285, + "grad_norm": 0.8296984070312703, + "learning_rate": 0.003, + "loss": 4.2406, + "step": 2850 + }, + { + "epoch": 0.02851, + "grad_norm": 0.7660626969686088, + "learning_rate": 0.003, + "loss": 4.2727, + "step": 2851 + }, + { + "epoch": 0.02852, + "grad_norm": 0.6699896951076655, + "learning_rate": 0.003, + "loss": 4.2432, + "step": 2852 + }, + { + "epoch": 0.02853, + "grad_norm": 0.6448384261298653, + "learning_rate": 0.003, + "loss": 4.2061, + "step": 2853 + }, + { + "epoch": 0.02854, + "grad_norm": 0.5738318430229328, + "learning_rate": 0.003, + "loss": 4.2452, + "step": 2854 + }, + { + "epoch": 0.02855, + "grad_norm": 0.5220513862086724, + "learning_rate": 0.003, + "loss": 4.238, + "step": 2855 + }, + { + "epoch": 0.02856, + "grad_norm": 0.487730279931273, + "learning_rate": 0.003, + "loss": 4.236, + "step": 2856 + }, + { + "epoch": 0.02857, + "grad_norm": 0.5431060757677301, + "learning_rate": 0.003, + "loss": 4.2166, + "step": 2857 + }, + { + "epoch": 0.02858, + "grad_norm": 0.5661595941123131, + "learning_rate": 0.003, + "loss": 4.204, + "step": 2858 + }, + { + "epoch": 0.02859, + "grad_norm": 0.4484066606331403, + "learning_rate": 0.003, + "loss": 4.2049, + "step": 2859 + }, + { + "epoch": 0.0286, + "grad_norm": 0.4075396606562088, + "learning_rate": 0.003, + "loss": 4.2145, + "step": 2860 + }, + { + "epoch": 0.02861, + "grad_norm": 0.38814897890981537, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 2861 + }, + { + "epoch": 0.02862, + "grad_norm": 0.35763606720502106, + "learning_rate": 0.003, + "loss": 4.1992, + "step": 2862 + }, + { + "epoch": 0.02863, + "grad_norm": 0.33574165762339114, + "learning_rate": 0.003, + "loss": 4.2202, + "step": 2863 + }, + { + "epoch": 0.02864, + "grad_norm": 0.2931133637710789, + "learning_rate": 0.003, + "loss": 4.1911, + "step": 2864 + }, + { + "epoch": 0.02865, + "grad_norm": 0.2747295563994342, + "learning_rate": 0.003, + "loss": 4.2198, + "step": 2865 + }, + { + "epoch": 0.02866, + "grad_norm": 0.2878577462186846, + "learning_rate": 0.003, + "loss": 4.1997, + "step": 2866 + }, + { + "epoch": 0.02867, + "grad_norm": 0.30967589778912374, + "learning_rate": 0.003, + "loss": 4.1832, + "step": 2867 + }, + { + "epoch": 0.02868, + "grad_norm": 0.3317442746388715, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 2868 + }, + { + "epoch": 0.02869, + "grad_norm": 0.41983761578290535, + "learning_rate": 0.003, + "loss": 4.1814, + "step": 2869 + }, + { + "epoch": 0.0287, + "grad_norm": 0.5221052091755417, + "learning_rate": 0.003, + "loss": 4.2048, + "step": 2870 + }, + { + "epoch": 0.02871, + "grad_norm": 0.6596505309045855, + "learning_rate": 0.003, + "loss": 4.1929, + "step": 2871 + }, + { + "epoch": 0.02872, + "grad_norm": 0.6939156668954594, + "learning_rate": 0.003, + "loss": 4.2319, + "step": 2872 + }, + { + "epoch": 0.02873, + "grad_norm": 0.5889408095632228, + "learning_rate": 0.003, + "loss": 4.1969, + "step": 2873 + }, + { + "epoch": 0.02874, + "grad_norm": 0.5945812975211832, + "learning_rate": 0.003, + "loss": 4.2106, + "step": 2874 + }, + { + "epoch": 0.02875, + "grad_norm": 0.5997681923067583, + "learning_rate": 0.003, + "loss": 4.2173, + "step": 2875 + }, + { + "epoch": 0.02876, + "grad_norm": 0.5432684898721573, + "learning_rate": 0.003, + "loss": 4.1947, + "step": 2876 + }, + { + "epoch": 0.02877, + "grad_norm": 0.5211950136895529, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 2877 + }, + { + "epoch": 0.02878, + "grad_norm": 0.4787079571485456, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 2878 + }, + { + "epoch": 0.02879, + "grad_norm": 0.45005415921350717, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 2879 + }, + { + "epoch": 0.0288, + "grad_norm": 0.43431149375473005, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 2880 + }, + { + "epoch": 0.02881, + "grad_norm": 0.4696763789342238, + "learning_rate": 0.003, + "loss": 4.1826, + "step": 2881 + }, + { + "epoch": 0.02882, + "grad_norm": 0.5425923911480585, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 2882 + }, + { + "epoch": 0.02883, + "grad_norm": 0.4752679190892369, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 2883 + }, + { + "epoch": 0.02884, + "grad_norm": 0.4357972258734796, + "learning_rate": 0.003, + "loss": 4.2167, + "step": 2884 + }, + { + "epoch": 0.02885, + "grad_norm": 0.42135010007795737, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 2885 + }, + { + "epoch": 0.02886, + "grad_norm": 0.4753116215584479, + "learning_rate": 0.003, + "loss": 4.1904, + "step": 2886 + }, + { + "epoch": 0.02887, + "grad_norm": 0.4764421561290465, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 2887 + }, + { + "epoch": 0.02888, + "grad_norm": 0.4467467351995427, + "learning_rate": 0.003, + "loss": 4.1716, + "step": 2888 + }, + { + "epoch": 0.02889, + "grad_norm": 0.4870006343939259, + "learning_rate": 0.003, + "loss": 4.1625, + "step": 2889 + }, + { + "epoch": 0.0289, + "grad_norm": 0.4870936179251758, + "learning_rate": 0.003, + "loss": 4.1939, + "step": 2890 + }, + { + "epoch": 0.02891, + "grad_norm": 0.5804488605460525, + "learning_rate": 0.003, + "loss": 4.2122, + "step": 2891 + }, + { + "epoch": 0.02892, + "grad_norm": 0.6634466821898878, + "learning_rate": 0.003, + "loss": 4.17, + "step": 2892 + }, + { + "epoch": 0.02893, + "grad_norm": 0.5948043705095636, + "learning_rate": 0.003, + "loss": 4.1771, + "step": 2893 + }, + { + "epoch": 0.02894, + "grad_norm": 0.6043920015751632, + "learning_rate": 0.003, + "loss": 4.1843, + "step": 2894 + }, + { + "epoch": 0.02895, + "grad_norm": 0.6201059818349552, + "learning_rate": 0.003, + "loss": 4.2268, + "step": 2895 + }, + { + "epoch": 0.02896, + "grad_norm": 0.6991042225102296, + "learning_rate": 0.003, + "loss": 4.1857, + "step": 2896 + }, + { + "epoch": 0.02897, + "grad_norm": 0.7490225053396202, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 2897 + }, + { + "epoch": 0.02898, + "grad_norm": 0.7349838612699012, + "learning_rate": 0.003, + "loss": 4.2192, + "step": 2898 + }, + { + "epoch": 0.02899, + "grad_norm": 0.7005801384832842, + "learning_rate": 0.003, + "loss": 4.2014, + "step": 2899 + }, + { + "epoch": 0.029, + "grad_norm": 0.6982394096022596, + "learning_rate": 0.003, + "loss": 4.1924, + "step": 2900 + }, + { + "epoch": 0.02901, + "grad_norm": 0.7179332370923166, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 2901 + }, + { + "epoch": 0.02902, + "grad_norm": 0.6408206050082779, + "learning_rate": 0.003, + "loss": 4.2244, + "step": 2902 + }, + { + "epoch": 0.02903, + "grad_norm": 0.5794248721426231, + "learning_rate": 0.003, + "loss": 4.1817, + "step": 2903 + }, + { + "epoch": 0.02904, + "grad_norm": 0.6047919514195621, + "learning_rate": 0.003, + "loss": 4.1991, + "step": 2904 + }, + { + "epoch": 0.02905, + "grad_norm": 0.6712321741283004, + "learning_rate": 0.003, + "loss": 4.2179, + "step": 2905 + }, + { + "epoch": 0.02906, + "grad_norm": 0.7364218920572956, + "learning_rate": 0.003, + "loss": 4.2038, + "step": 2906 + }, + { + "epoch": 0.02907, + "grad_norm": 0.7800724882255463, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 2907 + }, + { + "epoch": 0.02908, + "grad_norm": 0.7350720938021648, + "learning_rate": 0.003, + "loss": 4.2049, + "step": 2908 + }, + { + "epoch": 0.02909, + "grad_norm": 0.6300086850554201, + "learning_rate": 0.003, + "loss": 4.1942, + "step": 2909 + }, + { + "epoch": 0.0291, + "grad_norm": 0.5419084839810532, + "learning_rate": 0.003, + "loss": 4.203, + "step": 2910 + }, + { + "epoch": 0.02911, + "grad_norm": 0.544140250655527, + "learning_rate": 0.003, + "loss": 4.1974, + "step": 2911 + }, + { + "epoch": 0.02912, + "grad_norm": 0.5832074121091988, + "learning_rate": 0.003, + "loss": 4.1707, + "step": 2912 + }, + { + "epoch": 0.02913, + "grad_norm": 0.46996544227828957, + "learning_rate": 0.003, + "loss": 4.1972, + "step": 2913 + }, + { + "epoch": 0.02914, + "grad_norm": 0.5185065785536244, + "learning_rate": 0.003, + "loss": 4.1876, + "step": 2914 + }, + { + "epoch": 0.02915, + "grad_norm": 0.5161041915363022, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 2915 + }, + { + "epoch": 0.02916, + "grad_norm": 0.5743632962294265, + "learning_rate": 0.003, + "loss": 4.1875, + "step": 2916 + }, + { + "epoch": 0.02917, + "grad_norm": 0.5787314482733064, + "learning_rate": 0.003, + "loss": 4.2184, + "step": 2917 + }, + { + "epoch": 0.02918, + "grad_norm": 0.6089620541454349, + "learning_rate": 0.003, + "loss": 4.1759, + "step": 2918 + }, + { + "epoch": 0.02919, + "grad_norm": 0.5982379349158571, + "learning_rate": 0.003, + "loss": 4.219, + "step": 2919 + }, + { + "epoch": 0.0292, + "grad_norm": 0.5540097529270788, + "learning_rate": 0.003, + "loss": 4.156, + "step": 2920 + }, + { + "epoch": 0.02921, + "grad_norm": 0.5487736094530016, + "learning_rate": 0.003, + "loss": 4.1965, + "step": 2921 + }, + { + "epoch": 0.02922, + "grad_norm": 0.46820953528164894, + "learning_rate": 0.003, + "loss": 4.1944, + "step": 2922 + }, + { + "epoch": 0.02923, + "grad_norm": 0.3911917353150244, + "learning_rate": 0.003, + "loss": 4.1613, + "step": 2923 + }, + { + "epoch": 0.02924, + "grad_norm": 0.3519948559374971, + "learning_rate": 0.003, + "loss": 4.1948, + "step": 2924 + }, + { + "epoch": 0.02925, + "grad_norm": 0.37937109757353865, + "learning_rate": 0.003, + "loss": 4.1979, + "step": 2925 + }, + { + "epoch": 0.02926, + "grad_norm": 0.4065842767583339, + "learning_rate": 0.003, + "loss": 4.1756, + "step": 2926 + }, + { + "epoch": 0.02927, + "grad_norm": 0.475304983845517, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 2927 + }, + { + "epoch": 0.02928, + "grad_norm": 0.6734308844569649, + "learning_rate": 0.003, + "loss": 4.2095, + "step": 2928 + }, + { + "epoch": 0.02929, + "grad_norm": 0.8546919556478062, + "learning_rate": 0.003, + "loss": 4.2048, + "step": 2929 + }, + { + "epoch": 0.0293, + "grad_norm": 0.8348963936929714, + "learning_rate": 0.003, + "loss": 4.2035, + "step": 2930 + }, + { + "epoch": 0.02931, + "grad_norm": 0.7454052973981365, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 2931 + }, + { + "epoch": 0.02932, + "grad_norm": 0.7467998755597431, + "learning_rate": 0.003, + "loss": 4.1986, + "step": 2932 + }, + { + "epoch": 0.02933, + "grad_norm": 0.8069014916887333, + "learning_rate": 0.003, + "loss": 4.2246, + "step": 2933 + }, + { + "epoch": 0.02934, + "grad_norm": 0.9491281331586824, + "learning_rate": 0.003, + "loss": 4.2274, + "step": 2934 + }, + { + "epoch": 0.02935, + "grad_norm": 0.9112675124388164, + "learning_rate": 0.003, + "loss": 4.222, + "step": 2935 + }, + { + "epoch": 0.02936, + "grad_norm": 0.8519647588347719, + "learning_rate": 0.003, + "loss": 4.2389, + "step": 2936 + }, + { + "epoch": 0.02937, + "grad_norm": 0.9555587969650123, + "learning_rate": 0.003, + "loss": 4.2301, + "step": 2937 + }, + { + "epoch": 0.02938, + "grad_norm": 0.9150311365393384, + "learning_rate": 0.003, + "loss": 4.2464, + "step": 2938 + }, + { + "epoch": 0.02939, + "grad_norm": 1.0737440013454487, + "learning_rate": 0.003, + "loss": 4.2526, + "step": 2939 + }, + { + "epoch": 0.0294, + "grad_norm": 1.0147261563008552, + "learning_rate": 0.003, + "loss": 4.2801, + "step": 2940 + }, + { + "epoch": 0.02941, + "grad_norm": 1.022661673211658, + "learning_rate": 0.003, + "loss": 4.2879, + "step": 2941 + }, + { + "epoch": 0.02942, + "grad_norm": 1.0444970801430784, + "learning_rate": 0.003, + "loss": 4.2748, + "step": 2942 + }, + { + "epoch": 0.02943, + "grad_norm": 0.8455394657906586, + "learning_rate": 0.003, + "loss": 4.2533, + "step": 2943 + }, + { + "epoch": 0.02944, + "grad_norm": 0.7628339588396233, + "learning_rate": 0.003, + "loss": 4.2739, + "step": 2944 + }, + { + "epoch": 0.02945, + "grad_norm": 0.9388418061065686, + "learning_rate": 0.003, + "loss": 4.2636, + "step": 2945 + }, + { + "epoch": 0.02946, + "grad_norm": 0.9857547654390597, + "learning_rate": 0.003, + "loss": 4.2709, + "step": 2946 + }, + { + "epoch": 0.02947, + "grad_norm": 0.9078404718745537, + "learning_rate": 0.003, + "loss": 4.301, + "step": 2947 + }, + { + "epoch": 0.02948, + "grad_norm": 0.9520758925981907, + "learning_rate": 0.003, + "loss": 4.2798, + "step": 2948 + }, + { + "epoch": 0.02949, + "grad_norm": 1.0406544574219045, + "learning_rate": 0.003, + "loss": 4.2581, + "step": 2949 + }, + { + "epoch": 0.0295, + "grad_norm": 0.9292248813352704, + "learning_rate": 0.003, + "loss": 4.2783, + "step": 2950 + }, + { + "epoch": 0.02951, + "grad_norm": 0.8590093083304559, + "learning_rate": 0.003, + "loss": 4.2285, + "step": 2951 + }, + { + "epoch": 0.02952, + "grad_norm": 0.6554290928742069, + "learning_rate": 0.003, + "loss": 4.2469, + "step": 2952 + }, + { + "epoch": 0.02953, + "grad_norm": 0.8266033050494169, + "learning_rate": 0.003, + "loss": 4.2603, + "step": 2953 + }, + { + "epoch": 0.02954, + "grad_norm": 0.9710367944205619, + "learning_rate": 0.003, + "loss": 4.3127, + "step": 2954 + }, + { + "epoch": 0.02955, + "grad_norm": 0.9091757097495087, + "learning_rate": 0.003, + "loss": 4.2852, + "step": 2955 + }, + { + "epoch": 0.02956, + "grad_norm": 0.7400776132762851, + "learning_rate": 0.003, + "loss": 4.2656, + "step": 2956 + }, + { + "epoch": 0.02957, + "grad_norm": 0.5166853898536266, + "learning_rate": 0.003, + "loss": 4.2638, + "step": 2957 + }, + { + "epoch": 0.02958, + "grad_norm": 0.49710477556743804, + "learning_rate": 0.003, + "loss": 4.276, + "step": 2958 + }, + { + "epoch": 0.02959, + "grad_norm": 0.4499644234512779, + "learning_rate": 0.003, + "loss": 4.2565, + "step": 2959 + }, + { + "epoch": 0.0296, + "grad_norm": 0.4150995513488321, + "learning_rate": 0.003, + "loss": 4.2574, + "step": 2960 + }, + { + "epoch": 0.02961, + "grad_norm": 0.4170220343004301, + "learning_rate": 0.003, + "loss": 4.2257, + "step": 2961 + }, + { + "epoch": 0.02962, + "grad_norm": 0.4582297929571511, + "learning_rate": 0.003, + "loss": 4.2309, + "step": 2962 + }, + { + "epoch": 0.02963, + "grad_norm": 0.4661209765546486, + "learning_rate": 0.003, + "loss": 4.2018, + "step": 2963 + }, + { + "epoch": 0.02964, + "grad_norm": 0.40893029477200127, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 2964 + }, + { + "epoch": 0.02965, + "grad_norm": 0.30952309242863746, + "learning_rate": 0.003, + "loss": 4.2259, + "step": 2965 + }, + { + "epoch": 0.02966, + "grad_norm": 0.2925470804974261, + "learning_rate": 0.003, + "loss": 4.2331, + "step": 2966 + }, + { + "epoch": 0.02967, + "grad_norm": 0.30877812791584447, + "learning_rate": 0.003, + "loss": 4.2188, + "step": 2967 + }, + { + "epoch": 0.02968, + "grad_norm": 0.2982831000211414, + "learning_rate": 0.003, + "loss": 4.2128, + "step": 2968 + }, + { + "epoch": 0.02969, + "grad_norm": 0.31941995962945985, + "learning_rate": 0.003, + "loss": 4.207, + "step": 2969 + }, + { + "epoch": 0.0297, + "grad_norm": 0.4198384435174913, + "learning_rate": 0.003, + "loss": 4.1928, + "step": 2970 + }, + { + "epoch": 0.02971, + "grad_norm": 0.4895528117346824, + "learning_rate": 0.003, + "loss": 4.2113, + "step": 2971 + }, + { + "epoch": 0.02972, + "grad_norm": 0.46403453609324874, + "learning_rate": 0.003, + "loss": 4.1791, + "step": 2972 + }, + { + "epoch": 0.02973, + "grad_norm": 0.3996362985623369, + "learning_rate": 0.003, + "loss": 4.1612, + "step": 2973 + }, + { + "epoch": 0.02974, + "grad_norm": 0.3669370591415718, + "learning_rate": 0.003, + "loss": 4.197, + "step": 2974 + }, + { + "epoch": 0.02975, + "grad_norm": 0.3727347214399743, + "learning_rate": 0.003, + "loss": 4.1805, + "step": 2975 + }, + { + "epoch": 0.02976, + "grad_norm": 0.40978542004123153, + "learning_rate": 0.003, + "loss": 4.1911, + "step": 2976 + }, + { + "epoch": 0.02977, + "grad_norm": 0.4390067885032759, + "learning_rate": 0.003, + "loss": 4.178, + "step": 2977 + }, + { + "epoch": 0.02978, + "grad_norm": 0.38270215912996663, + "learning_rate": 0.003, + "loss": 4.1974, + "step": 2978 + }, + { + "epoch": 0.02979, + "grad_norm": 0.30944659309862094, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 2979 + }, + { + "epoch": 0.0298, + "grad_norm": 0.3395231398632022, + "learning_rate": 0.003, + "loss": 4.181, + "step": 2980 + }, + { + "epoch": 0.02981, + "grad_norm": 0.34592707909470244, + "learning_rate": 0.003, + "loss": 4.1907, + "step": 2981 + }, + { + "epoch": 0.02982, + "grad_norm": 0.38035052902890665, + "learning_rate": 0.003, + "loss": 4.1864, + "step": 2982 + }, + { + "epoch": 0.02983, + "grad_norm": 0.5707754810875516, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 2983 + }, + { + "epoch": 0.02984, + "grad_norm": 0.8429776036729895, + "learning_rate": 0.003, + "loss": 4.1934, + "step": 2984 + }, + { + "epoch": 0.02985, + "grad_norm": 0.9426647233611747, + "learning_rate": 0.003, + "loss": 4.2337, + "step": 2985 + }, + { + "epoch": 0.02986, + "grad_norm": 0.5829174432301835, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 2986 + }, + { + "epoch": 0.02987, + "grad_norm": 0.5797297658595812, + "learning_rate": 0.003, + "loss": 4.1761, + "step": 2987 + }, + { + "epoch": 0.02988, + "grad_norm": 0.5667463117618501, + "learning_rate": 0.003, + "loss": 4.1884, + "step": 2988 + }, + { + "epoch": 0.02989, + "grad_norm": 0.46700979940535653, + "learning_rate": 0.003, + "loss": 4.196, + "step": 2989 + }, + { + "epoch": 0.0299, + "grad_norm": 0.47425843632840237, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 2990 + }, + { + "epoch": 0.02991, + "grad_norm": 0.4585869479501072, + "learning_rate": 0.003, + "loss": 4.2064, + "step": 2991 + }, + { + "epoch": 0.02992, + "grad_norm": 0.44091690071775774, + "learning_rate": 0.003, + "loss": 4.2006, + "step": 2992 + }, + { + "epoch": 0.02993, + "grad_norm": 0.4427631334764928, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 2993 + }, + { + "epoch": 0.02994, + "grad_norm": 0.3680333066458911, + "learning_rate": 0.003, + "loss": 4.1784, + "step": 2994 + }, + { + "epoch": 0.02995, + "grad_norm": 0.3313732790736943, + "learning_rate": 0.003, + "loss": 4.213, + "step": 2995 + }, + { + "epoch": 0.02996, + "grad_norm": 0.33322944772224145, + "learning_rate": 0.003, + "loss": 4.1998, + "step": 2996 + }, + { + "epoch": 0.02997, + "grad_norm": 0.35383706371010143, + "learning_rate": 0.003, + "loss": 4.1619, + "step": 2997 + }, + { + "epoch": 0.02998, + "grad_norm": 0.3144018568097704, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 2998 + }, + { + "epoch": 0.02999, + "grad_norm": 0.2707448134611539, + "learning_rate": 0.003, + "loss": 4.1532, + "step": 2999 + }, + { + "epoch": 0.03, + "grad_norm": 0.31453822473769166, + "learning_rate": 0.003, + "loss": 4.1985, + "step": 3000 + }, + { + "epoch": 0.03001, + "grad_norm": 0.3575380630906315, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 3001 + }, + { + "epoch": 0.03002, + "grad_norm": 0.37987123508738174, + "learning_rate": 0.003, + "loss": 4.1801, + "step": 3002 + }, + { + "epoch": 0.03003, + "grad_norm": 0.43883399019288905, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 3003 + }, + { + "epoch": 0.03004, + "grad_norm": 0.4354037002724127, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 3004 + }, + { + "epoch": 0.03005, + "grad_norm": 0.44090216643634306, + "learning_rate": 0.003, + "loss": 4.1901, + "step": 3005 + }, + { + "epoch": 0.03006, + "grad_norm": 0.4541062050925091, + "learning_rate": 0.003, + "loss": 4.1847, + "step": 3006 + }, + { + "epoch": 0.03007, + "grad_norm": 0.5413252745038201, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 3007 + }, + { + "epoch": 0.03008, + "grad_norm": 0.5914463578547696, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 3008 + }, + { + "epoch": 0.03009, + "grad_norm": 0.6057328540964105, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 3009 + }, + { + "epoch": 0.0301, + "grad_norm": 0.6785032290357721, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 3010 + }, + { + "epoch": 0.03011, + "grad_norm": 0.7337865546330415, + "learning_rate": 0.003, + "loss": 4.1764, + "step": 3011 + }, + { + "epoch": 0.03012, + "grad_norm": 0.7469304524271401, + "learning_rate": 0.003, + "loss": 4.1847, + "step": 3012 + }, + { + "epoch": 0.03013, + "grad_norm": 0.6647601453471846, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 3013 + }, + { + "epoch": 0.03014, + "grad_norm": 0.6507126425794799, + "learning_rate": 0.003, + "loss": 4.1729, + "step": 3014 + }, + { + "epoch": 0.03015, + "grad_norm": 0.6380413738814859, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 3015 + }, + { + "epoch": 0.03016, + "grad_norm": 0.6182015477535148, + "learning_rate": 0.003, + "loss": 4.1815, + "step": 3016 + }, + { + "epoch": 0.03017, + "grad_norm": 0.586737080148615, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 3017 + }, + { + "epoch": 0.03018, + "grad_norm": 0.5605400072157327, + "learning_rate": 0.003, + "loss": 4.1704, + "step": 3018 + }, + { + "epoch": 0.03019, + "grad_norm": 0.5243331941511797, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 3019 + }, + { + "epoch": 0.0302, + "grad_norm": 0.49926755386002525, + "learning_rate": 0.003, + "loss": 4.1784, + "step": 3020 + }, + { + "epoch": 0.03021, + "grad_norm": 0.5365217273161322, + "learning_rate": 0.003, + "loss": 4.2017, + "step": 3021 + }, + { + "epoch": 0.03022, + "grad_norm": 0.49962846601806954, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 3022 + }, + { + "epoch": 0.03023, + "grad_norm": 0.44101225502614944, + "learning_rate": 0.003, + "loss": 4.2091, + "step": 3023 + }, + { + "epoch": 0.03024, + "grad_norm": 0.4478587533564886, + "learning_rate": 0.003, + "loss": 4.2023, + "step": 3024 + }, + { + "epoch": 0.03025, + "grad_norm": 0.36914153729322, + "learning_rate": 0.003, + "loss": 4.1809, + "step": 3025 + }, + { + "epoch": 0.03026, + "grad_norm": 0.4189775215567249, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 3026 + }, + { + "epoch": 0.03027, + "grad_norm": 0.4812644816912879, + "learning_rate": 0.003, + "loss": 4.1781, + "step": 3027 + }, + { + "epoch": 0.03028, + "grad_norm": 0.5896483265711634, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 3028 + }, + { + "epoch": 0.03029, + "grad_norm": 0.7727132136494914, + "learning_rate": 0.003, + "loss": 4.1946, + "step": 3029 + }, + { + "epoch": 0.0303, + "grad_norm": 0.8225314550231486, + "learning_rate": 0.003, + "loss": 4.184, + "step": 3030 + }, + { + "epoch": 0.03031, + "grad_norm": 0.6946644336533113, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 3031 + }, + { + "epoch": 0.03032, + "grad_norm": 0.7204147056292812, + "learning_rate": 0.003, + "loss": 4.192, + "step": 3032 + }, + { + "epoch": 0.03033, + "grad_norm": 0.6929276241309884, + "learning_rate": 0.003, + "loss": 4.1876, + "step": 3033 + }, + { + "epoch": 0.03034, + "grad_norm": 0.6811610794872539, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 3034 + }, + { + "epoch": 0.03035, + "grad_norm": 0.6796984084846434, + "learning_rate": 0.003, + "loss": 4.2232, + "step": 3035 + }, + { + "epoch": 0.03036, + "grad_norm": 0.6389991482634575, + "learning_rate": 0.003, + "loss": 4.1949, + "step": 3036 + }, + { + "epoch": 0.03037, + "grad_norm": 0.7107042944049744, + "learning_rate": 0.003, + "loss": 4.1819, + "step": 3037 + }, + { + "epoch": 0.03038, + "grad_norm": 0.7261414033877567, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 3038 + }, + { + "epoch": 0.03039, + "grad_norm": 0.6927833025504285, + "learning_rate": 0.003, + "loss": 4.1894, + "step": 3039 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6355129937975011, + "learning_rate": 0.003, + "loss": 4.1938, + "step": 3040 + }, + { + "epoch": 0.03041, + "grad_norm": 0.5305188998243404, + "learning_rate": 0.003, + "loss": 4.1953, + "step": 3041 + }, + { + "epoch": 0.03042, + "grad_norm": 0.5277486902643708, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 3042 + }, + { + "epoch": 0.03043, + "grad_norm": 0.48072198596584337, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3043 + }, + { + "epoch": 0.03044, + "grad_norm": 0.5305619681037871, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3044 + }, + { + "epoch": 0.03045, + "grad_norm": 0.6019249816686142, + "learning_rate": 0.003, + "loss": 4.1781, + "step": 3045 + }, + { + "epoch": 0.03046, + "grad_norm": 0.6501665014846743, + "learning_rate": 0.003, + "loss": 4.1924, + "step": 3046 + }, + { + "epoch": 0.03047, + "grad_norm": 0.6512554720210705, + "learning_rate": 0.003, + "loss": 4.1981, + "step": 3047 + }, + { + "epoch": 0.03048, + "grad_norm": 0.6687647551352641, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 3048 + }, + { + "epoch": 0.03049, + "grad_norm": 0.6495592187211907, + "learning_rate": 0.003, + "loss": 4.1692, + "step": 3049 + }, + { + "epoch": 0.0305, + "grad_norm": 0.5742671707682592, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 3050 + }, + { + "epoch": 0.03051, + "grad_norm": 0.5413644439530771, + "learning_rate": 0.003, + "loss": 4.1904, + "step": 3051 + }, + { + "epoch": 0.03052, + "grad_norm": 0.45841707001486987, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 3052 + }, + { + "epoch": 0.03053, + "grad_norm": 0.41417961125519964, + "learning_rate": 0.003, + "loss": 4.193, + "step": 3053 + }, + { + "epoch": 0.03054, + "grad_norm": 0.43823010289384967, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 3054 + }, + { + "epoch": 0.03055, + "grad_norm": 0.5090646995969303, + "learning_rate": 0.003, + "loss": 4.1764, + "step": 3055 + }, + { + "epoch": 0.03056, + "grad_norm": 0.5373409114504033, + "learning_rate": 0.003, + "loss": 4.1895, + "step": 3056 + }, + { + "epoch": 0.03057, + "grad_norm": 0.6080154895830046, + "learning_rate": 0.003, + "loss": 4.1889, + "step": 3057 + }, + { + "epoch": 0.03058, + "grad_norm": 0.6540368964023406, + "learning_rate": 0.003, + "loss": 4.1648, + "step": 3058 + }, + { + "epoch": 0.03059, + "grad_norm": 0.5788424826468096, + "learning_rate": 0.003, + "loss": 4.1748, + "step": 3059 + }, + { + "epoch": 0.0306, + "grad_norm": 0.4988555874907532, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 3060 + }, + { + "epoch": 0.03061, + "grad_norm": 0.4878251974397899, + "learning_rate": 0.003, + "loss": 4.1841, + "step": 3061 + }, + { + "epoch": 0.03062, + "grad_norm": 0.5122227250314405, + "learning_rate": 0.003, + "loss": 4.2073, + "step": 3062 + }, + { + "epoch": 0.03063, + "grad_norm": 0.48008913969979206, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 3063 + }, + { + "epoch": 0.03064, + "grad_norm": 0.46426896895718006, + "learning_rate": 0.003, + "loss": 4.1818, + "step": 3064 + }, + { + "epoch": 0.03065, + "grad_norm": 0.4213766940672057, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 3065 + }, + { + "epoch": 0.03066, + "grad_norm": 0.3680402555018932, + "learning_rate": 0.003, + "loss": 4.1908, + "step": 3066 + }, + { + "epoch": 0.03067, + "grad_norm": 0.3709473284252339, + "learning_rate": 0.003, + "loss": 4.194, + "step": 3067 + }, + { + "epoch": 0.03068, + "grad_norm": 0.3892404694228725, + "learning_rate": 0.003, + "loss": 4.1654, + "step": 3068 + }, + { + "epoch": 0.03069, + "grad_norm": 0.421709346396654, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 3069 + }, + { + "epoch": 0.0307, + "grad_norm": 0.452536722715443, + "learning_rate": 0.003, + "loss": 4.1628, + "step": 3070 + }, + { + "epoch": 0.03071, + "grad_norm": 0.43180254995245465, + "learning_rate": 0.003, + "loss": 4.1592, + "step": 3071 + }, + { + "epoch": 0.03072, + "grad_norm": 0.4894410650771712, + "learning_rate": 0.003, + "loss": 4.167, + "step": 3072 + }, + { + "epoch": 0.03073, + "grad_norm": 0.5403720255329184, + "learning_rate": 0.003, + "loss": 4.1662, + "step": 3073 + }, + { + "epoch": 0.03074, + "grad_norm": 0.5846698892510205, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 3074 + }, + { + "epoch": 0.03075, + "grad_norm": 0.6574319904083324, + "learning_rate": 0.003, + "loss": 4.1977, + "step": 3075 + }, + { + "epoch": 0.03076, + "grad_norm": 0.6619917640445645, + "learning_rate": 0.003, + "loss": 4.1936, + "step": 3076 + }, + { + "epoch": 0.03077, + "grad_norm": 0.6043736206778582, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 3077 + }, + { + "epoch": 0.03078, + "grad_norm": 0.6204154957439805, + "learning_rate": 0.003, + "loss": 4.1729, + "step": 3078 + }, + { + "epoch": 0.03079, + "grad_norm": 0.5593470356338103, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 3079 + }, + { + "epoch": 0.0308, + "grad_norm": 0.5726380712603409, + "learning_rate": 0.003, + "loss": 4.1884, + "step": 3080 + }, + { + "epoch": 0.03081, + "grad_norm": 0.6460072493113082, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 3081 + }, + { + "epoch": 0.03082, + "grad_norm": 0.6213962184440697, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 3082 + }, + { + "epoch": 0.03083, + "grad_norm": 0.7067435150796765, + "learning_rate": 0.003, + "loss": 4.1723, + "step": 3083 + }, + { + "epoch": 0.03084, + "grad_norm": 0.7686911194186954, + "learning_rate": 0.003, + "loss": 4.1952, + "step": 3084 + }, + { + "epoch": 0.03085, + "grad_norm": 0.7715187358287607, + "learning_rate": 0.003, + "loss": 4.193, + "step": 3085 + }, + { + "epoch": 0.03086, + "grad_norm": 0.8253824375938966, + "learning_rate": 0.003, + "loss": 4.2186, + "step": 3086 + }, + { + "epoch": 0.03087, + "grad_norm": 0.7572823291901524, + "learning_rate": 0.003, + "loss": 4.1941, + "step": 3087 + }, + { + "epoch": 0.03088, + "grad_norm": 0.6756116771892228, + "learning_rate": 0.003, + "loss": 4.209, + "step": 3088 + }, + { + "epoch": 0.03089, + "grad_norm": 0.6820329410441291, + "learning_rate": 0.003, + "loss": 4.1733, + "step": 3089 + }, + { + "epoch": 0.0309, + "grad_norm": 0.7401927831575114, + "learning_rate": 0.003, + "loss": 4.1803, + "step": 3090 + }, + { + "epoch": 0.03091, + "grad_norm": 0.6394165992316918, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 3091 + }, + { + "epoch": 0.03092, + "grad_norm": 0.6572468374220727, + "learning_rate": 0.003, + "loss": 4.1725, + "step": 3092 + }, + { + "epoch": 0.03093, + "grad_norm": 0.6935160519017697, + "learning_rate": 0.003, + "loss": 4.204, + "step": 3093 + }, + { + "epoch": 0.03094, + "grad_norm": 0.6716436178909779, + "learning_rate": 0.003, + "loss": 4.188, + "step": 3094 + }, + { + "epoch": 0.03095, + "grad_norm": 0.6591966230230418, + "learning_rate": 0.003, + "loss": 4.203, + "step": 3095 + }, + { + "epoch": 0.03096, + "grad_norm": 0.5959190368353036, + "learning_rate": 0.003, + "loss": 4.1776, + "step": 3096 + }, + { + "epoch": 0.03097, + "grad_norm": 0.5866701878437197, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 3097 + }, + { + "epoch": 0.03098, + "grad_norm": 0.6431690827463121, + "learning_rate": 0.003, + "loss": 4.1981, + "step": 3098 + }, + { + "epoch": 0.03099, + "grad_norm": 0.6249835235277311, + "learning_rate": 0.003, + "loss": 4.1902, + "step": 3099 + }, + { + "epoch": 0.031, + "grad_norm": 0.6337483657376243, + "learning_rate": 0.003, + "loss": 4.1855, + "step": 3100 + }, + { + "epoch": 0.03101, + "grad_norm": 0.6496496012737066, + "learning_rate": 0.003, + "loss": 4.1777, + "step": 3101 + }, + { + "epoch": 0.03102, + "grad_norm": 0.6229765708037603, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3102 + }, + { + "epoch": 0.03103, + "grad_norm": 0.6156795563561019, + "learning_rate": 0.003, + "loss": 4.1669, + "step": 3103 + }, + { + "epoch": 0.03104, + "grad_norm": 0.5787370390357155, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 3104 + }, + { + "epoch": 0.03105, + "grad_norm": 0.46609179090900144, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 3105 + }, + { + "epoch": 0.03106, + "grad_norm": 0.39336258398622215, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 3106 + }, + { + "epoch": 0.03107, + "grad_norm": 0.4151072460907146, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 3107 + }, + { + "epoch": 0.03108, + "grad_norm": 0.3556637596181332, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 3108 + }, + { + "epoch": 0.03109, + "grad_norm": 0.3892938905312542, + "learning_rate": 0.003, + "loss": 4.159, + "step": 3109 + }, + { + "epoch": 0.0311, + "grad_norm": 0.4475819635965094, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 3110 + }, + { + "epoch": 0.03111, + "grad_norm": 0.48864194527310445, + "learning_rate": 0.003, + "loss": 4.1578, + "step": 3111 + }, + { + "epoch": 0.03112, + "grad_norm": 0.5382631741272804, + "learning_rate": 0.003, + "loss": 4.1803, + "step": 3112 + }, + { + "epoch": 0.03113, + "grad_norm": 0.5134933039857796, + "learning_rate": 0.003, + "loss": 4.1686, + "step": 3113 + }, + { + "epoch": 0.03114, + "grad_norm": 0.45531644644558, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 3114 + }, + { + "epoch": 0.03115, + "grad_norm": 0.409821580807442, + "learning_rate": 0.003, + "loss": 4.1707, + "step": 3115 + }, + { + "epoch": 0.03116, + "grad_norm": 0.40691251051535354, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 3116 + }, + { + "epoch": 0.03117, + "grad_norm": 0.41357167905284664, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 3117 + }, + { + "epoch": 0.03118, + "grad_norm": 0.4338761124077823, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 3118 + }, + { + "epoch": 0.03119, + "grad_norm": 0.4549147699459294, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 3119 + }, + { + "epoch": 0.0312, + "grad_norm": 0.5032285577701964, + "learning_rate": 0.003, + "loss": 4.1839, + "step": 3120 + }, + { + "epoch": 0.03121, + "grad_norm": 0.5982541074347542, + "learning_rate": 0.003, + "loss": 4.1639, + "step": 3121 + }, + { + "epoch": 0.03122, + "grad_norm": 0.7093524896825744, + "learning_rate": 0.003, + "loss": 4.1884, + "step": 3122 + }, + { + "epoch": 0.03123, + "grad_norm": 0.6935478647730015, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 3123 + }, + { + "epoch": 0.03124, + "grad_norm": 0.6414330115408258, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 3124 + }, + { + "epoch": 0.03125, + "grad_norm": 0.6053563143256356, + "learning_rate": 0.003, + "loss": 4.1675, + "step": 3125 + }, + { + "epoch": 0.03126, + "grad_norm": 0.5853887861604886, + "learning_rate": 0.003, + "loss": 4.1684, + "step": 3126 + }, + { + "epoch": 0.03127, + "grad_norm": 0.6449491393449573, + "learning_rate": 0.003, + "loss": 4.1802, + "step": 3127 + }, + { + "epoch": 0.03128, + "grad_norm": 0.6502029339707546, + "learning_rate": 0.003, + "loss": 4.1706, + "step": 3128 + }, + { + "epoch": 0.03129, + "grad_norm": 0.7355040262252988, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 3129 + }, + { + "epoch": 0.0313, + "grad_norm": 0.7004678988862265, + "learning_rate": 0.003, + "loss": 4.1982, + "step": 3130 + }, + { + "epoch": 0.03131, + "grad_norm": 0.545773725439425, + "learning_rate": 0.003, + "loss": 4.1723, + "step": 3131 + }, + { + "epoch": 0.03132, + "grad_norm": 0.575890880552423, + "learning_rate": 0.003, + "loss": 4.1945, + "step": 3132 + }, + { + "epoch": 0.03133, + "grad_norm": 0.6235572169219774, + "learning_rate": 0.003, + "loss": 4.1876, + "step": 3133 + }, + { + "epoch": 0.03134, + "grad_norm": 0.6099749117272442, + "learning_rate": 0.003, + "loss": 4.1881, + "step": 3134 + }, + { + "epoch": 0.03135, + "grad_norm": 0.6941243018172416, + "learning_rate": 0.003, + "loss": 4.1573, + "step": 3135 + }, + { + "epoch": 0.03136, + "grad_norm": 0.70121888828651, + "learning_rate": 0.003, + "loss": 4.1772, + "step": 3136 + }, + { + "epoch": 0.03137, + "grad_norm": 0.7085003488749844, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 3137 + }, + { + "epoch": 0.03138, + "grad_norm": 0.6535643891764703, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3138 + }, + { + "epoch": 0.03139, + "grad_norm": 0.6308297801361256, + "learning_rate": 0.003, + "loss": 4.1826, + "step": 3139 + }, + { + "epoch": 0.0314, + "grad_norm": 0.5309446473531373, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 3140 + }, + { + "epoch": 0.03141, + "grad_norm": 0.5693804160477011, + "learning_rate": 0.003, + "loss": 4.2097, + "step": 3141 + }, + { + "epoch": 0.03142, + "grad_norm": 0.6211997698889902, + "learning_rate": 0.003, + "loss": 4.1899, + "step": 3142 + }, + { + "epoch": 0.03143, + "grad_norm": 0.7073668422617321, + "learning_rate": 0.003, + "loss": 4.1805, + "step": 3143 + }, + { + "epoch": 0.03144, + "grad_norm": 0.7851704146814469, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 3144 + }, + { + "epoch": 0.03145, + "grad_norm": 0.7954348048069579, + "learning_rate": 0.003, + "loss": 4.1863, + "step": 3145 + }, + { + "epoch": 0.03146, + "grad_norm": 0.7034536386154087, + "learning_rate": 0.003, + "loss": 4.1929, + "step": 3146 + }, + { + "epoch": 0.03147, + "grad_norm": 0.6509274147125733, + "learning_rate": 0.003, + "loss": 4.1902, + "step": 3147 + }, + { + "epoch": 0.03148, + "grad_norm": 0.5546841496464855, + "learning_rate": 0.003, + "loss": 4.1877, + "step": 3148 + }, + { + "epoch": 0.03149, + "grad_norm": 0.46984021974514056, + "learning_rate": 0.003, + "loss": 4.1921, + "step": 3149 + }, + { + "epoch": 0.0315, + "grad_norm": 0.47872296043934637, + "learning_rate": 0.003, + "loss": 4.1773, + "step": 3150 + }, + { + "epoch": 0.03151, + "grad_norm": 0.4383095460015223, + "learning_rate": 0.003, + "loss": 4.1832, + "step": 3151 + }, + { + "epoch": 0.03152, + "grad_norm": 0.5147623540970385, + "learning_rate": 0.003, + "loss": 4.1863, + "step": 3152 + }, + { + "epoch": 0.03153, + "grad_norm": 0.5162583950949946, + "learning_rate": 0.003, + "loss": 4.1959, + "step": 3153 + }, + { + "epoch": 0.03154, + "grad_norm": 0.4461000862295419, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 3154 + }, + { + "epoch": 0.03155, + "grad_norm": 0.4554669455184566, + "learning_rate": 0.003, + "loss": 4.1734, + "step": 3155 + }, + { + "epoch": 0.03156, + "grad_norm": 0.5334766080110984, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 3156 + }, + { + "epoch": 0.03157, + "grad_norm": 0.5732423454584378, + "learning_rate": 0.003, + "loss": 4.187, + "step": 3157 + }, + { + "epoch": 0.03158, + "grad_norm": 0.4974411380590613, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 3158 + }, + { + "epoch": 0.03159, + "grad_norm": 0.44203642501633, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 3159 + }, + { + "epoch": 0.0316, + "grad_norm": 0.523140366094698, + "learning_rate": 0.003, + "loss": 4.1866, + "step": 3160 + }, + { + "epoch": 0.03161, + "grad_norm": 0.5162200569611098, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 3161 + }, + { + "epoch": 0.03162, + "grad_norm": 0.5139469343607818, + "learning_rate": 0.003, + "loss": 4.197, + "step": 3162 + }, + { + "epoch": 0.03163, + "grad_norm": 0.4948430436476383, + "learning_rate": 0.003, + "loss": 4.1778, + "step": 3163 + }, + { + "epoch": 0.03164, + "grad_norm": 0.5669314442812998, + "learning_rate": 0.003, + "loss": 4.1763, + "step": 3164 + }, + { + "epoch": 0.03165, + "grad_norm": 0.6170132015572393, + "learning_rate": 0.003, + "loss": 4.151, + "step": 3165 + }, + { + "epoch": 0.03166, + "grad_norm": 0.7007311191133752, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 3166 + }, + { + "epoch": 0.03167, + "grad_norm": 0.8595364667795417, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3167 + }, + { + "epoch": 0.03168, + "grad_norm": 0.7655669847205984, + "learning_rate": 0.003, + "loss": 4.1955, + "step": 3168 + }, + { + "epoch": 0.03169, + "grad_norm": 0.6713498892330952, + "learning_rate": 0.003, + "loss": 4.2051, + "step": 3169 + }, + { + "epoch": 0.0317, + "grad_norm": 0.677153969130682, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 3170 + }, + { + "epoch": 0.03171, + "grad_norm": 0.7390285282020239, + "learning_rate": 0.003, + "loss": 4.2052, + "step": 3171 + }, + { + "epoch": 0.03172, + "grad_norm": 0.7569313720684826, + "learning_rate": 0.003, + "loss": 4.1948, + "step": 3172 + }, + { + "epoch": 0.03173, + "grad_norm": 0.6724394145714953, + "learning_rate": 0.003, + "loss": 4.1656, + "step": 3173 + }, + { + "epoch": 0.03174, + "grad_norm": 0.6566446253256072, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 3174 + }, + { + "epoch": 0.03175, + "grad_norm": 0.5308391109404651, + "learning_rate": 0.003, + "loss": 4.2005, + "step": 3175 + }, + { + "epoch": 0.03176, + "grad_norm": 0.522270282662199, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 3176 + }, + { + "epoch": 0.03177, + "grad_norm": 0.5118875159250269, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 3177 + }, + { + "epoch": 0.03178, + "grad_norm": 0.5975750285946342, + "learning_rate": 0.003, + "loss": 4.2169, + "step": 3178 + }, + { + "epoch": 0.03179, + "grad_norm": 0.6582127079564257, + "learning_rate": 0.003, + "loss": 4.1817, + "step": 3179 + }, + { + "epoch": 0.0318, + "grad_norm": 0.6638267847544939, + "learning_rate": 0.003, + "loss": 4.1847, + "step": 3180 + }, + { + "epoch": 0.03181, + "grad_norm": 0.633651026875217, + "learning_rate": 0.003, + "loss": 4.1946, + "step": 3181 + }, + { + "epoch": 0.03182, + "grad_norm": 0.5922002752582246, + "learning_rate": 0.003, + "loss": 4.1909, + "step": 3182 + }, + { + "epoch": 0.03183, + "grad_norm": 0.6660662970407278, + "learning_rate": 0.003, + "loss": 4.196, + "step": 3183 + }, + { + "epoch": 0.03184, + "grad_norm": 0.6465621990428836, + "learning_rate": 0.003, + "loss": 4.1683, + "step": 3184 + }, + { + "epoch": 0.03185, + "grad_norm": 0.6190012561941471, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 3185 + }, + { + "epoch": 0.03186, + "grad_norm": 0.5745629525057602, + "learning_rate": 0.003, + "loss": 4.1843, + "step": 3186 + }, + { + "epoch": 0.03187, + "grad_norm": 0.596914271669763, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 3187 + }, + { + "epoch": 0.03188, + "grad_norm": 0.5835579641541558, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 3188 + }, + { + "epoch": 0.03189, + "grad_norm": 0.576531148114492, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 3189 + }, + { + "epoch": 0.0319, + "grad_norm": 0.46857212343029936, + "learning_rate": 0.003, + "loss": 4.1619, + "step": 3190 + }, + { + "epoch": 0.03191, + "grad_norm": 0.44669804116834894, + "learning_rate": 0.003, + "loss": 4.1916, + "step": 3191 + }, + { + "epoch": 0.03192, + "grad_norm": 0.5116025817651049, + "learning_rate": 0.003, + "loss": 4.167, + "step": 3192 + }, + { + "epoch": 0.03193, + "grad_norm": 0.5017323740559085, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3193 + }, + { + "epoch": 0.03194, + "grad_norm": 0.5506388551758917, + "learning_rate": 0.003, + "loss": 4.1961, + "step": 3194 + }, + { + "epoch": 0.03195, + "grad_norm": 0.6905257500061265, + "learning_rate": 0.003, + "loss": 4.1672, + "step": 3195 + }, + { + "epoch": 0.03196, + "grad_norm": 0.6814664188108994, + "learning_rate": 0.003, + "loss": 4.18, + "step": 3196 + }, + { + "epoch": 0.03197, + "grad_norm": 0.6734616316609701, + "learning_rate": 0.003, + "loss": 4.15, + "step": 3197 + }, + { + "epoch": 0.03198, + "grad_norm": 0.5599435801670639, + "learning_rate": 0.003, + "loss": 4.1795, + "step": 3198 + }, + { + "epoch": 0.03199, + "grad_norm": 0.4771733892435305, + "learning_rate": 0.003, + "loss": 4.15, + "step": 3199 + }, + { + "epoch": 0.032, + "grad_norm": 0.4956367507190788, + "learning_rate": 0.003, + "loss": 4.172, + "step": 3200 + }, + { + "epoch": 0.03201, + "grad_norm": 0.40451540956897725, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 3201 + }, + { + "epoch": 0.03202, + "grad_norm": 0.42753873923321506, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 3202 + }, + { + "epoch": 0.03203, + "grad_norm": 0.45457758457355213, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 3203 + }, + { + "epoch": 0.03204, + "grad_norm": 0.49270456285505987, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 3204 + }, + { + "epoch": 0.03205, + "grad_norm": 0.551907056420435, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 3205 + }, + { + "epoch": 0.03206, + "grad_norm": 0.5848754633539133, + "learning_rate": 0.003, + "loss": 4.1654, + "step": 3206 + }, + { + "epoch": 0.03207, + "grad_norm": 0.5646598219152835, + "learning_rate": 0.003, + "loss": 4.1449, + "step": 3207 + }, + { + "epoch": 0.03208, + "grad_norm": 0.5057453481084084, + "learning_rate": 0.003, + "loss": 4.1662, + "step": 3208 + }, + { + "epoch": 0.03209, + "grad_norm": 0.4868038489807391, + "learning_rate": 0.003, + "loss": 4.2026, + "step": 3209 + }, + { + "epoch": 0.0321, + "grad_norm": 0.5084192511453576, + "learning_rate": 0.003, + "loss": 4.1839, + "step": 3210 + }, + { + "epoch": 0.03211, + "grad_norm": 0.536979773481941, + "learning_rate": 0.003, + "loss": 4.1566, + "step": 3211 + }, + { + "epoch": 0.03212, + "grad_norm": 0.6453864413806403, + "learning_rate": 0.003, + "loss": 4.171, + "step": 3212 + }, + { + "epoch": 0.03213, + "grad_norm": 0.6979695060374274, + "learning_rate": 0.003, + "loss": 4.1767, + "step": 3213 + }, + { + "epoch": 0.03214, + "grad_norm": 0.8362949946620015, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 3214 + }, + { + "epoch": 0.03215, + "grad_norm": 0.9000343317810777, + "learning_rate": 0.003, + "loss": 4.1727, + "step": 3215 + }, + { + "epoch": 0.03216, + "grad_norm": 0.8252513255224121, + "learning_rate": 0.003, + "loss": 4.1822, + "step": 3216 + }, + { + "epoch": 0.03217, + "grad_norm": 0.7553474851929083, + "learning_rate": 0.003, + "loss": 4.18, + "step": 3217 + }, + { + "epoch": 0.03218, + "grad_norm": 0.7184034075019531, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 3218 + }, + { + "epoch": 0.03219, + "grad_norm": 0.7559505824598467, + "learning_rate": 0.003, + "loss": 4.1839, + "step": 3219 + }, + { + "epoch": 0.0322, + "grad_norm": 0.7462679166089925, + "learning_rate": 0.003, + "loss": 4.1959, + "step": 3220 + }, + { + "epoch": 0.03221, + "grad_norm": 0.8114417269245792, + "learning_rate": 0.003, + "loss": 4.2066, + "step": 3221 + }, + { + "epoch": 0.03222, + "grad_norm": 0.7398684360618538, + "learning_rate": 0.003, + "loss": 4.1975, + "step": 3222 + }, + { + "epoch": 0.03223, + "grad_norm": 0.6559376119246889, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 3223 + }, + { + "epoch": 0.03224, + "grad_norm": 0.6017029656710154, + "learning_rate": 0.003, + "loss": 4.1559, + "step": 3224 + }, + { + "epoch": 0.03225, + "grad_norm": 0.5653674381326333, + "learning_rate": 0.003, + "loss": 4.1866, + "step": 3225 + }, + { + "epoch": 0.03226, + "grad_norm": 0.6502250085881689, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 3226 + }, + { + "epoch": 0.03227, + "grad_norm": 0.7143712957982395, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 3227 + }, + { + "epoch": 0.03228, + "grad_norm": 0.6315781743986074, + "learning_rate": 0.003, + "loss": 4.1692, + "step": 3228 + }, + { + "epoch": 0.03229, + "grad_norm": 0.5567058811508958, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 3229 + }, + { + "epoch": 0.0323, + "grad_norm": 0.5544929832938678, + "learning_rate": 0.003, + "loss": 4.1766, + "step": 3230 + }, + { + "epoch": 0.03231, + "grad_norm": 0.470332875801311, + "learning_rate": 0.003, + "loss": 4.1883, + "step": 3231 + }, + { + "epoch": 0.03232, + "grad_norm": 0.4362924227260804, + "learning_rate": 0.003, + "loss": 4.1748, + "step": 3232 + }, + { + "epoch": 0.03233, + "grad_norm": 0.4029479081021775, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 3233 + }, + { + "epoch": 0.03234, + "grad_norm": 0.48479141962478917, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 3234 + }, + { + "epoch": 0.03235, + "grad_norm": 0.5441326810507732, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 3235 + }, + { + "epoch": 0.03236, + "grad_norm": 0.5291856539541236, + "learning_rate": 0.003, + "loss": 4.1742, + "step": 3236 + }, + { + "epoch": 0.03237, + "grad_norm": 0.4961166691014795, + "learning_rate": 0.003, + "loss": 4.1904, + "step": 3237 + }, + { + "epoch": 0.03238, + "grad_norm": 0.4503193502587542, + "learning_rate": 0.003, + "loss": 4.1937, + "step": 3238 + }, + { + "epoch": 0.03239, + "grad_norm": 0.49034979349671226, + "learning_rate": 0.003, + "loss": 4.176, + "step": 3239 + }, + { + "epoch": 0.0324, + "grad_norm": 0.5016168112544078, + "learning_rate": 0.003, + "loss": 4.1905, + "step": 3240 + }, + { + "epoch": 0.03241, + "grad_norm": 0.576118532422288, + "learning_rate": 0.003, + "loss": 4.1878, + "step": 3241 + }, + { + "epoch": 0.03242, + "grad_norm": 0.6819472138134928, + "learning_rate": 0.003, + "loss": 4.165, + "step": 3242 + }, + { + "epoch": 0.03243, + "grad_norm": 0.7729342734665511, + "learning_rate": 0.003, + "loss": 4.1588, + "step": 3243 + }, + { + "epoch": 0.03244, + "grad_norm": 0.8596141569373975, + "learning_rate": 0.003, + "loss": 4.1873, + "step": 3244 + }, + { + "epoch": 0.03245, + "grad_norm": 0.8099112239109375, + "learning_rate": 0.003, + "loss": 4.2019, + "step": 3245 + }, + { + "epoch": 0.03246, + "grad_norm": 0.6033775070976766, + "learning_rate": 0.003, + "loss": 4.1827, + "step": 3246 + }, + { + "epoch": 0.03247, + "grad_norm": 0.7122383502056996, + "learning_rate": 0.003, + "loss": 4.1951, + "step": 3247 + }, + { + "epoch": 0.03248, + "grad_norm": 0.6134714767115921, + "learning_rate": 0.003, + "loss": 4.2117, + "step": 3248 + }, + { + "epoch": 0.03249, + "grad_norm": 0.551686621394429, + "learning_rate": 0.003, + "loss": 4.172, + "step": 3249 + }, + { + "epoch": 0.0325, + "grad_norm": 0.4692280069519903, + "learning_rate": 0.003, + "loss": 4.2016, + "step": 3250 + }, + { + "epoch": 0.03251, + "grad_norm": 0.4762141737958913, + "learning_rate": 0.003, + "loss": 4.1968, + "step": 3251 + }, + { + "epoch": 0.03252, + "grad_norm": 0.451291493723249, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 3252 + }, + { + "epoch": 0.03253, + "grad_norm": 0.5006035466184977, + "learning_rate": 0.003, + "loss": 4.1658, + "step": 3253 + }, + { + "epoch": 0.03254, + "grad_norm": 0.49103882046130487, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 3254 + }, + { + "epoch": 0.03255, + "grad_norm": 0.528505966654426, + "learning_rate": 0.003, + "loss": 4.1726, + "step": 3255 + }, + { + "epoch": 0.03256, + "grad_norm": 0.5411019927206225, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 3256 + }, + { + "epoch": 0.03257, + "grad_norm": 0.5426466711031169, + "learning_rate": 0.003, + "loss": 4.1571, + "step": 3257 + }, + { + "epoch": 0.03258, + "grad_norm": 0.5458703600164347, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 3258 + }, + { + "epoch": 0.03259, + "grad_norm": 0.5203040924859128, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 3259 + }, + { + "epoch": 0.0326, + "grad_norm": 0.45480155011690926, + "learning_rate": 0.003, + "loss": 4.1662, + "step": 3260 + }, + { + "epoch": 0.03261, + "grad_norm": 0.4731935212642663, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 3261 + }, + { + "epoch": 0.03262, + "grad_norm": 0.47522534323877436, + "learning_rate": 0.003, + "loss": 4.1669, + "step": 3262 + }, + { + "epoch": 0.03263, + "grad_norm": 0.4999694797882516, + "learning_rate": 0.003, + "loss": 4.1772, + "step": 3263 + }, + { + "epoch": 0.03264, + "grad_norm": 0.5217567082705248, + "learning_rate": 0.003, + "loss": 4.189, + "step": 3264 + }, + { + "epoch": 0.03265, + "grad_norm": 0.6014802197349467, + "learning_rate": 0.003, + "loss": 4.187, + "step": 3265 + }, + { + "epoch": 0.03266, + "grad_norm": 0.6262382617606659, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 3266 + }, + { + "epoch": 0.03267, + "grad_norm": 0.6366837634406238, + "learning_rate": 0.003, + "loss": 4.1524, + "step": 3267 + }, + { + "epoch": 0.03268, + "grad_norm": 0.724266364773543, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 3268 + }, + { + "epoch": 0.03269, + "grad_norm": 0.7761710629109877, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 3269 + }, + { + "epoch": 0.0327, + "grad_norm": 0.7383390055481723, + "learning_rate": 0.003, + "loss": 4.2091, + "step": 3270 + }, + { + "epoch": 0.03271, + "grad_norm": 0.6168450072471018, + "learning_rate": 0.003, + "loss": 4.1678, + "step": 3271 + }, + { + "epoch": 0.03272, + "grad_norm": 0.5388626714250451, + "learning_rate": 0.003, + "loss": 4.1712, + "step": 3272 + }, + { + "epoch": 0.03273, + "grad_norm": 0.541266940860741, + "learning_rate": 0.003, + "loss": 4.1713, + "step": 3273 + }, + { + "epoch": 0.03274, + "grad_norm": 0.5293709638699655, + "learning_rate": 0.003, + "loss": 4.1897, + "step": 3274 + }, + { + "epoch": 0.03275, + "grad_norm": 0.45689882391191533, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 3275 + }, + { + "epoch": 0.03276, + "grad_norm": 0.341646145545343, + "learning_rate": 0.003, + "loss": 4.1585, + "step": 3276 + }, + { + "epoch": 0.03277, + "grad_norm": 0.36838541029861377, + "learning_rate": 0.003, + "loss": 4.1576, + "step": 3277 + }, + { + "epoch": 0.03278, + "grad_norm": 0.3636023442028596, + "learning_rate": 0.003, + "loss": 4.1669, + "step": 3278 + }, + { + "epoch": 0.03279, + "grad_norm": 0.3985620455372761, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 3279 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4543675760898354, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 3280 + }, + { + "epoch": 0.03281, + "grad_norm": 0.5083262289907728, + "learning_rate": 0.003, + "loss": 4.1897, + "step": 3281 + }, + { + "epoch": 0.03282, + "grad_norm": 0.6315851302649976, + "learning_rate": 0.003, + "loss": 4.1492, + "step": 3282 + }, + { + "epoch": 0.03283, + "grad_norm": 0.7322734724447769, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 3283 + }, + { + "epoch": 0.03284, + "grad_norm": 0.7647225152469821, + "learning_rate": 0.003, + "loss": 4.1977, + "step": 3284 + }, + { + "epoch": 0.03285, + "grad_norm": 0.8483759241793429, + "learning_rate": 0.003, + "loss": 4.1483, + "step": 3285 + }, + { + "epoch": 0.03286, + "grad_norm": 0.7999560566423449, + "learning_rate": 0.003, + "loss": 4.1904, + "step": 3286 + }, + { + "epoch": 0.03287, + "grad_norm": 0.71589535526088, + "learning_rate": 0.003, + "loss": 4.196, + "step": 3287 + }, + { + "epoch": 0.03288, + "grad_norm": 0.6336111011473581, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3288 + }, + { + "epoch": 0.03289, + "grad_norm": 0.6641631109672658, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 3289 + }, + { + "epoch": 0.0329, + "grad_norm": 0.6647898720593706, + "learning_rate": 0.003, + "loss": 4.2094, + "step": 3290 + }, + { + "epoch": 0.03291, + "grad_norm": 0.5807864922821838, + "learning_rate": 0.003, + "loss": 4.1898, + "step": 3291 + }, + { + "epoch": 0.03292, + "grad_norm": 0.515754721038097, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 3292 + }, + { + "epoch": 0.03293, + "grad_norm": 0.5498053265889223, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3293 + }, + { + "epoch": 0.03294, + "grad_norm": 0.6841987985815203, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 3294 + }, + { + "epoch": 0.03295, + "grad_norm": 0.6093729695331228, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 3295 + }, + { + "epoch": 0.03296, + "grad_norm": 0.6089661662715524, + "learning_rate": 0.003, + "loss": 4.1627, + "step": 3296 + }, + { + "epoch": 0.03297, + "grad_norm": 0.5614137495318692, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 3297 + }, + { + "epoch": 0.03298, + "grad_norm": 0.5134366817986185, + "learning_rate": 0.003, + "loss": 4.1886, + "step": 3298 + }, + { + "epoch": 0.03299, + "grad_norm": 0.549963907837565, + "learning_rate": 0.003, + "loss": 4.1739, + "step": 3299 + }, + { + "epoch": 0.033, + "grad_norm": 0.6519931831327299, + "learning_rate": 0.003, + "loss": 4.1675, + "step": 3300 + }, + { + "epoch": 0.03301, + "grad_norm": 0.7183972305593801, + "learning_rate": 0.003, + "loss": 4.2021, + "step": 3301 + }, + { + "epoch": 0.03302, + "grad_norm": 0.7018171786354589, + "learning_rate": 0.003, + "loss": 4.1793, + "step": 3302 + }, + { + "epoch": 0.03303, + "grad_norm": 0.5921722024443354, + "learning_rate": 0.003, + "loss": 4.1793, + "step": 3303 + }, + { + "epoch": 0.03304, + "grad_norm": 0.5309946357381335, + "learning_rate": 0.003, + "loss": 4.1726, + "step": 3304 + }, + { + "epoch": 0.03305, + "grad_norm": 0.5654757181942384, + "learning_rate": 0.003, + "loss": 4.1781, + "step": 3305 + }, + { + "epoch": 0.03306, + "grad_norm": 0.5651614262192222, + "learning_rate": 0.003, + "loss": 4.194, + "step": 3306 + }, + { + "epoch": 0.03307, + "grad_norm": 0.6220742834040969, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 3307 + }, + { + "epoch": 0.03308, + "grad_norm": 0.6857411580561741, + "learning_rate": 0.003, + "loss": 4.1774, + "step": 3308 + }, + { + "epoch": 0.03309, + "grad_norm": 0.7647611362606778, + "learning_rate": 0.003, + "loss": 4.168, + "step": 3309 + }, + { + "epoch": 0.0331, + "grad_norm": 0.9027911762824596, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 3310 + }, + { + "epoch": 0.03311, + "grad_norm": 0.9232745325192504, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 3311 + }, + { + "epoch": 0.03312, + "grad_norm": 1.0033276702218215, + "learning_rate": 0.003, + "loss": 4.1863, + "step": 3312 + }, + { + "epoch": 0.03313, + "grad_norm": 0.8616181406278286, + "learning_rate": 0.003, + "loss": 4.217, + "step": 3313 + }, + { + "epoch": 0.03314, + "grad_norm": 0.6538312191871826, + "learning_rate": 0.003, + "loss": 4.1482, + "step": 3314 + }, + { + "epoch": 0.03315, + "grad_norm": 0.6513293592499171, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 3315 + }, + { + "epoch": 0.03316, + "grad_norm": 0.5753636988224785, + "learning_rate": 0.003, + "loss": 4.1986, + "step": 3316 + }, + { + "epoch": 0.03317, + "grad_norm": 0.5660504612340471, + "learning_rate": 0.003, + "loss": 4.1897, + "step": 3317 + }, + { + "epoch": 0.03318, + "grad_norm": 0.5321945122016645, + "learning_rate": 0.003, + "loss": 4.1661, + "step": 3318 + }, + { + "epoch": 0.03319, + "grad_norm": 0.5341973320019645, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 3319 + }, + { + "epoch": 0.0332, + "grad_norm": 0.47286473477321905, + "learning_rate": 0.003, + "loss": 4.1852, + "step": 3320 + }, + { + "epoch": 0.03321, + "grad_norm": 0.5050275992577558, + "learning_rate": 0.003, + "loss": 4.1965, + "step": 3321 + }, + { + "epoch": 0.03322, + "grad_norm": 0.4844023535662593, + "learning_rate": 0.003, + "loss": 4.1482, + "step": 3322 + }, + { + "epoch": 0.03323, + "grad_norm": 0.43309930004470687, + "learning_rate": 0.003, + "loss": 4.1893, + "step": 3323 + }, + { + "epoch": 0.03324, + "grad_norm": 0.44890262885350884, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 3324 + }, + { + "epoch": 0.03325, + "grad_norm": 0.5095521797271855, + "learning_rate": 0.003, + "loss": 4.1917, + "step": 3325 + }, + { + "epoch": 0.03326, + "grad_norm": 0.477471322433437, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 3326 + }, + { + "epoch": 0.03327, + "grad_norm": 0.4315337339422698, + "learning_rate": 0.003, + "loss": 4.1751, + "step": 3327 + }, + { + "epoch": 0.03328, + "grad_norm": 0.4171492646524173, + "learning_rate": 0.003, + "loss": 4.1912, + "step": 3328 + }, + { + "epoch": 0.03329, + "grad_norm": 0.4506599360723401, + "learning_rate": 0.003, + "loss": 4.149, + "step": 3329 + }, + { + "epoch": 0.0333, + "grad_norm": 0.43373534373844225, + "learning_rate": 0.003, + "loss": 4.1533, + "step": 3330 + }, + { + "epoch": 0.03331, + "grad_norm": 0.48072192590135343, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 3331 + }, + { + "epoch": 0.03332, + "grad_norm": 0.5603763883718295, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 3332 + }, + { + "epoch": 0.03333, + "grad_norm": 0.5773402592299904, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 3333 + }, + { + "epoch": 0.03334, + "grad_norm": 0.5925239668699488, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 3334 + }, + { + "epoch": 0.03335, + "grad_norm": 0.5265113459144353, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 3335 + }, + { + "epoch": 0.03336, + "grad_norm": 0.4418521183409816, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 3336 + }, + { + "epoch": 0.03337, + "grad_norm": 0.42952166536927455, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 3337 + }, + { + "epoch": 0.03338, + "grad_norm": 0.43519066754716446, + "learning_rate": 0.003, + "loss": 4.1603, + "step": 3338 + }, + { + "epoch": 0.03339, + "grad_norm": 0.4658156069803623, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 3339 + }, + { + "epoch": 0.0334, + "grad_norm": 0.5171946739861909, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 3340 + }, + { + "epoch": 0.03341, + "grad_norm": 0.6356266827382321, + "learning_rate": 0.003, + "loss": 4.1677, + "step": 3341 + }, + { + "epoch": 0.03342, + "grad_norm": 0.6867092162549009, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 3342 + }, + { + "epoch": 0.03343, + "grad_norm": 0.704031363814926, + "learning_rate": 0.003, + "loss": 4.1361, + "step": 3343 + }, + { + "epoch": 0.03344, + "grad_norm": 0.7513440673993107, + "learning_rate": 0.003, + "loss": 4.1949, + "step": 3344 + }, + { + "epoch": 0.03345, + "grad_norm": 0.7519677795539157, + "learning_rate": 0.003, + "loss": 4.1731, + "step": 3345 + }, + { + "epoch": 0.03346, + "grad_norm": 0.7713219880690217, + "learning_rate": 0.003, + "loss": 4.19, + "step": 3346 + }, + { + "epoch": 0.03347, + "grad_norm": 0.8480035915924508, + "learning_rate": 0.003, + "loss": 4.1797, + "step": 3347 + }, + { + "epoch": 0.03348, + "grad_norm": 0.8325415221214432, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3348 + }, + { + "epoch": 0.03349, + "grad_norm": 0.7621143367252534, + "learning_rate": 0.003, + "loss": 4.2199, + "step": 3349 + }, + { + "epoch": 0.0335, + "grad_norm": 0.7234514614978355, + "learning_rate": 0.003, + "loss": 4.1749, + "step": 3350 + }, + { + "epoch": 0.03351, + "grad_norm": 0.6715941237384234, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 3351 + }, + { + "epoch": 0.03352, + "grad_norm": 0.6771948560655807, + "learning_rate": 0.003, + "loss": 4.1972, + "step": 3352 + }, + { + "epoch": 0.03353, + "grad_norm": 0.7374767561640508, + "learning_rate": 0.003, + "loss": 4.1997, + "step": 3353 + }, + { + "epoch": 0.03354, + "grad_norm": 0.7924221564216216, + "learning_rate": 0.003, + "loss": 4.1685, + "step": 3354 + }, + { + "epoch": 0.03355, + "grad_norm": 0.752343695322846, + "learning_rate": 0.003, + "loss": 4.1919, + "step": 3355 + }, + { + "epoch": 0.03356, + "grad_norm": 0.6955515760269133, + "learning_rate": 0.003, + "loss": 4.2144, + "step": 3356 + }, + { + "epoch": 0.03357, + "grad_norm": 0.6439226598770458, + "learning_rate": 0.003, + "loss": 4.1843, + "step": 3357 + }, + { + "epoch": 0.03358, + "grad_norm": 0.6072963552767238, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 3358 + }, + { + "epoch": 0.03359, + "grad_norm": 0.5544317009226006, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 3359 + }, + { + "epoch": 0.0336, + "grad_norm": 0.7584252977946636, + "learning_rate": 0.003, + "loss": 4.2055, + "step": 3360 + }, + { + "epoch": 0.03361, + "grad_norm": 0.8675540717955526, + "learning_rate": 0.003, + "loss": 4.2091, + "step": 3361 + }, + { + "epoch": 0.03362, + "grad_norm": 0.8109217912617241, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 3362 + }, + { + "epoch": 0.03363, + "grad_norm": 0.7666749182841046, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 3363 + }, + { + "epoch": 0.03364, + "grad_norm": 0.7823149796193858, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 3364 + }, + { + "epoch": 0.03365, + "grad_norm": 0.6387430725521948, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 3365 + }, + { + "epoch": 0.03366, + "grad_norm": 0.5594833522786266, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 3366 + }, + { + "epoch": 0.03367, + "grad_norm": 0.5181383097238287, + "learning_rate": 0.003, + "loss": 4.1575, + "step": 3367 + }, + { + "epoch": 0.03368, + "grad_norm": 0.5760572623008274, + "learning_rate": 0.003, + "loss": 4.207, + "step": 3368 + }, + { + "epoch": 0.03369, + "grad_norm": 0.550972005009095, + "learning_rate": 0.003, + "loss": 4.165, + "step": 3369 + }, + { + "epoch": 0.0337, + "grad_norm": 0.5261333264386098, + "learning_rate": 0.003, + "loss": 4.1995, + "step": 3370 + }, + { + "epoch": 0.03371, + "grad_norm": 0.4162241207425965, + "learning_rate": 0.003, + "loss": 4.1281, + "step": 3371 + }, + { + "epoch": 0.03372, + "grad_norm": 0.3964364873259744, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 3372 + }, + { + "epoch": 0.03373, + "grad_norm": 0.39921807899478956, + "learning_rate": 0.003, + "loss": 4.1886, + "step": 3373 + }, + { + "epoch": 0.03374, + "grad_norm": 0.40722470978586633, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3374 + }, + { + "epoch": 0.03375, + "grad_norm": 0.4174362735293826, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 3375 + }, + { + "epoch": 0.03376, + "grad_norm": 0.4670589348527475, + "learning_rate": 0.003, + "loss": 4.133, + "step": 3376 + }, + { + "epoch": 0.03377, + "grad_norm": 0.5539342573100582, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3377 + }, + { + "epoch": 0.03378, + "grad_norm": 0.6187862572290908, + "learning_rate": 0.003, + "loss": 4.1833, + "step": 3378 + }, + { + "epoch": 0.03379, + "grad_norm": 0.6015622211172064, + "learning_rate": 0.003, + "loss": 4.1567, + "step": 3379 + }, + { + "epoch": 0.0338, + "grad_norm": 0.4615401428395281, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 3380 + }, + { + "epoch": 0.03381, + "grad_norm": 0.32933941833228836, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3381 + }, + { + "epoch": 0.03382, + "grad_norm": 0.3788346338425552, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 3382 + }, + { + "epoch": 0.03383, + "grad_norm": 0.376989210662061, + "learning_rate": 0.003, + "loss": 4.13, + "step": 3383 + }, + { + "epoch": 0.03384, + "grad_norm": 0.34387151748219785, + "learning_rate": 0.003, + "loss": 4.1531, + "step": 3384 + }, + { + "epoch": 0.03385, + "grad_norm": 0.3352611282783692, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 3385 + }, + { + "epoch": 0.03386, + "grad_norm": 0.33892990368837295, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 3386 + }, + { + "epoch": 0.03387, + "grad_norm": 0.37206580118259613, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 3387 + }, + { + "epoch": 0.03388, + "grad_norm": 0.41506855452840574, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 3388 + }, + { + "epoch": 0.03389, + "grad_norm": 0.42405220029053703, + "learning_rate": 0.003, + "loss": 4.1835, + "step": 3389 + }, + { + "epoch": 0.0339, + "grad_norm": 0.4090594827115757, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 3390 + }, + { + "epoch": 0.03391, + "grad_norm": 0.4342402082004512, + "learning_rate": 0.003, + "loss": 4.148, + "step": 3391 + }, + { + "epoch": 0.03392, + "grad_norm": 0.49470807894733526, + "learning_rate": 0.003, + "loss": 4.1568, + "step": 3392 + }, + { + "epoch": 0.03393, + "grad_norm": 0.7113520954651117, + "learning_rate": 0.003, + "loss": 4.1741, + "step": 3393 + }, + { + "epoch": 0.03394, + "grad_norm": 0.8695013133927905, + "learning_rate": 0.003, + "loss": 4.1852, + "step": 3394 + }, + { + "epoch": 0.03395, + "grad_norm": 0.937839460650371, + "learning_rate": 0.003, + "loss": 4.1994, + "step": 3395 + }, + { + "epoch": 0.03396, + "grad_norm": 0.7904694460773314, + "learning_rate": 0.003, + "loss": 4.1798, + "step": 3396 + }, + { + "epoch": 0.03397, + "grad_norm": 0.7235872457578897, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 3397 + }, + { + "epoch": 0.03398, + "grad_norm": 0.8062806717697898, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 3398 + }, + { + "epoch": 0.03399, + "grad_norm": 0.7158225027891656, + "learning_rate": 0.003, + "loss": 4.2222, + "step": 3399 + }, + { + "epoch": 0.034, + "grad_norm": 0.6123275074220181, + "learning_rate": 0.003, + "loss": 4.1995, + "step": 3400 + }, + { + "epoch": 0.03401, + "grad_norm": 0.6599152366758141, + "learning_rate": 0.003, + "loss": 4.1767, + "step": 3401 + }, + { + "epoch": 0.03402, + "grad_norm": 0.6314070611427443, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 3402 + }, + { + "epoch": 0.03403, + "grad_norm": 0.6571294664516991, + "learning_rate": 0.003, + "loss": 4.1648, + "step": 3403 + }, + { + "epoch": 0.03404, + "grad_norm": 0.6404238631170781, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 3404 + }, + { + "epoch": 0.03405, + "grad_norm": 0.6321854819314461, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 3405 + }, + { + "epoch": 0.03406, + "grad_norm": 0.6941079886283862, + "learning_rate": 0.003, + "loss": 4.1694, + "step": 3406 + }, + { + "epoch": 0.03407, + "grad_norm": 0.8136592990040448, + "learning_rate": 0.003, + "loss": 4.1931, + "step": 3407 + }, + { + "epoch": 0.03408, + "grad_norm": 0.8054752836427478, + "learning_rate": 0.003, + "loss": 4.1777, + "step": 3408 + }, + { + "epoch": 0.03409, + "grad_norm": 0.7438300271168681, + "learning_rate": 0.003, + "loss": 4.1836, + "step": 3409 + }, + { + "epoch": 0.0341, + "grad_norm": 0.7769660820773376, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 3410 + }, + { + "epoch": 0.03411, + "grad_norm": 0.7387445989360225, + "learning_rate": 0.003, + "loss": 4.1964, + "step": 3411 + }, + { + "epoch": 0.03412, + "grad_norm": 0.7543102885862639, + "learning_rate": 0.003, + "loss": 4.2165, + "step": 3412 + }, + { + "epoch": 0.03413, + "grad_norm": 0.7858533091946368, + "learning_rate": 0.003, + "loss": 4.2279, + "step": 3413 + }, + { + "epoch": 0.03414, + "grad_norm": 0.8085223407654301, + "learning_rate": 0.003, + "loss": 4.2197, + "step": 3414 + }, + { + "epoch": 0.03415, + "grad_norm": 0.7204513262789403, + "learning_rate": 0.003, + "loss": 4.1982, + "step": 3415 + }, + { + "epoch": 0.03416, + "grad_norm": 0.669578263783182, + "learning_rate": 0.003, + "loss": 4.2009, + "step": 3416 + }, + { + "epoch": 0.03417, + "grad_norm": 0.595729901049528, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 3417 + }, + { + "epoch": 0.03418, + "grad_norm": 0.6302485601669457, + "learning_rate": 0.003, + "loss": 4.1603, + "step": 3418 + }, + { + "epoch": 0.03419, + "grad_norm": 0.48681001127779644, + "learning_rate": 0.003, + "loss": 4.1774, + "step": 3419 + }, + { + "epoch": 0.0342, + "grad_norm": 0.4381251825205999, + "learning_rate": 0.003, + "loss": 4.1848, + "step": 3420 + }, + { + "epoch": 0.03421, + "grad_norm": 0.40842368527410894, + "learning_rate": 0.003, + "loss": 4.1733, + "step": 3421 + }, + { + "epoch": 0.03422, + "grad_norm": 0.4158377016736793, + "learning_rate": 0.003, + "loss": 4.1888, + "step": 3422 + }, + { + "epoch": 0.03423, + "grad_norm": 0.38251097705951487, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 3423 + }, + { + "epoch": 0.03424, + "grad_norm": 0.4199738680628049, + "learning_rate": 0.003, + "loss": 4.1889, + "step": 3424 + }, + { + "epoch": 0.03425, + "grad_norm": 0.5152529015696985, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 3425 + }, + { + "epoch": 0.03426, + "grad_norm": 0.5309877993913092, + "learning_rate": 0.003, + "loss": 4.1773, + "step": 3426 + }, + { + "epoch": 0.03427, + "grad_norm": 0.4899224228978265, + "learning_rate": 0.003, + "loss": 4.1445, + "step": 3427 + }, + { + "epoch": 0.03428, + "grad_norm": 0.5031074901419099, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 3428 + }, + { + "epoch": 0.03429, + "grad_norm": 0.6278449502427672, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 3429 + }, + { + "epoch": 0.0343, + "grad_norm": 0.6739968965302988, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 3430 + }, + { + "epoch": 0.03431, + "grad_norm": 0.7060406786523291, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 3431 + }, + { + "epoch": 0.03432, + "grad_norm": 0.6678436460415758, + "learning_rate": 0.003, + "loss": 4.1677, + "step": 3432 + }, + { + "epoch": 0.03433, + "grad_norm": 0.47826597565957213, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 3433 + }, + { + "epoch": 0.03434, + "grad_norm": 0.4648969929741348, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 3434 + }, + { + "epoch": 0.03435, + "grad_norm": 0.46695854146561, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 3435 + }, + { + "epoch": 0.03436, + "grad_norm": 0.5162770863422669, + "learning_rate": 0.003, + "loss": 4.1665, + "step": 3436 + }, + { + "epoch": 0.03437, + "grad_norm": 0.48303509013107304, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 3437 + }, + { + "epoch": 0.03438, + "grad_norm": 0.5052348764127375, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3438 + }, + { + "epoch": 0.03439, + "grad_norm": 0.4517303897020918, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 3439 + }, + { + "epoch": 0.0344, + "grad_norm": 0.44296664325342977, + "learning_rate": 0.003, + "loss": 4.14, + "step": 3440 + }, + { + "epoch": 0.03441, + "grad_norm": 0.4450842770576517, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 3441 + }, + { + "epoch": 0.03442, + "grad_norm": 0.4941470142821772, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 3442 + }, + { + "epoch": 0.03443, + "grad_norm": 0.5954542724862985, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 3443 + }, + { + "epoch": 0.03444, + "grad_norm": 0.679140132757601, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3444 + }, + { + "epoch": 0.03445, + "grad_norm": 0.6296700934818499, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 3445 + }, + { + "epoch": 0.03446, + "grad_norm": 0.5424802868176756, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 3446 + }, + { + "epoch": 0.03447, + "grad_norm": 0.4237052026562954, + "learning_rate": 0.003, + "loss": 4.1749, + "step": 3447 + }, + { + "epoch": 0.03448, + "grad_norm": 0.49663616285480033, + "learning_rate": 0.003, + "loss": 4.1465, + "step": 3448 + }, + { + "epoch": 0.03449, + "grad_norm": 0.4965879325162881, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 3449 + }, + { + "epoch": 0.0345, + "grad_norm": 0.4699418192975183, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3450 + }, + { + "epoch": 0.03451, + "grad_norm": 0.4342808651184125, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 3451 + }, + { + "epoch": 0.03452, + "grad_norm": 0.48981095252176143, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 3452 + }, + { + "epoch": 0.03453, + "grad_norm": 0.5221855555116031, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 3453 + }, + { + "epoch": 0.03454, + "grad_norm": 0.5252422198675372, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 3454 + }, + { + "epoch": 0.03455, + "grad_norm": 0.5714349317836669, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 3455 + }, + { + "epoch": 0.03456, + "grad_norm": 0.5686518541913316, + "learning_rate": 0.003, + "loss": 4.154, + "step": 3456 + }, + { + "epoch": 0.03457, + "grad_norm": 0.5777235599897509, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 3457 + }, + { + "epoch": 0.03458, + "grad_norm": 0.6023496924245035, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 3458 + }, + { + "epoch": 0.03459, + "grad_norm": 0.6835034129659396, + "learning_rate": 0.003, + "loss": 4.1883, + "step": 3459 + }, + { + "epoch": 0.0346, + "grad_norm": 0.7616829391144937, + "learning_rate": 0.003, + "loss": 4.195, + "step": 3460 + }, + { + "epoch": 0.03461, + "grad_norm": 0.6951133371872186, + "learning_rate": 0.003, + "loss": 4.1716, + "step": 3461 + }, + { + "epoch": 0.03462, + "grad_norm": 0.6533341057192256, + "learning_rate": 0.003, + "loss": 4.1653, + "step": 3462 + }, + { + "epoch": 0.03463, + "grad_norm": 0.7614181817822457, + "learning_rate": 0.003, + "loss": 4.1547, + "step": 3463 + }, + { + "epoch": 0.03464, + "grad_norm": 0.8154255816699014, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 3464 + }, + { + "epoch": 0.03465, + "grad_norm": 0.834367328138272, + "learning_rate": 0.003, + "loss": 4.1879, + "step": 3465 + }, + { + "epoch": 0.03466, + "grad_norm": 0.7054161963486509, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 3466 + }, + { + "epoch": 0.03467, + "grad_norm": 0.6747042802423374, + "learning_rate": 0.003, + "loss": 4.1681, + "step": 3467 + }, + { + "epoch": 0.03468, + "grad_norm": 0.6274506871293699, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 3468 + }, + { + "epoch": 0.03469, + "grad_norm": 0.574247040112387, + "learning_rate": 0.003, + "loss": 4.1751, + "step": 3469 + }, + { + "epoch": 0.0347, + "grad_norm": 0.5474693168535483, + "learning_rate": 0.003, + "loss": 4.1868, + "step": 3470 + }, + { + "epoch": 0.03471, + "grad_norm": 0.5484056382186252, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 3471 + }, + { + "epoch": 0.03472, + "grad_norm": 0.548625120805519, + "learning_rate": 0.003, + "loss": 4.1705, + "step": 3472 + }, + { + "epoch": 0.03473, + "grad_norm": 0.637401868814706, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 3473 + }, + { + "epoch": 0.03474, + "grad_norm": 0.7144883444351541, + "learning_rate": 0.003, + "loss": 4.2012, + "step": 3474 + }, + { + "epoch": 0.03475, + "grad_norm": 0.7979670963387884, + "learning_rate": 0.003, + "loss": 4.1787, + "step": 3475 + }, + { + "epoch": 0.03476, + "grad_norm": 0.7163973548344581, + "learning_rate": 0.003, + "loss": 4.1814, + "step": 3476 + }, + { + "epoch": 0.03477, + "grad_norm": 0.5684898388546081, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 3477 + }, + { + "epoch": 0.03478, + "grad_norm": 0.5861592780845318, + "learning_rate": 0.003, + "loss": 4.1823, + "step": 3478 + }, + { + "epoch": 0.03479, + "grad_norm": 0.5340943622573318, + "learning_rate": 0.003, + "loss": 4.1533, + "step": 3479 + }, + { + "epoch": 0.0348, + "grad_norm": 0.4404622712681107, + "learning_rate": 0.003, + "loss": 4.1503, + "step": 3480 + }, + { + "epoch": 0.03481, + "grad_norm": 0.4792980762796971, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 3481 + }, + { + "epoch": 0.03482, + "grad_norm": 0.4579076307590615, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3482 + }, + { + "epoch": 0.03483, + "grad_norm": 0.42854750510723455, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 3483 + }, + { + "epoch": 0.03484, + "grad_norm": 0.4222950210388672, + "learning_rate": 0.003, + "loss": 4.1912, + "step": 3484 + }, + { + "epoch": 0.03485, + "grad_norm": 0.47787917501254906, + "learning_rate": 0.003, + "loss": 4.1842, + "step": 3485 + }, + { + "epoch": 0.03486, + "grad_norm": 0.6218177691290887, + "learning_rate": 0.003, + "loss": 4.1608, + "step": 3486 + }, + { + "epoch": 0.03487, + "grad_norm": 0.7695122506406105, + "learning_rate": 0.003, + "loss": 4.16, + "step": 3487 + }, + { + "epoch": 0.03488, + "grad_norm": 0.811403718151084, + "learning_rate": 0.003, + "loss": 4.1602, + "step": 3488 + }, + { + "epoch": 0.03489, + "grad_norm": 0.6478567488540471, + "learning_rate": 0.003, + "loss": 4.1728, + "step": 3489 + }, + { + "epoch": 0.0349, + "grad_norm": 0.5349166330706258, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 3490 + }, + { + "epoch": 0.03491, + "grad_norm": 0.5539220367541061, + "learning_rate": 0.003, + "loss": 4.1823, + "step": 3491 + }, + { + "epoch": 0.03492, + "grad_norm": 0.4809491803196432, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 3492 + }, + { + "epoch": 0.03493, + "grad_norm": 0.4980699721500476, + "learning_rate": 0.003, + "loss": 4.1561, + "step": 3493 + }, + { + "epoch": 0.03494, + "grad_norm": 0.6241704569078306, + "learning_rate": 0.003, + "loss": 4.161, + "step": 3494 + }, + { + "epoch": 0.03495, + "grad_norm": 0.6045742174777644, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 3495 + }, + { + "epoch": 0.03496, + "grad_norm": 0.601044244149248, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 3496 + }, + { + "epoch": 0.03497, + "grad_norm": 0.5400771740929299, + "learning_rate": 0.003, + "loss": 4.1771, + "step": 3497 + }, + { + "epoch": 0.03498, + "grad_norm": 0.713136808858786, + "learning_rate": 0.003, + "loss": 4.157, + "step": 3498 + }, + { + "epoch": 0.03499, + "grad_norm": 0.7906121138775885, + "learning_rate": 0.003, + "loss": 4.1705, + "step": 3499 + }, + { + "epoch": 0.035, + "grad_norm": 0.7463261361411888, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 3500 + }, + { + "epoch": 0.03501, + "grad_norm": 0.6401328140167164, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 3501 + }, + { + "epoch": 0.03502, + "grad_norm": 0.5849954396740931, + "learning_rate": 0.003, + "loss": 4.1761, + "step": 3502 + }, + { + "epoch": 0.03503, + "grad_norm": 0.5132547849361668, + "learning_rate": 0.003, + "loss": 4.1879, + "step": 3503 + }, + { + "epoch": 0.03504, + "grad_norm": 0.5053019189532232, + "learning_rate": 0.003, + "loss": 4.159, + "step": 3504 + }, + { + "epoch": 0.03505, + "grad_norm": 0.5412370735339649, + "learning_rate": 0.003, + "loss": 4.177, + "step": 3505 + }, + { + "epoch": 0.03506, + "grad_norm": 0.5174741400250557, + "learning_rate": 0.003, + "loss": 4.1855, + "step": 3506 + }, + { + "epoch": 0.03507, + "grad_norm": 0.5828929341462858, + "learning_rate": 0.003, + "loss": 4.1653, + "step": 3507 + }, + { + "epoch": 0.03508, + "grad_norm": 0.6123621534119361, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 3508 + }, + { + "epoch": 0.03509, + "grad_norm": 0.529763454477017, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 3509 + }, + { + "epoch": 0.0351, + "grad_norm": 0.4991440774069516, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 3510 + }, + { + "epoch": 0.03511, + "grad_norm": 0.5106927805679168, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 3511 + }, + { + "epoch": 0.03512, + "grad_norm": 0.5290709648712968, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 3512 + }, + { + "epoch": 0.03513, + "grad_norm": 0.5838748180600412, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 3513 + }, + { + "epoch": 0.03514, + "grad_norm": 0.8057246907688399, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3514 + }, + { + "epoch": 0.03515, + "grad_norm": 1.0135173467038623, + "learning_rate": 0.003, + "loss": 4.1945, + "step": 3515 + }, + { + "epoch": 0.03516, + "grad_norm": 0.8727734267667479, + "learning_rate": 0.003, + "loss": 4.2288, + "step": 3516 + }, + { + "epoch": 0.03517, + "grad_norm": 0.7059545473280222, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 3517 + }, + { + "epoch": 0.03518, + "grad_norm": 0.7000960321874965, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 3518 + }, + { + "epoch": 0.03519, + "grad_norm": 0.6627465613911154, + "learning_rate": 0.003, + "loss": 4.2036, + "step": 3519 + }, + { + "epoch": 0.0352, + "grad_norm": 0.7079684591627484, + "learning_rate": 0.003, + "loss": 4.1815, + "step": 3520 + }, + { + "epoch": 0.03521, + "grad_norm": 0.7564247406144935, + "learning_rate": 0.003, + "loss": 4.1833, + "step": 3521 + }, + { + "epoch": 0.03522, + "grad_norm": 0.6970141789263714, + "learning_rate": 0.003, + "loss": 4.1807, + "step": 3522 + }, + { + "epoch": 0.03523, + "grad_norm": 0.6458343838785704, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 3523 + }, + { + "epoch": 0.03524, + "grad_norm": 0.727411286446386, + "learning_rate": 0.003, + "loss": 4.1943, + "step": 3524 + }, + { + "epoch": 0.03525, + "grad_norm": 0.6885063026654827, + "learning_rate": 0.003, + "loss": 4.163, + "step": 3525 + }, + { + "epoch": 0.03526, + "grad_norm": 0.503068225605494, + "learning_rate": 0.003, + "loss": 4.16, + "step": 3526 + }, + { + "epoch": 0.03527, + "grad_norm": 0.4984239052259041, + "learning_rate": 0.003, + "loss": 4.1946, + "step": 3527 + }, + { + "epoch": 0.03528, + "grad_norm": 0.48402535036066135, + "learning_rate": 0.003, + "loss": 4.1744, + "step": 3528 + }, + { + "epoch": 0.03529, + "grad_norm": 0.4945614778246426, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3529 + }, + { + "epoch": 0.0353, + "grad_norm": 0.488546537630184, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3530 + }, + { + "epoch": 0.03531, + "grad_norm": 0.4615658807568464, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3531 + }, + { + "epoch": 0.03532, + "grad_norm": 0.5005896772829939, + "learning_rate": 0.003, + "loss": 4.1751, + "step": 3532 + }, + { + "epoch": 0.03533, + "grad_norm": 0.5146344370982837, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 3533 + }, + { + "epoch": 0.03534, + "grad_norm": 0.5278648337419918, + "learning_rate": 0.003, + "loss": 4.1672, + "step": 3534 + }, + { + "epoch": 0.03535, + "grad_norm": 0.573214872592988, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 3535 + }, + { + "epoch": 0.03536, + "grad_norm": 0.6216412138977996, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 3536 + }, + { + "epoch": 0.03537, + "grad_norm": 0.641334014656523, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 3537 + }, + { + "epoch": 0.03538, + "grad_norm": 0.7120173819031035, + "learning_rate": 0.003, + "loss": 4.1587, + "step": 3538 + }, + { + "epoch": 0.03539, + "grad_norm": 0.7562087971985374, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 3539 + }, + { + "epoch": 0.0354, + "grad_norm": 0.745826443201787, + "learning_rate": 0.003, + "loss": 4.1925, + "step": 3540 + }, + { + "epoch": 0.03541, + "grad_norm": 0.6880220783097426, + "learning_rate": 0.003, + "loss": 4.1777, + "step": 3541 + }, + { + "epoch": 0.03542, + "grad_norm": 0.6660987603699731, + "learning_rate": 0.003, + "loss": 4.158, + "step": 3542 + }, + { + "epoch": 0.03543, + "grad_norm": 0.5966852103308192, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3543 + }, + { + "epoch": 0.03544, + "grad_norm": 0.6147196669679205, + "learning_rate": 0.003, + "loss": 4.1752, + "step": 3544 + }, + { + "epoch": 0.03545, + "grad_norm": 0.5876025358116799, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 3545 + }, + { + "epoch": 0.03546, + "grad_norm": 0.5394946467408606, + "learning_rate": 0.003, + "loss": 4.161, + "step": 3546 + }, + { + "epoch": 0.03547, + "grad_norm": 0.4760714738457914, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 3547 + }, + { + "epoch": 0.03548, + "grad_norm": 0.511046276124325, + "learning_rate": 0.003, + "loss": 4.1712, + "step": 3548 + }, + { + "epoch": 0.03549, + "grad_norm": 0.5497663152303965, + "learning_rate": 0.003, + "loss": 4.1676, + "step": 3549 + }, + { + "epoch": 0.0355, + "grad_norm": 0.5734708015673942, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 3550 + }, + { + "epoch": 0.03551, + "grad_norm": 0.5704932035569535, + "learning_rate": 0.003, + "loss": 4.1799, + "step": 3551 + }, + { + "epoch": 0.03552, + "grad_norm": 0.5413961705528875, + "learning_rate": 0.003, + "loss": 4.1392, + "step": 3552 + }, + { + "epoch": 0.03553, + "grad_norm": 0.5629706262237084, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 3553 + }, + { + "epoch": 0.03554, + "grad_norm": 0.6847813269812701, + "learning_rate": 0.003, + "loss": 4.1633, + "step": 3554 + }, + { + "epoch": 0.03555, + "grad_norm": 0.7442929788518159, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 3555 + }, + { + "epoch": 0.03556, + "grad_norm": 0.6995396794139108, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 3556 + }, + { + "epoch": 0.03557, + "grad_norm": 0.6870977550247097, + "learning_rate": 0.003, + "loss": 4.1641, + "step": 3557 + }, + { + "epoch": 0.03558, + "grad_norm": 0.4853466992283488, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 3558 + }, + { + "epoch": 0.03559, + "grad_norm": 0.4762880827808817, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3559 + }, + { + "epoch": 0.0356, + "grad_norm": 0.44888469164051914, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3560 + }, + { + "epoch": 0.03561, + "grad_norm": 0.4860772847324368, + "learning_rate": 0.003, + "loss": 4.1939, + "step": 3561 + }, + { + "epoch": 0.03562, + "grad_norm": 0.609514740868879, + "learning_rate": 0.003, + "loss": 4.1615, + "step": 3562 + }, + { + "epoch": 0.03563, + "grad_norm": 0.5645288584044179, + "learning_rate": 0.003, + "loss": 4.1919, + "step": 3563 + }, + { + "epoch": 0.03564, + "grad_norm": 0.5713508531485367, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3564 + }, + { + "epoch": 0.03565, + "grad_norm": 0.6809281314446707, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 3565 + }, + { + "epoch": 0.03566, + "grad_norm": 0.8446250147895774, + "learning_rate": 0.003, + "loss": 4.1796, + "step": 3566 + }, + { + "epoch": 0.03567, + "grad_norm": 0.7816404227486906, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3567 + }, + { + "epoch": 0.03568, + "grad_norm": 0.565515569510055, + "learning_rate": 0.003, + "loss": 4.1588, + "step": 3568 + }, + { + "epoch": 0.03569, + "grad_norm": 0.6392539102485247, + "learning_rate": 0.003, + "loss": 4.1604, + "step": 3569 + }, + { + "epoch": 0.0357, + "grad_norm": 0.6094618901019504, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 3570 + }, + { + "epoch": 0.03571, + "grad_norm": 0.516014245189601, + "learning_rate": 0.003, + "loss": 4.1658, + "step": 3571 + }, + { + "epoch": 0.03572, + "grad_norm": 0.5649956960536694, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 3572 + }, + { + "epoch": 0.03573, + "grad_norm": 0.5285488438884934, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 3573 + }, + { + "epoch": 0.03574, + "grad_norm": 0.5059015354099536, + "learning_rate": 0.003, + "loss": 4.1778, + "step": 3574 + }, + { + "epoch": 0.03575, + "grad_norm": 0.5626349168929159, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 3575 + }, + { + "epoch": 0.03576, + "grad_norm": 0.5730355700030897, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 3576 + }, + { + "epoch": 0.03577, + "grad_norm": 0.5081514156321213, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 3577 + }, + { + "epoch": 0.03578, + "grad_norm": 0.4522800880743083, + "learning_rate": 0.003, + "loss": 4.1594, + "step": 3578 + }, + { + "epoch": 0.03579, + "grad_norm": 0.4334604254761663, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 3579 + }, + { + "epoch": 0.0358, + "grad_norm": 0.478675617542924, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 3580 + }, + { + "epoch": 0.03581, + "grad_norm": 0.5103431984153175, + "learning_rate": 0.003, + "loss": 4.1461, + "step": 3581 + }, + { + "epoch": 0.03582, + "grad_norm": 0.5133704031348394, + "learning_rate": 0.003, + "loss": 4.144, + "step": 3582 + }, + { + "epoch": 0.03583, + "grad_norm": 0.5875445897748297, + "learning_rate": 0.003, + "loss": 4.1484, + "step": 3583 + }, + { + "epoch": 0.03584, + "grad_norm": 0.5543663682322499, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 3584 + }, + { + "epoch": 0.03585, + "grad_norm": 0.49560990112191017, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 3585 + }, + { + "epoch": 0.03586, + "grad_norm": 0.5080536513623409, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 3586 + }, + { + "epoch": 0.03587, + "grad_norm": 0.47897066887270817, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 3587 + }, + { + "epoch": 0.03588, + "grad_norm": 0.4859791474646478, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 3588 + }, + { + "epoch": 0.03589, + "grad_norm": 0.5360881510224248, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 3589 + }, + { + "epoch": 0.0359, + "grad_norm": 0.6282942855775232, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 3590 + }, + { + "epoch": 0.03591, + "grad_norm": 0.7149898709877768, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 3591 + }, + { + "epoch": 0.03592, + "grad_norm": 0.6656767655390144, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 3592 + }, + { + "epoch": 0.03593, + "grad_norm": 0.5744352573697437, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 3593 + }, + { + "epoch": 0.03594, + "grad_norm": 0.5564608607734087, + "learning_rate": 0.003, + "loss": 4.1586, + "step": 3594 + }, + { + "epoch": 0.03595, + "grad_norm": 0.6440817185713912, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 3595 + }, + { + "epoch": 0.03596, + "grad_norm": 0.8208781786922031, + "learning_rate": 0.003, + "loss": 4.1965, + "step": 3596 + }, + { + "epoch": 0.03597, + "grad_norm": 0.9222891823782106, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 3597 + }, + { + "epoch": 0.03598, + "grad_norm": 0.8427211641310154, + "learning_rate": 0.003, + "loss": 4.187, + "step": 3598 + }, + { + "epoch": 0.03599, + "grad_norm": 0.852351507152875, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 3599 + }, + { + "epoch": 0.036, + "grad_norm": 0.8507559173234883, + "learning_rate": 0.003, + "loss": 4.2187, + "step": 3600 + }, + { + "epoch": 0.03601, + "grad_norm": 0.9636100541843863, + "learning_rate": 0.003, + "loss": 4.1988, + "step": 3601 + }, + { + "epoch": 0.03602, + "grad_norm": 1.043053481597312, + "learning_rate": 0.003, + "loss": 4.2019, + "step": 3602 + }, + { + "epoch": 0.03603, + "grad_norm": 0.8586008867305468, + "learning_rate": 0.003, + "loss": 4.1772, + "step": 3603 + }, + { + "epoch": 0.03604, + "grad_norm": 0.8633622676072363, + "learning_rate": 0.003, + "loss": 4.2215, + "step": 3604 + }, + { + "epoch": 0.03605, + "grad_norm": 0.8221166093366499, + "learning_rate": 0.003, + "loss": 4.213, + "step": 3605 + }, + { + "epoch": 0.03606, + "grad_norm": 0.7191787485946937, + "learning_rate": 0.003, + "loss": 4.1949, + "step": 3606 + }, + { + "epoch": 0.03607, + "grad_norm": 0.6624835833221929, + "learning_rate": 0.003, + "loss": 4.1849, + "step": 3607 + }, + { + "epoch": 0.03608, + "grad_norm": 0.6824569834328487, + "learning_rate": 0.003, + "loss": 4.1883, + "step": 3608 + }, + { + "epoch": 0.03609, + "grad_norm": 0.6457675177801978, + "learning_rate": 0.003, + "loss": 4.2132, + "step": 3609 + }, + { + "epoch": 0.0361, + "grad_norm": 0.6295884180353496, + "learning_rate": 0.003, + "loss": 4.2142, + "step": 3610 + }, + { + "epoch": 0.03611, + "grad_norm": 0.590169158296821, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 3611 + }, + { + "epoch": 0.03612, + "grad_norm": 0.5389921423268351, + "learning_rate": 0.003, + "loss": 4.153, + "step": 3612 + }, + { + "epoch": 0.03613, + "grad_norm": 0.5497537704200285, + "learning_rate": 0.003, + "loss": 4.1961, + "step": 3613 + }, + { + "epoch": 0.03614, + "grad_norm": 0.4843702888139074, + "learning_rate": 0.003, + "loss": 4.1672, + "step": 3614 + }, + { + "epoch": 0.03615, + "grad_norm": 0.48025803520879506, + "learning_rate": 0.003, + "loss": 4.1788, + "step": 3615 + }, + { + "epoch": 0.03616, + "grad_norm": 0.46348349208320905, + "learning_rate": 0.003, + "loss": 4.1857, + "step": 3616 + }, + { + "epoch": 0.03617, + "grad_norm": 0.39854001217238155, + "learning_rate": 0.003, + "loss": 4.1763, + "step": 3617 + }, + { + "epoch": 0.03618, + "grad_norm": 0.4027281929968038, + "learning_rate": 0.003, + "loss": 4.1671, + "step": 3618 + }, + { + "epoch": 0.03619, + "grad_norm": 0.3985420323335125, + "learning_rate": 0.003, + "loss": 4.1636, + "step": 3619 + }, + { + "epoch": 0.0362, + "grad_norm": 0.3613468610024924, + "learning_rate": 0.003, + "loss": 4.1676, + "step": 3620 + }, + { + "epoch": 0.03621, + "grad_norm": 0.33261587429198597, + "learning_rate": 0.003, + "loss": 4.1667, + "step": 3621 + }, + { + "epoch": 0.03622, + "grad_norm": 0.3403864350330926, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 3622 + }, + { + "epoch": 0.03623, + "grad_norm": 0.3583311023359255, + "learning_rate": 0.003, + "loss": 4.1243, + "step": 3623 + }, + { + "epoch": 0.03624, + "grad_norm": 0.32450649557353767, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 3624 + }, + { + "epoch": 0.03625, + "grad_norm": 0.3235232357698917, + "learning_rate": 0.003, + "loss": 4.1676, + "step": 3625 + }, + { + "epoch": 0.03626, + "grad_norm": 0.3029552071156344, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 3626 + }, + { + "epoch": 0.03627, + "grad_norm": 0.3634806878840478, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 3627 + }, + { + "epoch": 0.03628, + "grad_norm": 0.5031788929713537, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 3628 + }, + { + "epoch": 0.03629, + "grad_norm": 0.7314661052924161, + "learning_rate": 0.003, + "loss": 4.1821, + "step": 3629 + }, + { + "epoch": 0.0363, + "grad_norm": 0.9369830066901181, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 3630 + }, + { + "epoch": 0.03631, + "grad_norm": 0.7563300299386175, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 3631 + }, + { + "epoch": 0.03632, + "grad_norm": 0.6272780473551262, + "learning_rate": 0.003, + "loss": 4.1574, + "step": 3632 + }, + { + "epoch": 0.03633, + "grad_norm": 0.6625289405061192, + "learning_rate": 0.003, + "loss": 4.1907, + "step": 3633 + }, + { + "epoch": 0.03634, + "grad_norm": 0.624342554157624, + "learning_rate": 0.003, + "loss": 4.1627, + "step": 3634 + }, + { + "epoch": 0.03635, + "grad_norm": 0.583607515091326, + "learning_rate": 0.003, + "loss": 4.149, + "step": 3635 + }, + { + "epoch": 0.03636, + "grad_norm": 0.5584832759878093, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 3636 + }, + { + "epoch": 0.03637, + "grad_norm": 0.5837086725119016, + "learning_rate": 0.003, + "loss": 4.1561, + "step": 3637 + }, + { + "epoch": 0.03638, + "grad_norm": 0.6095521315464627, + "learning_rate": 0.003, + "loss": 4.1649, + "step": 3638 + }, + { + "epoch": 0.03639, + "grad_norm": 0.6657809000387206, + "learning_rate": 0.003, + "loss": 4.1922, + "step": 3639 + }, + { + "epoch": 0.0364, + "grad_norm": 0.6289647060691178, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 3640 + }, + { + "epoch": 0.03641, + "grad_norm": 0.6059605963156967, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3641 + }, + { + "epoch": 0.03642, + "grad_norm": 0.5705396060730448, + "learning_rate": 0.003, + "loss": 4.1875, + "step": 3642 + }, + { + "epoch": 0.03643, + "grad_norm": 0.6035912533971294, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 3643 + }, + { + "epoch": 0.03644, + "grad_norm": 0.6309633478231041, + "learning_rate": 0.003, + "loss": 4.1769, + "step": 3644 + }, + { + "epoch": 0.03645, + "grad_norm": 0.6355334191850317, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 3645 + }, + { + "epoch": 0.03646, + "grad_norm": 0.5786595124970733, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 3646 + }, + { + "epoch": 0.03647, + "grad_norm": 0.5949997765584221, + "learning_rate": 0.003, + "loss": 4.1642, + "step": 3647 + }, + { + "epoch": 0.03648, + "grad_norm": 0.5464060834426805, + "learning_rate": 0.003, + "loss": 4.1502, + "step": 3648 + }, + { + "epoch": 0.03649, + "grad_norm": 0.6161442180745826, + "learning_rate": 0.003, + "loss": 4.1777, + "step": 3649 + }, + { + "epoch": 0.0365, + "grad_norm": 0.7276354515874378, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 3650 + }, + { + "epoch": 0.03651, + "grad_norm": 0.879246794128947, + "learning_rate": 0.003, + "loss": 4.1938, + "step": 3651 + }, + { + "epoch": 0.03652, + "grad_norm": 0.8889526606107412, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 3652 + }, + { + "epoch": 0.03653, + "grad_norm": 0.7646404951633242, + "learning_rate": 0.003, + "loss": 4.162, + "step": 3653 + }, + { + "epoch": 0.03654, + "grad_norm": 0.7290317207893501, + "learning_rate": 0.003, + "loss": 4.1597, + "step": 3654 + }, + { + "epoch": 0.03655, + "grad_norm": 0.7063774172804314, + "learning_rate": 0.003, + "loss": 4.1865, + "step": 3655 + }, + { + "epoch": 0.03656, + "grad_norm": 0.7160866471184082, + "learning_rate": 0.003, + "loss": 4.1743, + "step": 3656 + }, + { + "epoch": 0.03657, + "grad_norm": 0.5821614514235085, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 3657 + }, + { + "epoch": 0.03658, + "grad_norm": 0.5963209078366205, + "learning_rate": 0.003, + "loss": 4.2061, + "step": 3658 + }, + { + "epoch": 0.03659, + "grad_norm": 0.5885163411512104, + "learning_rate": 0.003, + "loss": 4.1544, + "step": 3659 + }, + { + "epoch": 0.0366, + "grad_norm": 0.6724966576985278, + "learning_rate": 0.003, + "loss": 4.1869, + "step": 3660 + }, + { + "epoch": 0.03661, + "grad_norm": 0.70045611677563, + "learning_rate": 0.003, + "loss": 4.1682, + "step": 3661 + }, + { + "epoch": 0.03662, + "grad_norm": 0.7038628116184068, + "learning_rate": 0.003, + "loss": 4.1668, + "step": 3662 + }, + { + "epoch": 0.03663, + "grad_norm": 0.7338499926784156, + "learning_rate": 0.003, + "loss": 4.1859, + "step": 3663 + }, + { + "epoch": 0.03664, + "grad_norm": 0.6003408827366159, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 3664 + }, + { + "epoch": 0.03665, + "grad_norm": 0.531193738984965, + "learning_rate": 0.003, + "loss": 4.1816, + "step": 3665 + }, + { + "epoch": 0.03666, + "grad_norm": 0.502967345745313, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 3666 + }, + { + "epoch": 0.03667, + "grad_norm": 0.4403238489897598, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 3667 + }, + { + "epoch": 0.03668, + "grad_norm": 0.4214554821381715, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 3668 + }, + { + "epoch": 0.03669, + "grad_norm": 0.3786750496308303, + "learning_rate": 0.003, + "loss": 4.129, + "step": 3669 + }, + { + "epoch": 0.0367, + "grad_norm": 0.36562759077361934, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 3670 + }, + { + "epoch": 0.03671, + "grad_norm": 0.4132800551107581, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 3671 + }, + { + "epoch": 0.03672, + "grad_norm": 0.49256525507220217, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 3672 + }, + { + "epoch": 0.03673, + "grad_norm": 0.6349831894939928, + "learning_rate": 0.003, + "loss": 4.171, + "step": 3673 + }, + { + "epoch": 0.03674, + "grad_norm": 0.6952242122376082, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 3674 + }, + { + "epoch": 0.03675, + "grad_norm": 0.5660824888307048, + "learning_rate": 0.003, + "loss": 4.1736, + "step": 3675 + }, + { + "epoch": 0.03676, + "grad_norm": 0.4101727822433613, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 3676 + }, + { + "epoch": 0.03677, + "grad_norm": 0.4248844160991166, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 3677 + }, + { + "epoch": 0.03678, + "grad_norm": 0.40692106726610927, + "learning_rate": 0.003, + "loss": 4.1681, + "step": 3678 + }, + { + "epoch": 0.03679, + "grad_norm": 0.43656051390996525, + "learning_rate": 0.003, + "loss": 4.1576, + "step": 3679 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4651408055341787, + "learning_rate": 0.003, + "loss": 4.1504, + "step": 3680 + }, + { + "epoch": 0.03681, + "grad_norm": 0.4431552362299314, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 3681 + }, + { + "epoch": 0.03682, + "grad_norm": 0.45670367087714675, + "learning_rate": 0.003, + "loss": 4.1622, + "step": 3682 + }, + { + "epoch": 0.03683, + "grad_norm": 0.48732877269273184, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 3683 + }, + { + "epoch": 0.03684, + "grad_norm": 0.49569158763050314, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 3684 + }, + { + "epoch": 0.03685, + "grad_norm": 0.5827537649992007, + "learning_rate": 0.003, + "loss": 4.1345, + "step": 3685 + }, + { + "epoch": 0.03686, + "grad_norm": 0.7130056023766307, + "learning_rate": 0.003, + "loss": 4.1622, + "step": 3686 + }, + { + "epoch": 0.03687, + "grad_norm": 0.806080078993977, + "learning_rate": 0.003, + "loss": 4.1913, + "step": 3687 + }, + { + "epoch": 0.03688, + "grad_norm": 0.8654301422344852, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 3688 + }, + { + "epoch": 0.03689, + "grad_norm": 0.824342247624797, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 3689 + }, + { + "epoch": 0.0369, + "grad_norm": 0.8399398898767244, + "learning_rate": 0.003, + "loss": 4.1897, + "step": 3690 + }, + { + "epoch": 0.03691, + "grad_norm": 0.8376909355521104, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 3691 + }, + { + "epoch": 0.03692, + "grad_norm": 0.7664237094520844, + "learning_rate": 0.003, + "loss": 4.2095, + "step": 3692 + }, + { + "epoch": 0.03693, + "grad_norm": 0.6756940227740064, + "learning_rate": 0.003, + "loss": 4.1782, + "step": 3693 + }, + { + "epoch": 0.03694, + "grad_norm": 0.665602669017162, + "learning_rate": 0.003, + "loss": 4.1706, + "step": 3694 + }, + { + "epoch": 0.03695, + "grad_norm": 0.6845010372537103, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3695 + }, + { + "epoch": 0.03696, + "grad_norm": 0.8052078414912427, + "learning_rate": 0.003, + "loss": 4.1755, + "step": 3696 + }, + { + "epoch": 0.03697, + "grad_norm": 0.8787348855409773, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 3697 + }, + { + "epoch": 0.03698, + "grad_norm": 0.8247546282969409, + "learning_rate": 0.003, + "loss": 4.2121, + "step": 3698 + }, + { + "epoch": 0.03699, + "grad_norm": 0.7409801258428377, + "learning_rate": 0.003, + "loss": 4.1946, + "step": 3699 + }, + { + "epoch": 0.037, + "grad_norm": 0.6088451406774416, + "learning_rate": 0.003, + "loss": 4.1795, + "step": 3700 + }, + { + "epoch": 0.03701, + "grad_norm": 0.6322208971568581, + "learning_rate": 0.003, + "loss": 4.1844, + "step": 3701 + }, + { + "epoch": 0.03702, + "grad_norm": 0.5054476642910108, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 3702 + }, + { + "epoch": 0.03703, + "grad_norm": 0.5443386034292382, + "learning_rate": 0.003, + "loss": 4.1798, + "step": 3703 + }, + { + "epoch": 0.03704, + "grad_norm": 0.5944500200700061, + "learning_rate": 0.003, + "loss": 4.1823, + "step": 3704 + }, + { + "epoch": 0.03705, + "grad_norm": 0.600099616356784, + "learning_rate": 0.003, + "loss": 4.1957, + "step": 3705 + }, + { + "epoch": 0.03706, + "grad_norm": 0.6109788360926119, + "learning_rate": 0.003, + "loss": 4.182, + "step": 3706 + }, + { + "epoch": 0.03707, + "grad_norm": 0.5293706856345912, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 3707 + }, + { + "epoch": 0.03708, + "grad_norm": 0.47791373263744796, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 3708 + }, + { + "epoch": 0.03709, + "grad_norm": 0.49156286964072154, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 3709 + }, + { + "epoch": 0.0371, + "grad_norm": 0.5248101152432202, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 3710 + }, + { + "epoch": 0.03711, + "grad_norm": 0.6117031890743618, + "learning_rate": 0.003, + "loss": 4.1604, + "step": 3711 + }, + { + "epoch": 0.03712, + "grad_norm": 0.6254066152554677, + "learning_rate": 0.003, + "loss": 4.1614, + "step": 3712 + }, + { + "epoch": 0.03713, + "grad_norm": 0.6264703701067362, + "learning_rate": 0.003, + "loss": 4.1504, + "step": 3713 + }, + { + "epoch": 0.03714, + "grad_norm": 0.620653200418187, + "learning_rate": 0.003, + "loss": 4.1614, + "step": 3714 + }, + { + "epoch": 0.03715, + "grad_norm": 0.5248205552325349, + "learning_rate": 0.003, + "loss": 4.1655, + "step": 3715 + }, + { + "epoch": 0.03716, + "grad_norm": 0.5526084707582196, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 3716 + }, + { + "epoch": 0.03717, + "grad_norm": 0.49730839862475784, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 3717 + }, + { + "epoch": 0.03718, + "grad_norm": 0.462962673452178, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 3718 + }, + { + "epoch": 0.03719, + "grad_norm": 0.3930444861352458, + "learning_rate": 0.003, + "loss": 4.1539, + "step": 3719 + }, + { + "epoch": 0.0372, + "grad_norm": 0.40801634488970573, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 3720 + }, + { + "epoch": 0.03721, + "grad_norm": 0.44606614802333183, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 3721 + }, + { + "epoch": 0.03722, + "grad_norm": 0.39111226545505906, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 3722 + }, + { + "epoch": 0.03723, + "grad_norm": 0.4408949322752162, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 3723 + }, + { + "epoch": 0.03724, + "grad_norm": 0.5569410843865424, + "learning_rate": 0.003, + "loss": 4.156, + "step": 3724 + }, + { + "epoch": 0.03725, + "grad_norm": 0.6820788875058175, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3725 + }, + { + "epoch": 0.03726, + "grad_norm": 0.7697397803396343, + "learning_rate": 0.003, + "loss": 4.1495, + "step": 3726 + }, + { + "epoch": 0.03727, + "grad_norm": 0.7112467858706011, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 3727 + }, + { + "epoch": 0.03728, + "grad_norm": 0.6285060537833618, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 3728 + }, + { + "epoch": 0.03729, + "grad_norm": 0.5988137930316242, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 3729 + }, + { + "epoch": 0.0373, + "grad_norm": 0.6000038779888225, + "learning_rate": 0.003, + "loss": 4.1483, + "step": 3730 + }, + { + "epoch": 0.03731, + "grad_norm": 0.5727776342995737, + "learning_rate": 0.003, + "loss": 4.1728, + "step": 3731 + }, + { + "epoch": 0.03732, + "grad_norm": 0.47697561747753886, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 3732 + }, + { + "epoch": 0.03733, + "grad_norm": 0.44272447557143685, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 3733 + }, + { + "epoch": 0.03734, + "grad_norm": 0.4247027107702777, + "learning_rate": 0.003, + "loss": 4.145, + "step": 3734 + }, + { + "epoch": 0.03735, + "grad_norm": 0.4479925161214012, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 3735 + }, + { + "epoch": 0.03736, + "grad_norm": 0.45973685869578795, + "learning_rate": 0.003, + "loss": 4.1686, + "step": 3736 + }, + { + "epoch": 0.03737, + "grad_norm": 0.3920498911497691, + "learning_rate": 0.003, + "loss": 4.1531, + "step": 3737 + }, + { + "epoch": 0.03738, + "grad_norm": 0.4063251593566824, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 3738 + }, + { + "epoch": 0.03739, + "grad_norm": 0.46745772862008544, + "learning_rate": 0.003, + "loss": 4.135, + "step": 3739 + }, + { + "epoch": 0.0374, + "grad_norm": 0.46296090379338223, + "learning_rate": 0.003, + "loss": 4.1535, + "step": 3740 + }, + { + "epoch": 0.03741, + "grad_norm": 0.512997084501291, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 3741 + }, + { + "epoch": 0.03742, + "grad_norm": 0.6523245986952474, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 3742 + }, + { + "epoch": 0.03743, + "grad_norm": 0.7442806374508512, + "learning_rate": 0.003, + "loss": 4.1373, + "step": 3743 + }, + { + "epoch": 0.03744, + "grad_norm": 0.7320455988631113, + "learning_rate": 0.003, + "loss": 4.1716, + "step": 3744 + }, + { + "epoch": 0.03745, + "grad_norm": 0.7624968676067105, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 3745 + }, + { + "epoch": 0.03746, + "grad_norm": 0.7031928628117834, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 3746 + }, + { + "epoch": 0.03747, + "grad_norm": 0.6687793682231787, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3747 + }, + { + "epoch": 0.03748, + "grad_norm": 0.779694731392804, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 3748 + }, + { + "epoch": 0.03749, + "grad_norm": 0.7516521349267218, + "learning_rate": 0.003, + "loss": 4.1655, + "step": 3749 + }, + { + "epoch": 0.0375, + "grad_norm": 0.7321877471048407, + "learning_rate": 0.003, + "loss": 4.186, + "step": 3750 + }, + { + "epoch": 0.03751, + "grad_norm": 0.7130679243817225, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 3751 + }, + { + "epoch": 0.03752, + "grad_norm": 0.7452672741472625, + "learning_rate": 0.003, + "loss": 4.1674, + "step": 3752 + }, + { + "epoch": 0.03753, + "grad_norm": 0.6724770863912444, + "learning_rate": 0.003, + "loss": 4.1729, + "step": 3753 + }, + { + "epoch": 0.03754, + "grad_norm": 0.556817823672953, + "learning_rate": 0.003, + "loss": 4.1762, + "step": 3754 + }, + { + "epoch": 0.03755, + "grad_norm": 0.5579634308288945, + "learning_rate": 0.003, + "loss": 4.1683, + "step": 3755 + }, + { + "epoch": 0.03756, + "grad_norm": 0.6263580644537361, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 3756 + }, + { + "epoch": 0.03757, + "grad_norm": 0.6400148872312791, + "learning_rate": 0.003, + "loss": 4.176, + "step": 3757 + }, + { + "epoch": 0.03758, + "grad_norm": 0.6369106854442147, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 3758 + }, + { + "epoch": 0.03759, + "grad_norm": 0.5476477808333663, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 3759 + }, + { + "epoch": 0.0376, + "grad_norm": 0.6773838858228983, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 3760 + }, + { + "epoch": 0.03761, + "grad_norm": 0.7718542722389751, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 3761 + }, + { + "epoch": 0.03762, + "grad_norm": 0.6657955446674061, + "learning_rate": 0.003, + "loss": 4.1909, + "step": 3762 + }, + { + "epoch": 0.03763, + "grad_norm": 0.6194620722466222, + "learning_rate": 0.003, + "loss": 4.172, + "step": 3763 + }, + { + "epoch": 0.03764, + "grad_norm": 0.6595957894013659, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 3764 + }, + { + "epoch": 0.03765, + "grad_norm": 0.7419338588721475, + "learning_rate": 0.003, + "loss": 4.2111, + "step": 3765 + }, + { + "epoch": 0.03766, + "grad_norm": 0.7575024078950962, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 3766 + }, + { + "epoch": 0.03767, + "grad_norm": 0.6434009404162437, + "learning_rate": 0.003, + "loss": 4.1896, + "step": 3767 + }, + { + "epoch": 0.03768, + "grad_norm": 0.6051033538599371, + "learning_rate": 0.003, + "loss": 4.1655, + "step": 3768 + }, + { + "epoch": 0.03769, + "grad_norm": 0.581660518608867, + "learning_rate": 0.003, + "loss": 4.1566, + "step": 3769 + }, + { + "epoch": 0.0377, + "grad_norm": 0.47329598157123676, + "learning_rate": 0.003, + "loss": 4.1315, + "step": 3770 + }, + { + "epoch": 0.03771, + "grad_norm": 0.5353355178607329, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 3771 + }, + { + "epoch": 0.03772, + "grad_norm": 0.5081985686041285, + "learning_rate": 0.003, + "loss": 4.1668, + "step": 3772 + }, + { + "epoch": 0.03773, + "grad_norm": 0.4994313050826756, + "learning_rate": 0.003, + "loss": 4.1748, + "step": 3773 + }, + { + "epoch": 0.03774, + "grad_norm": 0.48919774898243473, + "learning_rate": 0.003, + "loss": 4.1567, + "step": 3774 + }, + { + "epoch": 0.03775, + "grad_norm": 0.46915381954128726, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 3775 + }, + { + "epoch": 0.03776, + "grad_norm": 0.46730858925326924, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 3776 + }, + { + "epoch": 0.03777, + "grad_norm": 0.4868019030635906, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 3777 + }, + { + "epoch": 0.03778, + "grad_norm": 0.4826030300886077, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 3778 + }, + { + "epoch": 0.03779, + "grad_norm": 0.4347204008546579, + "learning_rate": 0.003, + "loss": 4.151, + "step": 3779 + }, + { + "epoch": 0.0378, + "grad_norm": 0.4013702620235084, + "learning_rate": 0.003, + "loss": 4.149, + "step": 3780 + }, + { + "epoch": 0.03781, + "grad_norm": 0.4909799595844497, + "learning_rate": 0.003, + "loss": 4.1807, + "step": 3781 + }, + { + "epoch": 0.03782, + "grad_norm": 0.5827570172997248, + "learning_rate": 0.003, + "loss": 4.1562, + "step": 3782 + }, + { + "epoch": 0.03783, + "grad_norm": 0.7057652047796394, + "learning_rate": 0.003, + "loss": 4.1621, + "step": 3783 + }, + { + "epoch": 0.03784, + "grad_norm": 0.7176846202765269, + "learning_rate": 0.003, + "loss": 4.1645, + "step": 3784 + }, + { + "epoch": 0.03785, + "grad_norm": 0.6227082605371008, + "learning_rate": 0.003, + "loss": 4.1465, + "step": 3785 + }, + { + "epoch": 0.03786, + "grad_norm": 0.5851856366466184, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 3786 + }, + { + "epoch": 0.03787, + "grad_norm": 0.5827445998978293, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 3787 + }, + { + "epoch": 0.03788, + "grad_norm": 0.5961875200035882, + "learning_rate": 0.003, + "loss": 4.1832, + "step": 3788 + }, + { + "epoch": 0.03789, + "grad_norm": 0.5599813731691621, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 3789 + }, + { + "epoch": 0.0379, + "grad_norm": 0.5457520494795979, + "learning_rate": 0.003, + "loss": 4.137, + "step": 3790 + }, + { + "epoch": 0.03791, + "grad_norm": 0.5890367342982761, + "learning_rate": 0.003, + "loss": 4.152, + "step": 3791 + }, + { + "epoch": 0.03792, + "grad_norm": 0.6819220498588953, + "learning_rate": 0.003, + "loss": 4.1877, + "step": 3792 + }, + { + "epoch": 0.03793, + "grad_norm": 0.7245933962883803, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 3793 + }, + { + "epoch": 0.03794, + "grad_norm": 0.6330889319321834, + "learning_rate": 0.003, + "loss": 4.1803, + "step": 3794 + }, + { + "epoch": 0.03795, + "grad_norm": 0.6825420551312804, + "learning_rate": 0.003, + "loss": 4.1655, + "step": 3795 + }, + { + "epoch": 0.03796, + "grad_norm": 0.7213382120258517, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 3796 + }, + { + "epoch": 0.03797, + "grad_norm": 0.753053475755318, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 3797 + }, + { + "epoch": 0.03798, + "grad_norm": 0.8024496845778784, + "learning_rate": 0.003, + "loss": 4.163, + "step": 3798 + }, + { + "epoch": 0.03799, + "grad_norm": 0.6761853816469585, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3799 + }, + { + "epoch": 0.038, + "grad_norm": 0.5760561821833747, + "learning_rate": 0.003, + "loss": 4.1741, + "step": 3800 + }, + { + "epoch": 0.03801, + "grad_norm": 0.5877988869253646, + "learning_rate": 0.003, + "loss": 4.1763, + "step": 3801 + }, + { + "epoch": 0.03802, + "grad_norm": 0.6488463807188865, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 3802 + }, + { + "epoch": 0.03803, + "grad_norm": 0.5517421440712098, + "learning_rate": 0.003, + "loss": 4.177, + "step": 3803 + }, + { + "epoch": 0.03804, + "grad_norm": 0.5096073674154153, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 3804 + }, + { + "epoch": 0.03805, + "grad_norm": 0.5032605295974328, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 3805 + }, + { + "epoch": 0.03806, + "grad_norm": 0.515161268146513, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 3806 + }, + { + "epoch": 0.03807, + "grad_norm": 0.5868281956315563, + "learning_rate": 0.003, + "loss": 4.154, + "step": 3807 + }, + { + "epoch": 0.03808, + "grad_norm": 0.6126977458856417, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3808 + }, + { + "epoch": 0.03809, + "grad_norm": 0.5958831138264383, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 3809 + }, + { + "epoch": 0.0381, + "grad_norm": 0.5125381608413851, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 3810 + }, + { + "epoch": 0.03811, + "grad_norm": 0.46806634569742234, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 3811 + }, + { + "epoch": 0.03812, + "grad_norm": 0.4708164911957734, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 3812 + }, + { + "epoch": 0.03813, + "grad_norm": 0.4451707961484543, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 3813 + }, + { + "epoch": 0.03814, + "grad_norm": 0.440545036907673, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 3814 + }, + { + "epoch": 0.03815, + "grad_norm": 0.4171348679565834, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 3815 + }, + { + "epoch": 0.03816, + "grad_norm": 0.553176653360458, + "learning_rate": 0.003, + "loss": 4.148, + "step": 3816 + }, + { + "epoch": 0.03817, + "grad_norm": 0.8249856806613923, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 3817 + }, + { + "epoch": 0.03818, + "grad_norm": 1.0849772720701052, + "learning_rate": 0.003, + "loss": 4.2073, + "step": 3818 + }, + { + "epoch": 0.03819, + "grad_norm": 0.8469500316270079, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 3819 + }, + { + "epoch": 0.0382, + "grad_norm": 0.707642322044435, + "learning_rate": 0.003, + "loss": 4.2001, + "step": 3820 + }, + { + "epoch": 0.03821, + "grad_norm": 0.747958147518101, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 3821 + }, + { + "epoch": 0.03822, + "grad_norm": 0.6648920321229015, + "learning_rate": 0.003, + "loss": 4.169, + "step": 3822 + }, + { + "epoch": 0.03823, + "grad_norm": 0.6528306662073633, + "learning_rate": 0.003, + "loss": 4.1635, + "step": 3823 + }, + { + "epoch": 0.03824, + "grad_norm": 0.760735694975771, + "learning_rate": 0.003, + "loss": 4.1651, + "step": 3824 + }, + { + "epoch": 0.03825, + "grad_norm": 0.8622307424344766, + "learning_rate": 0.003, + "loss": 4.185, + "step": 3825 + }, + { + "epoch": 0.03826, + "grad_norm": 0.8866334244520664, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 3826 + }, + { + "epoch": 0.03827, + "grad_norm": 0.8924274619717957, + "learning_rate": 0.003, + "loss": 4.206, + "step": 3827 + }, + { + "epoch": 0.03828, + "grad_norm": 0.7065036502476953, + "learning_rate": 0.003, + "loss": 4.178, + "step": 3828 + }, + { + "epoch": 0.03829, + "grad_norm": 0.6969933866633966, + "learning_rate": 0.003, + "loss": 4.1837, + "step": 3829 + }, + { + "epoch": 0.0383, + "grad_norm": 0.6465184385629209, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 3830 + }, + { + "epoch": 0.03831, + "grad_norm": 0.7285929638974894, + "learning_rate": 0.003, + "loss": 4.1731, + "step": 3831 + }, + { + "epoch": 0.03832, + "grad_norm": 0.7578366971294306, + "learning_rate": 0.003, + "loss": 4.1714, + "step": 3832 + }, + { + "epoch": 0.03833, + "grad_norm": 0.7591256877673351, + "learning_rate": 0.003, + "loss": 4.173, + "step": 3833 + }, + { + "epoch": 0.03834, + "grad_norm": 0.6570705623790875, + "learning_rate": 0.003, + "loss": 4.2013, + "step": 3834 + }, + { + "epoch": 0.03835, + "grad_norm": 0.6338048148231775, + "learning_rate": 0.003, + "loss": 4.1848, + "step": 3835 + }, + { + "epoch": 0.03836, + "grad_norm": 0.579305584931518, + "learning_rate": 0.003, + "loss": 4.1872, + "step": 3836 + }, + { + "epoch": 0.03837, + "grad_norm": 0.5000040915409231, + "learning_rate": 0.003, + "loss": 4.1685, + "step": 3837 + }, + { + "epoch": 0.03838, + "grad_norm": 0.48980765754438066, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 3838 + }, + { + "epoch": 0.03839, + "grad_norm": 0.44391262328684045, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 3839 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4581943030154349, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 3840 + }, + { + "epoch": 0.03841, + "grad_norm": 0.4824433708702651, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 3841 + }, + { + "epoch": 0.03842, + "grad_norm": 0.4830219498091642, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 3842 + }, + { + "epoch": 0.03843, + "grad_norm": 0.45884626845612136, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 3843 + }, + { + "epoch": 0.03844, + "grad_norm": 0.3942504840023139, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 3844 + }, + { + "epoch": 0.03845, + "grad_norm": 0.35673387851966176, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 3845 + }, + { + "epoch": 0.03846, + "grad_norm": 0.3480023206810377, + "learning_rate": 0.003, + "loss": 4.1675, + "step": 3846 + }, + { + "epoch": 0.03847, + "grad_norm": 0.30772952512029117, + "learning_rate": 0.003, + "loss": 4.153, + "step": 3847 + }, + { + "epoch": 0.03848, + "grad_norm": 0.351053429769811, + "learning_rate": 0.003, + "loss": 4.138, + "step": 3848 + }, + { + "epoch": 0.03849, + "grad_norm": 0.3486839774780466, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 3849 + }, + { + "epoch": 0.0385, + "grad_norm": 0.41203805300045565, + "learning_rate": 0.003, + "loss": 4.1624, + "step": 3850 + }, + { + "epoch": 0.03851, + "grad_norm": 0.5052943186225257, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 3851 + }, + { + "epoch": 0.03852, + "grad_norm": 0.6267509920431387, + "learning_rate": 0.003, + "loss": 4.167, + "step": 3852 + }, + { + "epoch": 0.03853, + "grad_norm": 0.7787688899298432, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 3853 + }, + { + "epoch": 0.03854, + "grad_norm": 0.7725109769746114, + "learning_rate": 0.003, + "loss": 4.1654, + "step": 3854 + }, + { + "epoch": 0.03855, + "grad_norm": 0.6052355534312069, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 3855 + }, + { + "epoch": 0.03856, + "grad_norm": 0.6299503142898346, + "learning_rate": 0.003, + "loss": 4.1483, + "step": 3856 + }, + { + "epoch": 0.03857, + "grad_norm": 0.5706246944559047, + "learning_rate": 0.003, + "loss": 4.1532, + "step": 3857 + }, + { + "epoch": 0.03858, + "grad_norm": 0.5178970385116678, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 3858 + }, + { + "epoch": 0.03859, + "grad_norm": 0.6163137266901324, + "learning_rate": 0.003, + "loss": 4.1717, + "step": 3859 + }, + { + "epoch": 0.0386, + "grad_norm": 0.5584392387461609, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 3860 + }, + { + "epoch": 0.03861, + "grad_norm": 0.5881922942664491, + "learning_rate": 0.003, + "loss": 4.174, + "step": 3861 + }, + { + "epoch": 0.03862, + "grad_norm": 0.6070887580362981, + "learning_rate": 0.003, + "loss": 4.1499, + "step": 3862 + }, + { + "epoch": 0.03863, + "grad_norm": 0.5412974380781377, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 3863 + }, + { + "epoch": 0.03864, + "grad_norm": 0.558239316328612, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 3864 + }, + { + "epoch": 0.03865, + "grad_norm": 0.5920559813538258, + "learning_rate": 0.003, + "loss": 4.1625, + "step": 3865 + }, + { + "epoch": 0.03866, + "grad_norm": 0.65782610303018, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 3866 + }, + { + "epoch": 0.03867, + "grad_norm": 0.8001945832132011, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 3867 + }, + { + "epoch": 0.03868, + "grad_norm": 0.8093398470542099, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 3868 + }, + { + "epoch": 0.03869, + "grad_norm": 0.8272905784155873, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 3869 + }, + { + "epoch": 0.0387, + "grad_norm": 0.7805492019719857, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 3870 + }, + { + "epoch": 0.03871, + "grad_norm": 0.6463277445729251, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 3871 + }, + { + "epoch": 0.03872, + "grad_norm": 0.7130625269699555, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 3872 + }, + { + "epoch": 0.03873, + "grad_norm": 0.7604226657571129, + "learning_rate": 0.003, + "loss": 4.1892, + "step": 3873 + }, + { + "epoch": 0.03874, + "grad_norm": 0.8264003715319567, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 3874 + }, + { + "epoch": 0.03875, + "grad_norm": 0.8157133390523014, + "learning_rate": 0.003, + "loss": 4.1706, + "step": 3875 + }, + { + "epoch": 0.03876, + "grad_norm": 0.6633759602344576, + "learning_rate": 0.003, + "loss": 4.1981, + "step": 3876 + }, + { + "epoch": 0.03877, + "grad_norm": 0.5945213411893223, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 3877 + }, + { + "epoch": 0.03878, + "grad_norm": 0.6057373256262768, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 3878 + }, + { + "epoch": 0.03879, + "grad_norm": 0.632481493308064, + "learning_rate": 0.003, + "loss": 4.1887, + "step": 3879 + }, + { + "epoch": 0.0388, + "grad_norm": 0.5936273104909171, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 3880 + }, + { + "epoch": 0.03881, + "grad_norm": 0.5078096495379679, + "learning_rate": 0.003, + "loss": 4.1746, + "step": 3881 + }, + { + "epoch": 0.03882, + "grad_norm": 0.47981556545878357, + "learning_rate": 0.003, + "loss": 4.1795, + "step": 3882 + }, + { + "epoch": 0.03883, + "grad_norm": 0.5081271003225738, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 3883 + }, + { + "epoch": 0.03884, + "grad_norm": 0.5324630916346529, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 3884 + }, + { + "epoch": 0.03885, + "grad_norm": 0.5770047266124201, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 3885 + }, + { + "epoch": 0.03886, + "grad_norm": 0.5823626216641113, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 3886 + }, + { + "epoch": 0.03887, + "grad_norm": 0.5323044500683095, + "learning_rate": 0.003, + "loss": 4.1628, + "step": 3887 + }, + { + "epoch": 0.03888, + "grad_norm": 0.6470253491308557, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 3888 + }, + { + "epoch": 0.03889, + "grad_norm": 0.7016901913145263, + "learning_rate": 0.003, + "loss": 4.1667, + "step": 3889 + }, + { + "epoch": 0.0389, + "grad_norm": 0.6102524549297166, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 3890 + }, + { + "epoch": 0.03891, + "grad_norm": 0.5529440776190121, + "learning_rate": 0.003, + "loss": 4.1557, + "step": 3891 + }, + { + "epoch": 0.03892, + "grad_norm": 0.5916779639985591, + "learning_rate": 0.003, + "loss": 4.172, + "step": 3892 + }, + { + "epoch": 0.03893, + "grad_norm": 0.6828403164964358, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 3893 + }, + { + "epoch": 0.03894, + "grad_norm": 0.6193514242120052, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 3894 + }, + { + "epoch": 0.03895, + "grad_norm": 0.5599659326192736, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 3895 + }, + { + "epoch": 0.03896, + "grad_norm": 0.48762045043395263, + "learning_rate": 0.003, + "loss": 4.1557, + "step": 3896 + }, + { + "epoch": 0.03897, + "grad_norm": 0.5042861977924956, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 3897 + }, + { + "epoch": 0.03898, + "grad_norm": 0.487281141563187, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 3898 + }, + { + "epoch": 0.03899, + "grad_norm": 0.5187455846326631, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 3899 + }, + { + "epoch": 0.039, + "grad_norm": 0.4540397798848912, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 3900 + }, + { + "epoch": 0.03901, + "grad_norm": 0.4735936186553674, + "learning_rate": 0.003, + "loss": 4.1619, + "step": 3901 + }, + { + "epoch": 0.03902, + "grad_norm": 0.45921775589104796, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 3902 + }, + { + "epoch": 0.03903, + "grad_norm": 0.37493975839614513, + "learning_rate": 0.003, + "loss": 4.168, + "step": 3903 + }, + { + "epoch": 0.03904, + "grad_norm": 0.3462652687764147, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 3904 + }, + { + "epoch": 0.03905, + "grad_norm": 0.35156300254567974, + "learning_rate": 0.003, + "loss": 4.1648, + "step": 3905 + }, + { + "epoch": 0.03906, + "grad_norm": 0.3615192109464765, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 3906 + }, + { + "epoch": 0.03907, + "grad_norm": 0.4018823550496774, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 3907 + }, + { + "epoch": 0.03908, + "grad_norm": 0.5764826701055724, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 3908 + }, + { + "epoch": 0.03909, + "grad_norm": 0.7886220228031054, + "learning_rate": 0.003, + "loss": 4.1504, + "step": 3909 + }, + { + "epoch": 0.0391, + "grad_norm": 0.8778590965436065, + "learning_rate": 0.003, + "loss": 4.1885, + "step": 3910 + }, + { + "epoch": 0.03911, + "grad_norm": 0.7707501748681591, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 3911 + }, + { + "epoch": 0.03912, + "grad_norm": 0.7237965058473473, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 3912 + }, + { + "epoch": 0.03913, + "grad_norm": 0.7684494783386372, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 3913 + }, + { + "epoch": 0.03914, + "grad_norm": 0.6559500648333249, + "learning_rate": 0.003, + "loss": 4.1776, + "step": 3914 + }, + { + "epoch": 0.03915, + "grad_norm": 0.6438695734989948, + "learning_rate": 0.003, + "loss": 4.149, + "step": 3915 + }, + { + "epoch": 0.03916, + "grad_norm": 0.710001018742664, + "learning_rate": 0.003, + "loss": 4.1781, + "step": 3916 + }, + { + "epoch": 0.03917, + "grad_norm": 0.7219014702450186, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 3917 + }, + { + "epoch": 0.03918, + "grad_norm": 0.6133683335088463, + "learning_rate": 0.003, + "loss": 4.156, + "step": 3918 + }, + { + "epoch": 0.03919, + "grad_norm": 0.4840575228811507, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 3919 + }, + { + "epoch": 0.0392, + "grad_norm": 0.4695659280359604, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 3920 + }, + { + "epoch": 0.03921, + "grad_norm": 0.4597028009194431, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 3921 + }, + { + "epoch": 0.03922, + "grad_norm": 0.4021059704956593, + "learning_rate": 0.003, + "loss": 4.1754, + "step": 3922 + }, + { + "epoch": 0.03923, + "grad_norm": 0.45811748316459755, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 3923 + }, + { + "epoch": 0.03924, + "grad_norm": 0.4786080372228159, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 3924 + }, + { + "epoch": 0.03925, + "grad_norm": 0.5049165752143754, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 3925 + }, + { + "epoch": 0.03926, + "grad_norm": 0.5163104662428779, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 3926 + }, + { + "epoch": 0.03927, + "grad_norm": 0.5099171230784774, + "learning_rate": 0.003, + "loss": 4.1492, + "step": 3927 + }, + { + "epoch": 0.03928, + "grad_norm": 0.572288419562524, + "learning_rate": 0.003, + "loss": 4.168, + "step": 3928 + }, + { + "epoch": 0.03929, + "grad_norm": 0.6901560668624227, + "learning_rate": 0.003, + "loss": 4.1812, + "step": 3929 + }, + { + "epoch": 0.0393, + "grad_norm": 0.6037962837223242, + "learning_rate": 0.003, + "loss": 4.1643, + "step": 3930 + }, + { + "epoch": 0.03931, + "grad_norm": 0.5251957250436882, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 3931 + }, + { + "epoch": 0.03932, + "grad_norm": 0.538831165664605, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 3932 + }, + { + "epoch": 0.03933, + "grad_norm": 0.5209838890496463, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 3933 + }, + { + "epoch": 0.03934, + "grad_norm": 0.566519679434704, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 3934 + }, + { + "epoch": 0.03935, + "grad_norm": 0.646396237695394, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 3935 + }, + { + "epoch": 0.03936, + "grad_norm": 0.7750930535443175, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 3936 + }, + { + "epoch": 0.03937, + "grad_norm": 0.9869397924658896, + "learning_rate": 0.003, + "loss": 4.168, + "step": 3937 + }, + { + "epoch": 0.03938, + "grad_norm": 1.0277905519905985, + "learning_rate": 0.003, + "loss": 4.1836, + "step": 3938 + }, + { + "epoch": 0.03939, + "grad_norm": 0.7852266987902085, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 3939 + }, + { + "epoch": 0.0394, + "grad_norm": 0.8232677591358063, + "learning_rate": 0.003, + "loss": 4.1891, + "step": 3940 + }, + { + "epoch": 0.03941, + "grad_norm": 0.7554935915430424, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 3941 + }, + { + "epoch": 0.03942, + "grad_norm": 0.5714665309206746, + "learning_rate": 0.003, + "loss": 4.1794, + "step": 3942 + }, + { + "epoch": 0.03943, + "grad_norm": 0.5845658325896066, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 3943 + }, + { + "epoch": 0.03944, + "grad_norm": 0.6026107955836698, + "learning_rate": 0.003, + "loss": 4.1867, + "step": 3944 + }, + { + "epoch": 0.03945, + "grad_norm": 0.5468006864617043, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 3945 + }, + { + "epoch": 0.03946, + "grad_norm": 0.5298095903489612, + "learning_rate": 0.003, + "loss": 4.1779, + "step": 3946 + }, + { + "epoch": 0.03947, + "grad_norm": 0.5118480688773221, + "learning_rate": 0.003, + "loss": 4.1795, + "step": 3947 + }, + { + "epoch": 0.03948, + "grad_norm": 0.5415983011837967, + "learning_rate": 0.003, + "loss": 4.1971, + "step": 3948 + }, + { + "epoch": 0.03949, + "grad_norm": 0.6476511273829448, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 3949 + }, + { + "epoch": 0.0395, + "grad_norm": 0.5921282759053738, + "learning_rate": 0.003, + "loss": 4.1694, + "step": 3950 + }, + { + "epoch": 0.03951, + "grad_norm": 0.4873072132844621, + "learning_rate": 0.003, + "loss": 4.1943, + "step": 3951 + }, + { + "epoch": 0.03952, + "grad_norm": 0.47680587697404614, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 3952 + }, + { + "epoch": 0.03953, + "grad_norm": 0.4863348841149029, + "learning_rate": 0.003, + "loss": 4.1495, + "step": 3953 + }, + { + "epoch": 0.03954, + "grad_norm": 0.4976353402048533, + "learning_rate": 0.003, + "loss": 4.1532, + "step": 3954 + }, + { + "epoch": 0.03955, + "grad_norm": 0.5951063481718434, + "learning_rate": 0.003, + "loss": 4.1575, + "step": 3955 + }, + { + "epoch": 0.03956, + "grad_norm": 0.6359841084362824, + "learning_rate": 0.003, + "loss": 4.1616, + "step": 3956 + }, + { + "epoch": 0.03957, + "grad_norm": 0.5941354341152176, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 3957 + }, + { + "epoch": 0.03958, + "grad_norm": 0.6262454009808719, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 3958 + }, + { + "epoch": 0.03959, + "grad_norm": 0.7154633677223874, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 3959 + }, + { + "epoch": 0.0396, + "grad_norm": 0.7081920422281349, + "learning_rate": 0.003, + "loss": 4.165, + "step": 3960 + }, + { + "epoch": 0.03961, + "grad_norm": 0.7397573168118693, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 3961 + }, + { + "epoch": 0.03962, + "grad_norm": 0.711337260874705, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 3962 + }, + { + "epoch": 0.03963, + "grad_norm": 0.7123884385146033, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 3963 + }, + { + "epoch": 0.03964, + "grad_norm": 0.6152211153875224, + "learning_rate": 0.003, + "loss": 4.1597, + "step": 3964 + }, + { + "epoch": 0.03965, + "grad_norm": 0.591222143630805, + "learning_rate": 0.003, + "loss": 4.1579, + "step": 3965 + }, + { + "epoch": 0.03966, + "grad_norm": 0.5392802901033877, + "learning_rate": 0.003, + "loss": 4.1935, + "step": 3966 + }, + { + "epoch": 0.03967, + "grad_norm": 0.524610568064759, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 3967 + }, + { + "epoch": 0.03968, + "grad_norm": 0.5121602181555871, + "learning_rate": 0.003, + "loss": 4.1754, + "step": 3968 + }, + { + "epoch": 0.03969, + "grad_norm": 0.5008822737603466, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 3969 + }, + { + "epoch": 0.0397, + "grad_norm": 0.5150356493385037, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 3970 + }, + { + "epoch": 0.03971, + "grad_norm": 0.5181126377410241, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 3971 + }, + { + "epoch": 0.03972, + "grad_norm": 0.4952868090920291, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 3972 + }, + { + "epoch": 0.03973, + "grad_norm": 0.49146340749665485, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 3973 + }, + { + "epoch": 0.03974, + "grad_norm": 0.6037509933252784, + "learning_rate": 0.003, + "loss": 4.1574, + "step": 3974 + }, + { + "epoch": 0.03975, + "grad_norm": 0.7194350601933044, + "learning_rate": 0.003, + "loss": 4.152, + "step": 3975 + }, + { + "epoch": 0.03976, + "grad_norm": 0.7807821793201645, + "learning_rate": 0.003, + "loss": 4.189, + "step": 3976 + }, + { + "epoch": 0.03977, + "grad_norm": 0.7824814786642659, + "learning_rate": 0.003, + "loss": 4.18, + "step": 3977 + }, + { + "epoch": 0.03978, + "grad_norm": 0.7382196107400837, + "learning_rate": 0.003, + "loss": 4.145, + "step": 3978 + }, + { + "epoch": 0.03979, + "grad_norm": 0.7187722022394311, + "learning_rate": 0.003, + "loss": 4.1683, + "step": 3979 + }, + { + "epoch": 0.0398, + "grad_norm": 0.7054934205273465, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 3980 + }, + { + "epoch": 0.03981, + "grad_norm": 0.7032054871032929, + "learning_rate": 0.003, + "loss": 4.1547, + "step": 3981 + }, + { + "epoch": 0.03982, + "grad_norm": 0.5823608521179618, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 3982 + }, + { + "epoch": 0.03983, + "grad_norm": 0.5786298940783224, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 3983 + }, + { + "epoch": 0.03984, + "grad_norm": 0.6246079403898089, + "learning_rate": 0.003, + "loss": 4.166, + "step": 3984 + }, + { + "epoch": 0.03985, + "grad_norm": 0.6818628523024235, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 3985 + }, + { + "epoch": 0.03986, + "grad_norm": 0.6535731478840459, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 3986 + }, + { + "epoch": 0.03987, + "grad_norm": 0.6040151659435402, + "learning_rate": 0.003, + "loss": 4.1639, + "step": 3987 + }, + { + "epoch": 0.03988, + "grad_norm": 0.6453741519043461, + "learning_rate": 0.003, + "loss": 4.1707, + "step": 3988 + }, + { + "epoch": 0.03989, + "grad_norm": 0.764312719470282, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 3989 + }, + { + "epoch": 0.0399, + "grad_norm": 0.8495293415117621, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 3990 + }, + { + "epoch": 0.03991, + "grad_norm": 0.7402933337405657, + "learning_rate": 0.003, + "loss": 4.183, + "step": 3991 + }, + { + "epoch": 0.03992, + "grad_norm": 0.5804226786236383, + "learning_rate": 0.003, + "loss": 4.1586, + "step": 3992 + }, + { + "epoch": 0.03993, + "grad_norm": 0.586169093303552, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 3993 + }, + { + "epoch": 0.03994, + "grad_norm": 0.6041046229351503, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 3994 + }, + { + "epoch": 0.03995, + "grad_norm": 0.6875437069134418, + "learning_rate": 0.003, + "loss": 4.1441, + "step": 3995 + }, + { + "epoch": 0.03996, + "grad_norm": 0.5847418717553203, + "learning_rate": 0.003, + "loss": 4.1722, + "step": 3996 + }, + { + "epoch": 0.03997, + "grad_norm": 0.5323079012210317, + "learning_rate": 0.003, + "loss": 4.1722, + "step": 3997 + }, + { + "epoch": 0.03998, + "grad_norm": 0.4356783084468044, + "learning_rate": 0.003, + "loss": 4.1713, + "step": 3998 + }, + { + "epoch": 0.03999, + "grad_norm": 0.3763086485452268, + "learning_rate": 0.003, + "loss": 4.1692, + "step": 3999 + }, + { + "epoch": 0.04, + "grad_norm": 0.37795109500013657, + "learning_rate": 0.003, + "loss": 4.1715, + "step": 4000 + }, + { + "epoch": 0.04001, + "grad_norm": 0.3830959820048188, + "learning_rate": 0.003, + "loss": 4.167, + "step": 4001 + }, + { + "epoch": 0.04002, + "grad_norm": 0.41146487178002505, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 4002 + }, + { + "epoch": 0.04003, + "grad_norm": 0.4313910001981467, + "learning_rate": 0.003, + "loss": 4.1612, + "step": 4003 + }, + { + "epoch": 0.04004, + "grad_norm": 0.4459746247604169, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 4004 + }, + { + "epoch": 0.04005, + "grad_norm": 0.47718734644902294, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 4005 + }, + { + "epoch": 0.04006, + "grad_norm": 0.4814752963605302, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 4006 + }, + { + "epoch": 0.04007, + "grad_norm": 0.5099778221457298, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4007 + }, + { + "epoch": 0.04008, + "grad_norm": 0.5043675363902501, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 4008 + }, + { + "epoch": 0.04009, + "grad_norm": 0.5439841205133068, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 4009 + }, + { + "epoch": 0.0401, + "grad_norm": 0.6497881885617821, + "learning_rate": 0.003, + "loss": 4.155, + "step": 4010 + }, + { + "epoch": 0.04011, + "grad_norm": 0.8245370470535895, + "learning_rate": 0.003, + "loss": 4.1694, + "step": 4011 + }, + { + "epoch": 0.04012, + "grad_norm": 0.9674009929255976, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 4012 + }, + { + "epoch": 0.04013, + "grad_norm": 0.7353158349829942, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 4013 + }, + { + "epoch": 0.04014, + "grad_norm": 0.7081307887263109, + "learning_rate": 0.003, + "loss": 4.1588, + "step": 4014 + }, + { + "epoch": 0.04015, + "grad_norm": 0.7515395751430206, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 4015 + }, + { + "epoch": 0.04016, + "grad_norm": 0.7357741460180915, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 4016 + }, + { + "epoch": 0.04017, + "grad_norm": 0.6881306027184227, + "learning_rate": 0.003, + "loss": 4.2004, + "step": 4017 + }, + { + "epoch": 0.04018, + "grad_norm": 0.6328656616117202, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 4018 + }, + { + "epoch": 0.04019, + "grad_norm": 0.672111331766413, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4019 + }, + { + "epoch": 0.0402, + "grad_norm": 0.6935269300068305, + "learning_rate": 0.003, + "loss": 4.1825, + "step": 4020 + }, + { + "epoch": 0.04021, + "grad_norm": 0.6368533158642944, + "learning_rate": 0.003, + "loss": 4.1532, + "step": 4021 + }, + { + "epoch": 0.04022, + "grad_norm": 0.7051716536344202, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 4022 + }, + { + "epoch": 0.04023, + "grad_norm": 0.6469926590464656, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 4023 + }, + { + "epoch": 0.04024, + "grad_norm": 0.6694771111384129, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 4024 + }, + { + "epoch": 0.04025, + "grad_norm": 0.6341741755537447, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 4025 + }, + { + "epoch": 0.04026, + "grad_norm": 0.6640885632809859, + "learning_rate": 0.003, + "loss": 4.1767, + "step": 4026 + }, + { + "epoch": 0.04027, + "grad_norm": 0.6089194432075719, + "learning_rate": 0.003, + "loss": 4.1801, + "step": 4027 + }, + { + "epoch": 0.04028, + "grad_norm": 0.6436111853565991, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 4028 + }, + { + "epoch": 0.04029, + "grad_norm": 0.601196906060423, + "learning_rate": 0.003, + "loss": 4.1851, + "step": 4029 + }, + { + "epoch": 0.0403, + "grad_norm": 0.6008464331523574, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 4030 + }, + { + "epoch": 0.04031, + "grad_norm": 0.5735266100756939, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 4031 + }, + { + "epoch": 0.04032, + "grad_norm": 0.5248413725968446, + "learning_rate": 0.003, + "loss": 4.1768, + "step": 4032 + }, + { + "epoch": 0.04033, + "grad_norm": 0.49046370370450504, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 4033 + }, + { + "epoch": 0.04034, + "grad_norm": 0.5090112067595912, + "learning_rate": 0.003, + "loss": 4.136, + "step": 4034 + }, + { + "epoch": 0.04035, + "grad_norm": 0.4649874752654264, + "learning_rate": 0.003, + "loss": 4.1578, + "step": 4035 + }, + { + "epoch": 0.04036, + "grad_norm": 0.4897958199323308, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 4036 + }, + { + "epoch": 0.04037, + "grad_norm": 0.6044670370792896, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 4037 + }, + { + "epoch": 0.04038, + "grad_norm": 0.8335470507296103, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 4038 + }, + { + "epoch": 0.04039, + "grad_norm": 0.8645850468976461, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 4039 + }, + { + "epoch": 0.0404, + "grad_norm": 0.7530842619650773, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 4040 + }, + { + "epoch": 0.04041, + "grad_norm": 0.5842132443480766, + "learning_rate": 0.003, + "loss": 4.1457, + "step": 4041 + }, + { + "epoch": 0.04042, + "grad_norm": 0.5354331761654865, + "learning_rate": 0.003, + "loss": 4.1835, + "step": 4042 + }, + { + "epoch": 0.04043, + "grad_norm": 0.5185871828066747, + "learning_rate": 0.003, + "loss": 4.1689, + "step": 4043 + }, + { + "epoch": 0.04044, + "grad_norm": 0.5097132412370223, + "learning_rate": 0.003, + "loss": 4.162, + "step": 4044 + }, + { + "epoch": 0.04045, + "grad_norm": 0.553775594849916, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 4045 + }, + { + "epoch": 0.04046, + "grad_norm": 0.49141462695183785, + "learning_rate": 0.003, + "loss": 4.1533, + "step": 4046 + }, + { + "epoch": 0.04047, + "grad_norm": 0.41512219304607756, + "learning_rate": 0.003, + "loss": 4.1427, + "step": 4047 + }, + { + "epoch": 0.04048, + "grad_norm": 0.4225143586443279, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 4048 + }, + { + "epoch": 0.04049, + "grad_norm": 0.3984988970584974, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 4049 + }, + { + "epoch": 0.0405, + "grad_norm": 0.3749419268913753, + "learning_rate": 0.003, + "loss": 4.1684, + "step": 4050 + }, + { + "epoch": 0.04051, + "grad_norm": 0.39476946001794994, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4051 + }, + { + "epoch": 0.04052, + "grad_norm": 0.4554884596116926, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 4052 + }, + { + "epoch": 0.04053, + "grad_norm": 0.6242383494718413, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 4053 + }, + { + "epoch": 0.04054, + "grad_norm": 0.7860677992363685, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 4054 + }, + { + "epoch": 0.04055, + "grad_norm": 0.8126593846409768, + "learning_rate": 0.003, + "loss": 4.137, + "step": 4055 + }, + { + "epoch": 0.04056, + "grad_norm": 0.7391710444931487, + "learning_rate": 0.003, + "loss": 4.1597, + "step": 4056 + }, + { + "epoch": 0.04057, + "grad_norm": 0.7425519640955306, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 4057 + }, + { + "epoch": 0.04058, + "grad_norm": 0.7750101755361524, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 4058 + }, + { + "epoch": 0.04059, + "grad_norm": 0.6970458468533857, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 4059 + }, + { + "epoch": 0.0406, + "grad_norm": 0.6081117396919562, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 4060 + }, + { + "epoch": 0.04061, + "grad_norm": 0.539360737374925, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 4061 + }, + { + "epoch": 0.04062, + "grad_norm": 0.5166363975835321, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 4062 + }, + { + "epoch": 0.04063, + "grad_norm": 0.49611935522557493, + "learning_rate": 0.003, + "loss": 4.1435, + "step": 4063 + }, + { + "epoch": 0.04064, + "grad_norm": 0.41285257049195684, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 4064 + }, + { + "epoch": 0.04065, + "grad_norm": 0.4763881231604338, + "learning_rate": 0.003, + "loss": 4.193, + "step": 4065 + }, + { + "epoch": 0.04066, + "grad_norm": 0.4741067038687965, + "learning_rate": 0.003, + "loss": 4.15, + "step": 4066 + }, + { + "epoch": 0.04067, + "grad_norm": 0.4426062959642298, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 4067 + }, + { + "epoch": 0.04068, + "grad_norm": 0.42915467353070186, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4068 + }, + { + "epoch": 0.04069, + "grad_norm": 0.45412191308027944, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4069 + }, + { + "epoch": 0.0407, + "grad_norm": 0.4671086228860357, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 4070 + }, + { + "epoch": 0.04071, + "grad_norm": 0.5692230139112596, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 4071 + }, + { + "epoch": 0.04072, + "grad_norm": 0.7764468905836547, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4072 + }, + { + "epoch": 0.04073, + "grad_norm": 0.8448322221505851, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 4073 + }, + { + "epoch": 0.04074, + "grad_norm": 0.856020875384491, + "learning_rate": 0.003, + "loss": 4.1745, + "step": 4074 + }, + { + "epoch": 0.04075, + "grad_norm": 0.9287167181671592, + "learning_rate": 0.003, + "loss": 4.1779, + "step": 4075 + }, + { + "epoch": 0.04076, + "grad_norm": 0.873329112405605, + "learning_rate": 0.003, + "loss": 4.1789, + "step": 4076 + }, + { + "epoch": 0.04077, + "grad_norm": 0.7601580397215363, + "learning_rate": 0.003, + "loss": 4.1881, + "step": 4077 + }, + { + "epoch": 0.04078, + "grad_norm": 0.8042586607784782, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 4078 + }, + { + "epoch": 0.04079, + "grad_norm": 0.8176950474454038, + "learning_rate": 0.003, + "loss": 4.1771, + "step": 4079 + }, + { + "epoch": 0.0408, + "grad_norm": 0.9030758743289882, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 4080 + }, + { + "epoch": 0.04081, + "grad_norm": 0.7986927315562776, + "learning_rate": 0.003, + "loss": 4.1573, + "step": 4081 + }, + { + "epoch": 0.04082, + "grad_norm": 0.7456348575122663, + "learning_rate": 0.003, + "loss": 4.1933, + "step": 4082 + }, + { + "epoch": 0.04083, + "grad_norm": 0.7464945818151806, + "learning_rate": 0.003, + "loss": 4.1693, + "step": 4083 + }, + { + "epoch": 0.04084, + "grad_norm": 0.5910864081536713, + "learning_rate": 0.003, + "loss": 4.2007, + "step": 4084 + }, + { + "epoch": 0.04085, + "grad_norm": 0.6374107110244114, + "learning_rate": 0.003, + "loss": 4.1813, + "step": 4085 + }, + { + "epoch": 0.04086, + "grad_norm": 0.644544973240151, + "learning_rate": 0.003, + "loss": 4.2032, + "step": 4086 + }, + { + "epoch": 0.04087, + "grad_norm": 0.6474810812977669, + "learning_rate": 0.003, + "loss": 4.1827, + "step": 4087 + }, + { + "epoch": 0.04088, + "grad_norm": 0.48757799745230485, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 4088 + }, + { + "epoch": 0.04089, + "grad_norm": 0.5082604112884798, + "learning_rate": 0.003, + "loss": 4.1548, + "step": 4089 + }, + { + "epoch": 0.0409, + "grad_norm": 0.4251634976124169, + "learning_rate": 0.003, + "loss": 4.1671, + "step": 4090 + }, + { + "epoch": 0.04091, + "grad_norm": 0.40607507200162474, + "learning_rate": 0.003, + "loss": 4.1758, + "step": 4091 + }, + { + "epoch": 0.04092, + "grad_norm": 0.4118371741461281, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 4092 + }, + { + "epoch": 0.04093, + "grad_norm": 0.379924610879027, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 4093 + }, + { + "epoch": 0.04094, + "grad_norm": 0.43768923530899545, + "learning_rate": 0.003, + "loss": 4.1461, + "step": 4094 + }, + { + "epoch": 0.04095, + "grad_norm": 0.411023776294053, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 4095 + }, + { + "epoch": 0.04096, + "grad_norm": 0.45768098717184347, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 4096 + }, + { + "epoch": 0.04097, + "grad_norm": 0.557314037287976, + "learning_rate": 0.003, + "loss": 4.1461, + "step": 4097 + }, + { + "epoch": 0.04098, + "grad_norm": 0.6699585119473419, + "learning_rate": 0.003, + "loss": 4.1559, + "step": 4098 + }, + { + "epoch": 0.04099, + "grad_norm": 0.6624628981660053, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 4099 + }, + { + "epoch": 0.041, + "grad_norm": 0.4938757392791384, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 4100 + }, + { + "epoch": 0.04101, + "grad_norm": 0.41979137569186115, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4101 + }, + { + "epoch": 0.04102, + "grad_norm": 0.49582634443681156, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 4102 + }, + { + "epoch": 0.04103, + "grad_norm": 0.46586436339945675, + "learning_rate": 0.003, + "loss": 4.166, + "step": 4103 + }, + { + "epoch": 0.04104, + "grad_norm": 0.45341411443238716, + "learning_rate": 0.003, + "loss": 4.1493, + "step": 4104 + }, + { + "epoch": 0.04105, + "grad_norm": 0.39192920748405663, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 4105 + }, + { + "epoch": 0.04106, + "grad_norm": 0.40403661174930955, + "learning_rate": 0.003, + "loss": 4.1493, + "step": 4106 + }, + { + "epoch": 0.04107, + "grad_norm": 0.46259866064643457, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 4107 + }, + { + "epoch": 0.04108, + "grad_norm": 0.5224958035249453, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 4108 + }, + { + "epoch": 0.04109, + "grad_norm": 0.5925525541895633, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 4109 + }, + { + "epoch": 0.0411, + "grad_norm": 0.6870748923239856, + "learning_rate": 0.003, + "loss": 4.142, + "step": 4110 + }, + { + "epoch": 0.04111, + "grad_norm": 0.6433679126097386, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 4111 + }, + { + "epoch": 0.04112, + "grad_norm": 0.5391529554622597, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 4112 + }, + { + "epoch": 0.04113, + "grad_norm": 0.5453895793045661, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 4113 + }, + { + "epoch": 0.04114, + "grad_norm": 0.6398919299706648, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 4114 + }, + { + "epoch": 0.04115, + "grad_norm": 0.5856750517364259, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 4115 + }, + { + "epoch": 0.04116, + "grad_norm": 0.5640052575174052, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4116 + }, + { + "epoch": 0.04117, + "grad_norm": 0.5578995001301751, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 4117 + }, + { + "epoch": 0.04118, + "grad_norm": 0.6189606161266092, + "learning_rate": 0.003, + "loss": 4.1802, + "step": 4118 + }, + { + "epoch": 0.04119, + "grad_norm": 0.69284741853494, + "learning_rate": 0.003, + "loss": 4.1574, + "step": 4119 + }, + { + "epoch": 0.0412, + "grad_norm": 0.8122773604588729, + "learning_rate": 0.003, + "loss": 4.1568, + "step": 4120 + }, + { + "epoch": 0.04121, + "grad_norm": 0.7788291402367661, + "learning_rate": 0.003, + "loss": 4.1805, + "step": 4121 + }, + { + "epoch": 0.04122, + "grad_norm": 0.7205856618274178, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 4122 + }, + { + "epoch": 0.04123, + "grad_norm": 0.6723108027243482, + "learning_rate": 0.003, + "loss": 4.1631, + "step": 4123 + }, + { + "epoch": 0.04124, + "grad_norm": 0.5742707663821022, + "learning_rate": 0.003, + "loss": 4.138, + "step": 4124 + }, + { + "epoch": 0.04125, + "grad_norm": 0.5921662326911094, + "learning_rate": 0.003, + "loss": 4.1635, + "step": 4125 + }, + { + "epoch": 0.04126, + "grad_norm": 0.5065182010593955, + "learning_rate": 0.003, + "loss": 4.151, + "step": 4126 + }, + { + "epoch": 0.04127, + "grad_norm": 0.49270157976785267, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 4127 + }, + { + "epoch": 0.04128, + "grad_norm": 0.47833247711412197, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 4128 + }, + { + "epoch": 0.04129, + "grad_norm": 0.6245977843098431, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 4129 + }, + { + "epoch": 0.0413, + "grad_norm": 0.7299723124473214, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 4130 + }, + { + "epoch": 0.04131, + "grad_norm": 0.7562574300910655, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 4131 + }, + { + "epoch": 0.04132, + "grad_norm": 0.7690429680218056, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 4132 + }, + { + "epoch": 0.04133, + "grad_norm": 0.7790262395321094, + "learning_rate": 0.003, + "loss": 4.1785, + "step": 4133 + }, + { + "epoch": 0.04134, + "grad_norm": 0.7036209580453355, + "learning_rate": 0.003, + "loss": 4.1681, + "step": 4134 + }, + { + "epoch": 0.04135, + "grad_norm": 0.7431461288088913, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 4135 + }, + { + "epoch": 0.04136, + "grad_norm": 0.7470037276613238, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4136 + }, + { + "epoch": 0.04137, + "grad_norm": 0.8449453146727118, + "learning_rate": 0.003, + "loss": 4.1972, + "step": 4137 + }, + { + "epoch": 0.04138, + "grad_norm": 0.6568218214769733, + "learning_rate": 0.003, + "loss": 4.1742, + "step": 4138 + }, + { + "epoch": 0.04139, + "grad_norm": 0.6371205783656457, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 4139 + }, + { + "epoch": 0.0414, + "grad_norm": 0.724881052006821, + "learning_rate": 0.003, + "loss": 4.176, + "step": 4140 + }, + { + "epoch": 0.04141, + "grad_norm": 0.7146032900181981, + "learning_rate": 0.003, + "loss": 4.1537, + "step": 4141 + }, + { + "epoch": 0.04142, + "grad_norm": 0.8156917337253932, + "learning_rate": 0.003, + "loss": 4.1811, + "step": 4142 + }, + { + "epoch": 0.04143, + "grad_norm": 0.8785836902260167, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 4143 + }, + { + "epoch": 0.04144, + "grad_norm": 0.7999443682311421, + "learning_rate": 0.003, + "loss": 4.1764, + "step": 4144 + }, + { + "epoch": 0.04145, + "grad_norm": 0.82858324833918, + "learning_rate": 0.003, + "loss": 4.1621, + "step": 4145 + }, + { + "epoch": 0.04146, + "grad_norm": 0.8235890608340272, + "learning_rate": 0.003, + "loss": 4.1882, + "step": 4146 + }, + { + "epoch": 0.04147, + "grad_norm": 0.858747508934936, + "learning_rate": 0.003, + "loss": 4.2011, + "step": 4147 + }, + { + "epoch": 0.04148, + "grad_norm": 0.7838271345732937, + "learning_rate": 0.003, + "loss": 4.1568, + "step": 4148 + }, + { + "epoch": 0.04149, + "grad_norm": 0.7263149003922105, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 4149 + }, + { + "epoch": 0.0415, + "grad_norm": 0.6942802417486142, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 4150 + }, + { + "epoch": 0.04151, + "grad_norm": 0.6248553867172758, + "learning_rate": 0.003, + "loss": 4.1813, + "step": 4151 + }, + { + "epoch": 0.04152, + "grad_norm": 0.6112599353340841, + "learning_rate": 0.003, + "loss": 4.1796, + "step": 4152 + }, + { + "epoch": 0.04153, + "grad_norm": 0.5629861808054807, + "learning_rate": 0.003, + "loss": 4.143, + "step": 4153 + }, + { + "epoch": 0.04154, + "grad_norm": 0.5649468221105395, + "learning_rate": 0.003, + "loss": 4.1757, + "step": 4154 + }, + { + "epoch": 0.04155, + "grad_norm": 0.5286657686017564, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 4155 + }, + { + "epoch": 0.04156, + "grad_norm": 0.4616483876621516, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 4156 + }, + { + "epoch": 0.04157, + "grad_norm": 0.43208170797618906, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 4157 + }, + { + "epoch": 0.04158, + "grad_norm": 0.47690558549648143, + "learning_rate": 0.003, + "loss": 4.1765, + "step": 4158 + }, + { + "epoch": 0.04159, + "grad_norm": 0.548409828074487, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 4159 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5628758636950728, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 4160 + }, + { + "epoch": 0.04161, + "grad_norm": 0.6122953755048903, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 4161 + }, + { + "epoch": 0.04162, + "grad_norm": 0.5313833560293301, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 4162 + }, + { + "epoch": 0.04163, + "grad_norm": 0.4545671521128938, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 4163 + }, + { + "epoch": 0.04164, + "grad_norm": 0.48581450867041054, + "learning_rate": 0.003, + "loss": 4.169, + "step": 4164 + }, + { + "epoch": 0.04165, + "grad_norm": 0.48634350202313903, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 4165 + }, + { + "epoch": 0.04166, + "grad_norm": 0.42940905174204824, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 4166 + }, + { + "epoch": 0.04167, + "grad_norm": 0.39230202494683986, + "learning_rate": 0.003, + "loss": 4.1526, + "step": 4167 + }, + { + "epoch": 0.04168, + "grad_norm": 0.39538400082145897, + "learning_rate": 0.003, + "loss": 4.1535, + "step": 4168 + }, + { + "epoch": 0.04169, + "grad_norm": 0.3447120216665393, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4169 + }, + { + "epoch": 0.0417, + "grad_norm": 0.36603795136119954, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 4170 + }, + { + "epoch": 0.04171, + "grad_norm": 0.38374101547618106, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 4171 + }, + { + "epoch": 0.04172, + "grad_norm": 0.5162775730950696, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 4172 + }, + { + "epoch": 0.04173, + "grad_norm": 0.5760736714087062, + "learning_rate": 0.003, + "loss": 4.1592, + "step": 4173 + }, + { + "epoch": 0.04174, + "grad_norm": 0.5576881465067848, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 4174 + }, + { + "epoch": 0.04175, + "grad_norm": 0.6077258689017496, + "learning_rate": 0.003, + "loss": 4.1771, + "step": 4175 + }, + { + "epoch": 0.04176, + "grad_norm": 0.6416498201931042, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 4176 + }, + { + "epoch": 0.04177, + "grad_norm": 0.5865172170220208, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 4177 + }, + { + "epoch": 0.04178, + "grad_norm": 0.6121973272164818, + "learning_rate": 0.003, + "loss": 4.1495, + "step": 4178 + }, + { + "epoch": 0.04179, + "grad_norm": 0.7466591370840786, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4179 + }, + { + "epoch": 0.0418, + "grad_norm": 0.7407658974314641, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 4180 + }, + { + "epoch": 0.04181, + "grad_norm": 0.7876489798566603, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4181 + }, + { + "epoch": 0.04182, + "grad_norm": 0.6878533187156349, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 4182 + }, + { + "epoch": 0.04183, + "grad_norm": 0.6899295854896734, + "learning_rate": 0.003, + "loss": 4.1428, + "step": 4183 + }, + { + "epoch": 0.04184, + "grad_norm": 0.6688165143357564, + "learning_rate": 0.003, + "loss": 4.1979, + "step": 4184 + }, + { + "epoch": 0.04185, + "grad_norm": 0.6585905689769214, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 4185 + }, + { + "epoch": 0.04186, + "grad_norm": 0.706982773019547, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 4186 + }, + { + "epoch": 0.04187, + "grad_norm": 0.7286914544536822, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 4187 + }, + { + "epoch": 0.04188, + "grad_norm": 0.7190589294145873, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 4188 + }, + { + "epoch": 0.04189, + "grad_norm": 0.7351396129481365, + "learning_rate": 0.003, + "loss": 4.1607, + "step": 4189 + }, + { + "epoch": 0.0419, + "grad_norm": 0.7646912861138266, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 4190 + }, + { + "epoch": 0.04191, + "grad_norm": 0.8912342474994416, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 4191 + }, + { + "epoch": 0.04192, + "grad_norm": 0.8349167750236294, + "learning_rate": 0.003, + "loss": 4.155, + "step": 4192 + }, + { + "epoch": 0.04193, + "grad_norm": 0.8412143448929614, + "learning_rate": 0.003, + "loss": 4.139, + "step": 4193 + }, + { + "epoch": 0.04194, + "grad_norm": 0.7468627479626141, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 4194 + }, + { + "epoch": 0.04195, + "grad_norm": 0.675962819896126, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 4195 + }, + { + "epoch": 0.04196, + "grad_norm": 0.6834824762686134, + "learning_rate": 0.003, + "loss": 4.143, + "step": 4196 + }, + { + "epoch": 0.04197, + "grad_norm": 0.6470038410606324, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 4197 + }, + { + "epoch": 0.04198, + "grad_norm": 0.6260474494865785, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 4198 + }, + { + "epoch": 0.04199, + "grad_norm": 0.6233390255040572, + "learning_rate": 0.003, + "loss": 4.1821, + "step": 4199 + }, + { + "epoch": 0.042, + "grad_norm": 0.5419194177090347, + "learning_rate": 0.003, + "loss": 4.1585, + "step": 4200 + }, + { + "epoch": 0.04201, + "grad_norm": 0.4956778815254003, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 4201 + }, + { + "epoch": 0.04202, + "grad_norm": 0.4424272529732883, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 4202 + }, + { + "epoch": 0.04203, + "grad_norm": 0.4059745715455596, + "learning_rate": 0.003, + "loss": 4.1669, + "step": 4203 + }, + { + "epoch": 0.04204, + "grad_norm": 0.3551788544607894, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 4204 + }, + { + "epoch": 0.04205, + "grad_norm": 0.36710978367184766, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 4205 + }, + { + "epoch": 0.04206, + "grad_norm": 0.37067515609326535, + "learning_rate": 0.003, + "loss": 4.1466, + "step": 4206 + }, + { + "epoch": 0.04207, + "grad_norm": 0.3909030628374286, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 4207 + }, + { + "epoch": 0.04208, + "grad_norm": 0.3512353865469784, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 4208 + }, + { + "epoch": 0.04209, + "grad_norm": 0.3222350876286574, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 4209 + }, + { + "epoch": 0.0421, + "grad_norm": 0.33699670679266025, + "learning_rate": 0.003, + "loss": 4.103, + "step": 4210 + }, + { + "epoch": 0.04211, + "grad_norm": 0.397353308681094, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 4211 + }, + { + "epoch": 0.04212, + "grad_norm": 0.4950752479622572, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4212 + }, + { + "epoch": 0.04213, + "grad_norm": 0.6908228603245805, + "learning_rate": 0.003, + "loss": 4.1243, + "step": 4213 + }, + { + "epoch": 0.04214, + "grad_norm": 0.8171734853482235, + "learning_rate": 0.003, + "loss": 4.137, + "step": 4214 + }, + { + "epoch": 0.04215, + "grad_norm": 0.8297471255167204, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 4215 + }, + { + "epoch": 0.04216, + "grad_norm": 0.8171229032271536, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 4216 + }, + { + "epoch": 0.04217, + "grad_norm": 0.7119480146884566, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 4217 + }, + { + "epoch": 0.04218, + "grad_norm": 0.7835951039751599, + "learning_rate": 0.003, + "loss": 4.1866, + "step": 4218 + }, + { + "epoch": 0.04219, + "grad_norm": 0.7238721651116449, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 4219 + }, + { + "epoch": 0.0422, + "grad_norm": 0.6704096903245251, + "learning_rate": 0.003, + "loss": 4.1709, + "step": 4220 + }, + { + "epoch": 0.04221, + "grad_norm": 0.6137772128626461, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 4221 + }, + { + "epoch": 0.04222, + "grad_norm": 0.6312616992519636, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 4222 + }, + { + "epoch": 0.04223, + "grad_norm": 0.7309830489217892, + "learning_rate": 0.003, + "loss": 4.179, + "step": 4223 + }, + { + "epoch": 0.04224, + "grad_norm": 0.8353004708644942, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 4224 + }, + { + "epoch": 0.04225, + "grad_norm": 0.8671668423571915, + "learning_rate": 0.003, + "loss": 4.1793, + "step": 4225 + }, + { + "epoch": 0.04226, + "grad_norm": 0.7341929503846356, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 4226 + }, + { + "epoch": 0.04227, + "grad_norm": 0.6287599565012932, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 4227 + }, + { + "epoch": 0.04228, + "grad_norm": 0.594514738745631, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 4228 + }, + { + "epoch": 0.04229, + "grad_norm": 0.6229152206319786, + "learning_rate": 0.003, + "loss": 4.1691, + "step": 4229 + }, + { + "epoch": 0.0423, + "grad_norm": 0.6207671248596826, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 4230 + }, + { + "epoch": 0.04231, + "grad_norm": 0.622706916945179, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 4231 + }, + { + "epoch": 0.04232, + "grad_norm": 0.6141015860612352, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 4232 + }, + { + "epoch": 0.04233, + "grad_norm": 0.5575040241097531, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 4233 + }, + { + "epoch": 0.04234, + "grad_norm": 0.5779738605421891, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4234 + }, + { + "epoch": 0.04235, + "grad_norm": 0.6628268406822522, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 4235 + }, + { + "epoch": 0.04236, + "grad_norm": 0.5848188526492533, + "learning_rate": 0.003, + "loss": 4.1428, + "step": 4236 + }, + { + "epoch": 0.04237, + "grad_norm": 0.4606869163911272, + "learning_rate": 0.003, + "loss": 4.109, + "step": 4237 + }, + { + "epoch": 0.04238, + "grad_norm": 0.44274972044788236, + "learning_rate": 0.003, + "loss": 4.1695, + "step": 4238 + }, + { + "epoch": 0.04239, + "grad_norm": 0.4300796410055906, + "learning_rate": 0.003, + "loss": 4.15, + "step": 4239 + }, + { + "epoch": 0.0424, + "grad_norm": 0.3931497226975424, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 4240 + }, + { + "epoch": 0.04241, + "grad_norm": 0.44468609283123023, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 4241 + }, + { + "epoch": 0.04242, + "grad_norm": 0.42999491902207304, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 4242 + }, + { + "epoch": 0.04243, + "grad_norm": 0.45444243019546493, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 4243 + }, + { + "epoch": 0.04244, + "grad_norm": 0.4177997103887438, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 4244 + }, + { + "epoch": 0.04245, + "grad_norm": 0.4042766192791119, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 4245 + }, + { + "epoch": 0.04246, + "grad_norm": 0.49465297913164696, + "learning_rate": 0.003, + "loss": 4.1578, + "step": 4246 + }, + { + "epoch": 0.04247, + "grad_norm": 0.7023548593149965, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 4247 + }, + { + "epoch": 0.04248, + "grad_norm": 0.944713514521764, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 4248 + }, + { + "epoch": 0.04249, + "grad_norm": 0.9299347494145838, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 4249 + }, + { + "epoch": 0.0425, + "grad_norm": 0.7051640509051822, + "learning_rate": 0.003, + "loss": 4.1595, + "step": 4250 + }, + { + "epoch": 0.04251, + "grad_norm": 0.6812273114666765, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 4251 + }, + { + "epoch": 0.04252, + "grad_norm": 0.6230213953232125, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 4252 + }, + { + "epoch": 0.04253, + "grad_norm": 0.6107603990750144, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 4253 + }, + { + "epoch": 0.04254, + "grad_norm": 0.5714927077917459, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 4254 + }, + { + "epoch": 0.04255, + "grad_norm": 0.5988656811798401, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 4255 + }, + { + "epoch": 0.04256, + "grad_norm": 0.6394087469893323, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4256 + }, + { + "epoch": 0.04257, + "grad_norm": 0.5967251959756738, + "learning_rate": 0.003, + "loss": 4.1615, + "step": 4257 + }, + { + "epoch": 0.04258, + "grad_norm": 0.5945289215936239, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 4258 + }, + { + "epoch": 0.04259, + "grad_norm": 0.5867660553093561, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 4259 + }, + { + "epoch": 0.0426, + "grad_norm": 0.48714297403717527, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 4260 + }, + { + "epoch": 0.04261, + "grad_norm": 0.4560387546134075, + "learning_rate": 0.003, + "loss": 4.1483, + "step": 4261 + }, + { + "epoch": 0.04262, + "grad_norm": 0.43694421386753557, + "learning_rate": 0.003, + "loss": 4.1656, + "step": 4262 + }, + { + "epoch": 0.04263, + "grad_norm": 0.474489343873346, + "learning_rate": 0.003, + "loss": 4.1465, + "step": 4263 + }, + { + "epoch": 0.04264, + "grad_norm": 0.48969639817641303, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4264 + }, + { + "epoch": 0.04265, + "grad_norm": 0.48935865768007264, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 4265 + }, + { + "epoch": 0.04266, + "grad_norm": 0.4739244721175001, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 4266 + }, + { + "epoch": 0.04267, + "grad_norm": 0.48494742028757803, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 4267 + }, + { + "epoch": 0.04268, + "grad_norm": 0.5422961243923582, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 4268 + }, + { + "epoch": 0.04269, + "grad_norm": 0.5460647987973027, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 4269 + }, + { + "epoch": 0.0427, + "grad_norm": 0.608961907674264, + "learning_rate": 0.003, + "loss": 4.134, + "step": 4270 + }, + { + "epoch": 0.04271, + "grad_norm": 0.6558443912098731, + "learning_rate": 0.003, + "loss": 4.1504, + "step": 4271 + }, + { + "epoch": 0.04272, + "grad_norm": 0.663079753729842, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 4272 + }, + { + "epoch": 0.04273, + "grad_norm": 0.7592770165276204, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 4273 + }, + { + "epoch": 0.04274, + "grad_norm": 0.6676368129361678, + "learning_rate": 0.003, + "loss": 4.1466, + "step": 4274 + }, + { + "epoch": 0.04275, + "grad_norm": 0.6623028472426901, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 4275 + }, + { + "epoch": 0.04276, + "grad_norm": 0.6624560252528698, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 4276 + }, + { + "epoch": 0.04277, + "grad_norm": 0.6210345683298933, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 4277 + }, + { + "epoch": 0.04278, + "grad_norm": 0.5991391029139438, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4278 + }, + { + "epoch": 0.04279, + "grad_norm": 0.6350138668928702, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 4279 + }, + { + "epoch": 0.0428, + "grad_norm": 0.6849523452457494, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 4280 + }, + { + "epoch": 0.04281, + "grad_norm": 0.6921302141996111, + "learning_rate": 0.003, + "loss": 4.1786, + "step": 4281 + }, + { + "epoch": 0.04282, + "grad_norm": 0.756590885346882, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 4282 + }, + { + "epoch": 0.04283, + "grad_norm": 0.7245456429489706, + "learning_rate": 0.003, + "loss": 4.1927, + "step": 4283 + }, + { + "epoch": 0.04284, + "grad_norm": 0.6672309834707763, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4284 + }, + { + "epoch": 0.04285, + "grad_norm": 0.6246574113348472, + "learning_rate": 0.003, + "loss": 4.157, + "step": 4285 + }, + { + "epoch": 0.04286, + "grad_norm": 0.4846532441701148, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 4286 + }, + { + "epoch": 0.04287, + "grad_norm": 0.49611202994371745, + "learning_rate": 0.003, + "loss": 4.1484, + "step": 4287 + }, + { + "epoch": 0.04288, + "grad_norm": 0.49123248349204507, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 4288 + }, + { + "epoch": 0.04289, + "grad_norm": 0.4112901668688879, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 4289 + }, + { + "epoch": 0.0429, + "grad_norm": 0.38028576043319295, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 4290 + }, + { + "epoch": 0.04291, + "grad_norm": 0.3996251945749772, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 4291 + }, + { + "epoch": 0.04292, + "grad_norm": 0.43349568162385765, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 4292 + }, + { + "epoch": 0.04293, + "grad_norm": 0.5220793875805044, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 4293 + }, + { + "epoch": 0.04294, + "grad_norm": 0.6050495154218054, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 4294 + }, + { + "epoch": 0.04295, + "grad_norm": 0.7860329987891421, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 4295 + }, + { + "epoch": 0.04296, + "grad_norm": 0.9004405617366277, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 4296 + }, + { + "epoch": 0.04297, + "grad_norm": 0.8479021177195584, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 4297 + }, + { + "epoch": 0.04298, + "grad_norm": 0.7336646368221134, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 4298 + }, + { + "epoch": 0.04299, + "grad_norm": 0.7078429400943396, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 4299 + }, + { + "epoch": 0.043, + "grad_norm": 0.7513235769819381, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4300 + }, + { + "epoch": 0.04301, + "grad_norm": 0.8326441152753217, + "learning_rate": 0.003, + "loss": 4.1932, + "step": 4301 + }, + { + "epoch": 0.04302, + "grad_norm": 0.6815941455473161, + "learning_rate": 0.003, + "loss": 4.176, + "step": 4302 + }, + { + "epoch": 0.04303, + "grad_norm": 0.6890232000405212, + "learning_rate": 0.003, + "loss": 4.1666, + "step": 4303 + }, + { + "epoch": 0.04304, + "grad_norm": 0.6319808958937929, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 4304 + }, + { + "epoch": 0.04305, + "grad_norm": 0.5899508711069417, + "learning_rate": 0.003, + "loss": 4.1579, + "step": 4305 + }, + { + "epoch": 0.04306, + "grad_norm": 0.5383422790486718, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 4306 + }, + { + "epoch": 0.04307, + "grad_norm": 0.48802486139669815, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 4307 + }, + { + "epoch": 0.04308, + "grad_norm": 0.5309500644286907, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 4308 + }, + { + "epoch": 0.04309, + "grad_norm": 0.5454729884280429, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 4309 + }, + { + "epoch": 0.0431, + "grad_norm": 0.5943838417760108, + "learning_rate": 0.003, + "loss": 4.163, + "step": 4310 + }, + { + "epoch": 0.04311, + "grad_norm": 0.6278301410987507, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 4311 + }, + { + "epoch": 0.04312, + "grad_norm": 0.523985785894097, + "learning_rate": 0.003, + "loss": 4.134, + "step": 4312 + }, + { + "epoch": 0.04313, + "grad_norm": 0.4476351084578838, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 4313 + }, + { + "epoch": 0.04314, + "grad_norm": 0.43202288706395875, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 4314 + }, + { + "epoch": 0.04315, + "grad_norm": 0.47058217727703, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 4315 + }, + { + "epoch": 0.04316, + "grad_norm": 0.5151993523236311, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 4316 + }, + { + "epoch": 0.04317, + "grad_norm": 0.6504034739996317, + "learning_rate": 0.003, + "loss": 4.1687, + "step": 4317 + }, + { + "epoch": 0.04318, + "grad_norm": 0.722856030914651, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 4318 + }, + { + "epoch": 0.04319, + "grad_norm": 0.7408662854196157, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 4319 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6108907111379617, + "learning_rate": 0.003, + "loss": 4.131, + "step": 4320 + }, + { + "epoch": 0.04321, + "grad_norm": 0.5580728193515315, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 4321 + }, + { + "epoch": 0.04322, + "grad_norm": 0.6293577640178539, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 4322 + }, + { + "epoch": 0.04323, + "grad_norm": 0.680052451325024, + "learning_rate": 0.003, + "loss": 4.139, + "step": 4323 + }, + { + "epoch": 0.04324, + "grad_norm": 0.6420666473310551, + "learning_rate": 0.003, + "loss": 4.134, + "step": 4324 + }, + { + "epoch": 0.04325, + "grad_norm": 0.7103856885257132, + "learning_rate": 0.003, + "loss": 4.1668, + "step": 4325 + }, + { + "epoch": 0.04326, + "grad_norm": 0.6921778251891293, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 4326 + }, + { + "epoch": 0.04327, + "grad_norm": 0.688475290776878, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 4327 + }, + { + "epoch": 0.04328, + "grad_norm": 0.6182648113705878, + "learning_rate": 0.003, + "loss": 4.1423, + "step": 4328 + }, + { + "epoch": 0.04329, + "grad_norm": 0.6893506649601877, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 4329 + }, + { + "epoch": 0.0433, + "grad_norm": 0.7382201356692273, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 4330 + }, + { + "epoch": 0.04331, + "grad_norm": 0.6908297901661757, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 4331 + }, + { + "epoch": 0.04332, + "grad_norm": 0.6538224132935573, + "learning_rate": 0.003, + "loss": 4.1798, + "step": 4332 + }, + { + "epoch": 0.04333, + "grad_norm": 0.5839042658168669, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 4333 + }, + { + "epoch": 0.04334, + "grad_norm": 0.5113278780509889, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 4334 + }, + { + "epoch": 0.04335, + "grad_norm": 0.5145875368624392, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 4335 + }, + { + "epoch": 0.04336, + "grad_norm": 0.45945029823422817, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 4336 + }, + { + "epoch": 0.04337, + "grad_norm": 0.48662366939348867, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 4337 + }, + { + "epoch": 0.04338, + "grad_norm": 0.5497905856065477, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 4338 + }, + { + "epoch": 0.04339, + "grad_norm": 0.6427639317081701, + "learning_rate": 0.003, + "loss": 4.1499, + "step": 4339 + }, + { + "epoch": 0.0434, + "grad_norm": 0.751540691985128, + "learning_rate": 0.003, + "loss": 4.1357, + "step": 4340 + }, + { + "epoch": 0.04341, + "grad_norm": 0.8048212003756245, + "learning_rate": 0.003, + "loss": 4.153, + "step": 4341 + }, + { + "epoch": 0.04342, + "grad_norm": 0.7030263986542218, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 4342 + }, + { + "epoch": 0.04343, + "grad_norm": 0.6032636490778056, + "learning_rate": 0.003, + "loss": 4.1739, + "step": 4343 + }, + { + "epoch": 0.04344, + "grad_norm": 0.6918101884406253, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 4344 + }, + { + "epoch": 0.04345, + "grad_norm": 0.822156034408674, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 4345 + }, + { + "epoch": 0.04346, + "grad_norm": 0.8112944749590358, + "learning_rate": 0.003, + "loss": 4.1692, + "step": 4346 + }, + { + "epoch": 0.04347, + "grad_norm": 0.7773363458697162, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 4347 + }, + { + "epoch": 0.04348, + "grad_norm": 0.7166070078793744, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 4348 + }, + { + "epoch": 0.04349, + "grad_norm": 0.7557376957323917, + "learning_rate": 0.003, + "loss": 4.1704, + "step": 4349 + }, + { + "epoch": 0.0435, + "grad_norm": 0.7306221155287103, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4350 + }, + { + "epoch": 0.04351, + "grad_norm": 0.7396583467273142, + "learning_rate": 0.003, + "loss": 4.1687, + "step": 4351 + }, + { + "epoch": 0.04352, + "grad_norm": 0.7800507975012694, + "learning_rate": 0.003, + "loss": 4.1888, + "step": 4352 + }, + { + "epoch": 0.04353, + "grad_norm": 1.021228379310444, + "learning_rate": 0.003, + "loss": 4.1744, + "step": 4353 + }, + { + "epoch": 0.04354, + "grad_norm": 0.9464406672758259, + "learning_rate": 0.003, + "loss": 4.1858, + "step": 4354 + }, + { + "epoch": 0.04355, + "grad_norm": 0.9680937005908226, + "learning_rate": 0.003, + "loss": 4.19, + "step": 4355 + }, + { + "epoch": 0.04356, + "grad_norm": 0.8607469609935269, + "learning_rate": 0.003, + "loss": 4.2014, + "step": 4356 + }, + { + "epoch": 0.04357, + "grad_norm": 0.823519837090474, + "learning_rate": 0.003, + "loss": 4.1828, + "step": 4357 + }, + { + "epoch": 0.04358, + "grad_norm": 0.9595628631285592, + "learning_rate": 0.003, + "loss": 4.181, + "step": 4358 + }, + { + "epoch": 0.04359, + "grad_norm": 0.9518657368376723, + "learning_rate": 0.003, + "loss": 4.2091, + "step": 4359 + }, + { + "epoch": 0.0436, + "grad_norm": 0.8367488853787286, + "learning_rate": 0.003, + "loss": 4.2039, + "step": 4360 + }, + { + "epoch": 0.04361, + "grad_norm": 0.6767616158798689, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 4361 + }, + { + "epoch": 0.04362, + "grad_norm": 0.6754203563658906, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 4362 + }, + { + "epoch": 0.04363, + "grad_norm": 0.6302727609189303, + "learning_rate": 0.003, + "loss": 4.1735, + "step": 4363 + }, + { + "epoch": 0.04364, + "grad_norm": 0.6320795297761237, + "learning_rate": 0.003, + "loss": 4.1873, + "step": 4364 + }, + { + "epoch": 0.04365, + "grad_norm": 0.6155242365026946, + "learning_rate": 0.003, + "loss": 4.1835, + "step": 4365 + }, + { + "epoch": 0.04366, + "grad_norm": 0.5750840343277154, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 4366 + }, + { + "epoch": 0.04367, + "grad_norm": 0.5395062951089282, + "learning_rate": 0.003, + "loss": 4.1827, + "step": 4367 + }, + { + "epoch": 0.04368, + "grad_norm": 0.5070165557962105, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 4368 + }, + { + "epoch": 0.04369, + "grad_norm": 0.469120529046112, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 4369 + }, + { + "epoch": 0.0437, + "grad_norm": 0.42165323317321096, + "learning_rate": 0.003, + "loss": 4.178, + "step": 4370 + }, + { + "epoch": 0.04371, + "grad_norm": 0.4102155524937744, + "learning_rate": 0.003, + "loss": 4.1607, + "step": 4371 + }, + { + "epoch": 0.04372, + "grad_norm": 0.45131221099147395, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 4372 + }, + { + "epoch": 0.04373, + "grad_norm": 0.5064889554273249, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 4373 + }, + { + "epoch": 0.04374, + "grad_norm": 0.6523859432482797, + "learning_rate": 0.003, + "loss": 4.2092, + "step": 4374 + }, + { + "epoch": 0.04375, + "grad_norm": 0.7145171139398658, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 4375 + }, + { + "epoch": 0.04376, + "grad_norm": 0.6067647597260136, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 4376 + }, + { + "epoch": 0.04377, + "grad_norm": 0.4114371807638281, + "learning_rate": 0.003, + "loss": 4.1817, + "step": 4377 + }, + { + "epoch": 0.04378, + "grad_norm": 0.4767337347167691, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 4378 + }, + { + "epoch": 0.04379, + "grad_norm": 0.5776389327364934, + "learning_rate": 0.003, + "loss": 4.1831, + "step": 4379 + }, + { + "epoch": 0.0438, + "grad_norm": 0.5129468592408087, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 4380 + }, + { + "epoch": 0.04381, + "grad_norm": 0.4520287221841305, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 4381 + }, + { + "epoch": 0.04382, + "grad_norm": 0.5197515685911448, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 4382 + }, + { + "epoch": 0.04383, + "grad_norm": 0.5177934608058117, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 4383 + }, + { + "epoch": 0.04384, + "grad_norm": 0.5061115119488518, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 4384 + }, + { + "epoch": 0.04385, + "grad_norm": 0.4979837350700543, + "learning_rate": 0.003, + "loss": 4.168, + "step": 4385 + }, + { + "epoch": 0.04386, + "grad_norm": 0.5170426202552209, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 4386 + }, + { + "epoch": 0.04387, + "grad_norm": 0.497700525982779, + "learning_rate": 0.003, + "loss": 4.164, + "step": 4387 + }, + { + "epoch": 0.04388, + "grad_norm": 0.5110241434141121, + "learning_rate": 0.003, + "loss": 4.125, + "step": 4388 + }, + { + "epoch": 0.04389, + "grad_norm": 0.47594850470553707, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 4389 + }, + { + "epoch": 0.0439, + "grad_norm": 0.5028500411846144, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 4390 + }, + { + "epoch": 0.04391, + "grad_norm": 0.4958670611124616, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4391 + }, + { + "epoch": 0.04392, + "grad_norm": 0.45433519309198606, + "learning_rate": 0.003, + "loss": 4.129, + "step": 4392 + }, + { + "epoch": 0.04393, + "grad_norm": 0.42745058981506323, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 4393 + }, + { + "epoch": 0.04394, + "grad_norm": 0.4270590633401501, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 4394 + }, + { + "epoch": 0.04395, + "grad_norm": 0.48083329680294346, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 4395 + }, + { + "epoch": 0.04396, + "grad_norm": 0.4459452323390242, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 4396 + }, + { + "epoch": 0.04397, + "grad_norm": 0.4716168217418122, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 4397 + }, + { + "epoch": 0.04398, + "grad_norm": 0.5721086479882349, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 4398 + }, + { + "epoch": 0.04399, + "grad_norm": 0.629767772703653, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 4399 + }, + { + "epoch": 0.044, + "grad_norm": 0.6149549402399486, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 4400 + }, + { + "epoch": 0.04401, + "grad_norm": 0.573854062146023, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 4401 + }, + { + "epoch": 0.04402, + "grad_norm": 0.5937371470991296, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 4402 + }, + { + "epoch": 0.04403, + "grad_norm": 0.7181194724479543, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 4403 + }, + { + "epoch": 0.04404, + "grad_norm": 0.7454554555523588, + "learning_rate": 0.003, + "loss": 4.1537, + "step": 4404 + }, + { + "epoch": 0.04405, + "grad_norm": 0.7902792519841142, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4405 + }, + { + "epoch": 0.04406, + "grad_norm": 0.7207216322892295, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 4406 + }, + { + "epoch": 0.04407, + "grad_norm": 0.6745132067112365, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 4407 + }, + { + "epoch": 0.04408, + "grad_norm": 0.6455801650691928, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 4408 + }, + { + "epoch": 0.04409, + "grad_norm": 0.6394593941217028, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 4409 + }, + { + "epoch": 0.0441, + "grad_norm": 0.49947842679511295, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 4410 + }, + { + "epoch": 0.04411, + "grad_norm": 0.49572809609459384, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4411 + }, + { + "epoch": 0.04412, + "grad_norm": 0.5979052104111249, + "learning_rate": 0.003, + "loss": 4.116, + "step": 4412 + }, + { + "epoch": 0.04413, + "grad_norm": 0.7297806765539856, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 4413 + }, + { + "epoch": 0.04414, + "grad_norm": 0.8440545970233042, + "learning_rate": 0.003, + "loss": 4.1895, + "step": 4414 + }, + { + "epoch": 0.04415, + "grad_norm": 0.8680059315838015, + "learning_rate": 0.003, + "loss": 4.1572, + "step": 4415 + }, + { + "epoch": 0.04416, + "grad_norm": 0.6447487616588353, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 4416 + }, + { + "epoch": 0.04417, + "grad_norm": 0.5118950493406013, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 4417 + }, + { + "epoch": 0.04418, + "grad_norm": 0.524336962344827, + "learning_rate": 0.003, + "loss": 4.1536, + "step": 4418 + }, + { + "epoch": 0.04419, + "grad_norm": 0.5573856100230878, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 4419 + }, + { + "epoch": 0.0442, + "grad_norm": 0.5231826126803166, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 4420 + }, + { + "epoch": 0.04421, + "grad_norm": 0.5267586062051023, + "learning_rate": 0.003, + "loss": 4.1727, + "step": 4421 + }, + { + "epoch": 0.04422, + "grad_norm": 0.6148173783561381, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 4422 + }, + { + "epoch": 0.04423, + "grad_norm": 0.639677614962493, + "learning_rate": 0.003, + "loss": 4.1571, + "step": 4423 + }, + { + "epoch": 0.04424, + "grad_norm": 0.7269549825343812, + "learning_rate": 0.003, + "loss": 4.108, + "step": 4424 + }, + { + "epoch": 0.04425, + "grad_norm": 0.7800602356462868, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 4425 + }, + { + "epoch": 0.04426, + "grad_norm": 0.6552710671788101, + "learning_rate": 0.003, + "loss": 4.1586, + "step": 4426 + }, + { + "epoch": 0.04427, + "grad_norm": 0.6600251269485187, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 4427 + }, + { + "epoch": 0.04428, + "grad_norm": 0.7074082478081082, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 4428 + }, + { + "epoch": 0.04429, + "grad_norm": 0.6687710065867836, + "learning_rate": 0.003, + "loss": 4.1838, + "step": 4429 + }, + { + "epoch": 0.0443, + "grad_norm": 0.6237696770618166, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 4430 + }, + { + "epoch": 0.04431, + "grad_norm": 0.5184995550940078, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 4431 + }, + { + "epoch": 0.04432, + "grad_norm": 0.5237611236731565, + "learning_rate": 0.003, + "loss": 4.1643, + "step": 4432 + }, + { + "epoch": 0.04433, + "grad_norm": 0.4836240843515611, + "learning_rate": 0.003, + "loss": 4.136, + "step": 4433 + }, + { + "epoch": 0.04434, + "grad_norm": 0.4375263143804872, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4434 + }, + { + "epoch": 0.04435, + "grad_norm": 0.4294571339856274, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 4435 + }, + { + "epoch": 0.04436, + "grad_norm": 0.4235691217798773, + "learning_rate": 0.003, + "loss": 4.119, + "step": 4436 + }, + { + "epoch": 0.04437, + "grad_norm": 0.38987028812840846, + "learning_rate": 0.003, + "loss": 4.1415, + "step": 4437 + }, + { + "epoch": 0.04438, + "grad_norm": 0.366302357806493, + "learning_rate": 0.003, + "loss": 4.1767, + "step": 4438 + }, + { + "epoch": 0.04439, + "grad_norm": 0.39300818537655363, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 4439 + }, + { + "epoch": 0.0444, + "grad_norm": 0.4338089650520033, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 4440 + }, + { + "epoch": 0.04441, + "grad_norm": 0.48573366379460514, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 4441 + }, + { + "epoch": 0.04442, + "grad_norm": 0.5334130138073742, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 4442 + }, + { + "epoch": 0.04443, + "grad_norm": 0.5670666571686344, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 4443 + }, + { + "epoch": 0.04444, + "grad_norm": 0.5320583177110734, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 4444 + }, + { + "epoch": 0.04445, + "grad_norm": 0.489885387553983, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 4445 + }, + { + "epoch": 0.04446, + "grad_norm": 0.559048561777276, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 4446 + }, + { + "epoch": 0.04447, + "grad_norm": 0.6964141693448826, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 4447 + }, + { + "epoch": 0.04448, + "grad_norm": 0.7839348711029371, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 4448 + }, + { + "epoch": 0.04449, + "grad_norm": 0.8168410712463996, + "learning_rate": 0.003, + "loss": 4.1417, + "step": 4449 + }, + { + "epoch": 0.0445, + "grad_norm": 0.7469909706513354, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 4450 + }, + { + "epoch": 0.04451, + "grad_norm": 0.7772290265480061, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 4451 + }, + { + "epoch": 0.04452, + "grad_norm": 0.750429713415397, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 4452 + }, + { + "epoch": 0.04453, + "grad_norm": 0.8132686578476264, + "learning_rate": 0.003, + "loss": 4.1734, + "step": 4453 + }, + { + "epoch": 0.04454, + "grad_norm": 0.7560392406490976, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 4454 + }, + { + "epoch": 0.04455, + "grad_norm": 0.6772193323346015, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 4455 + }, + { + "epoch": 0.04456, + "grad_norm": 0.6001846034669396, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 4456 + }, + { + "epoch": 0.04457, + "grad_norm": 0.555927693348537, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 4457 + }, + { + "epoch": 0.04458, + "grad_norm": 0.567937186369503, + "learning_rate": 0.003, + "loss": 4.1625, + "step": 4458 + }, + { + "epoch": 0.04459, + "grad_norm": 0.6268058209460201, + "learning_rate": 0.003, + "loss": 4.1697, + "step": 4459 + }, + { + "epoch": 0.0446, + "grad_norm": 0.6774954694650952, + "learning_rate": 0.003, + "loss": 4.1699, + "step": 4460 + }, + { + "epoch": 0.04461, + "grad_norm": 0.7113381022778253, + "learning_rate": 0.003, + "loss": 4.1744, + "step": 4461 + }, + { + "epoch": 0.04462, + "grad_norm": 0.7089477778475621, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 4462 + }, + { + "epoch": 0.04463, + "grad_norm": 0.7149044485381956, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 4463 + }, + { + "epoch": 0.04464, + "grad_norm": 0.7001719619214657, + "learning_rate": 0.003, + "loss": 4.1603, + "step": 4464 + }, + { + "epoch": 0.04465, + "grad_norm": 0.641827326565005, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 4465 + }, + { + "epoch": 0.04466, + "grad_norm": 0.5710205222869948, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 4466 + }, + { + "epoch": 0.04467, + "grad_norm": 0.5879622422772139, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 4467 + }, + { + "epoch": 0.04468, + "grad_norm": 0.5833275454344445, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 4468 + }, + { + "epoch": 0.04469, + "grad_norm": 0.5035664280245044, + "learning_rate": 0.003, + "loss": 4.1536, + "step": 4469 + }, + { + "epoch": 0.0447, + "grad_norm": 0.5330652391453462, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 4470 + }, + { + "epoch": 0.04471, + "grad_norm": 0.5415312535775053, + "learning_rate": 0.003, + "loss": 4.1754, + "step": 4471 + }, + { + "epoch": 0.04472, + "grad_norm": 0.6201601279104281, + "learning_rate": 0.003, + "loss": 4.129, + "step": 4472 + }, + { + "epoch": 0.04473, + "grad_norm": 0.5823075422954469, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 4473 + }, + { + "epoch": 0.04474, + "grad_norm": 0.5211525998630241, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4474 + }, + { + "epoch": 0.04475, + "grad_norm": 0.5787022872529871, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 4475 + }, + { + "epoch": 0.04476, + "grad_norm": 0.6268645285401386, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 4476 + }, + { + "epoch": 0.04477, + "grad_norm": 0.7238878952105227, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 4477 + }, + { + "epoch": 0.04478, + "grad_norm": 0.7625353869137804, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 4478 + }, + { + "epoch": 0.04479, + "grad_norm": 0.7572839617963258, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 4479 + }, + { + "epoch": 0.0448, + "grad_norm": 0.7894575112052167, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 4480 + }, + { + "epoch": 0.04481, + "grad_norm": 0.74839779052581, + "learning_rate": 0.003, + "loss": 4.1423, + "step": 4481 + }, + { + "epoch": 0.04482, + "grad_norm": 0.7334427098809556, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 4482 + }, + { + "epoch": 0.04483, + "grad_norm": 0.7262695320930564, + "learning_rate": 0.003, + "loss": 4.152, + "step": 4483 + }, + { + "epoch": 0.04484, + "grad_norm": 0.6647684381777006, + "learning_rate": 0.003, + "loss": 4.165, + "step": 4484 + }, + { + "epoch": 0.04485, + "grad_norm": 0.6381708748446918, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 4485 + }, + { + "epoch": 0.04486, + "grad_norm": 0.5675731699113569, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 4486 + }, + { + "epoch": 0.04487, + "grad_norm": 0.6472424225913131, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 4487 + }, + { + "epoch": 0.04488, + "grad_norm": 0.694626079092874, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 4488 + }, + { + "epoch": 0.04489, + "grad_norm": 0.6658909000300576, + "learning_rate": 0.003, + "loss": 4.1656, + "step": 4489 + }, + { + "epoch": 0.0449, + "grad_norm": 0.5596108130436326, + "learning_rate": 0.003, + "loss": 4.156, + "step": 4490 + }, + { + "epoch": 0.04491, + "grad_norm": 0.4950212459279141, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 4491 + }, + { + "epoch": 0.04492, + "grad_norm": 0.5347911305158913, + "learning_rate": 0.003, + "loss": 4.1536, + "step": 4492 + }, + { + "epoch": 0.04493, + "grad_norm": 0.583677850500047, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 4493 + }, + { + "epoch": 0.04494, + "grad_norm": 0.5509029147435317, + "learning_rate": 0.003, + "loss": 4.1386, + "step": 4494 + }, + { + "epoch": 0.04495, + "grad_norm": 0.5187917488469639, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 4495 + }, + { + "epoch": 0.04496, + "grad_norm": 0.530988798745176, + "learning_rate": 0.003, + "loss": 4.1746, + "step": 4496 + }, + { + "epoch": 0.04497, + "grad_norm": 0.5020112887190497, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 4497 + }, + { + "epoch": 0.04498, + "grad_norm": 0.5882091094240027, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 4498 + }, + { + "epoch": 0.04499, + "grad_norm": 0.6882774056835383, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 4499 + }, + { + "epoch": 0.045, + "grad_norm": 0.7302113825660921, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 4500 + }, + { + "epoch": 0.04501, + "grad_norm": 0.6431193209544874, + "learning_rate": 0.003, + "loss": 4.1285, + "step": 4501 + }, + { + "epoch": 0.04502, + "grad_norm": 0.6128789446358794, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 4502 + }, + { + "epoch": 0.04503, + "grad_norm": 0.563632556792813, + "learning_rate": 0.003, + "loss": 4.167, + "step": 4503 + }, + { + "epoch": 0.04504, + "grad_norm": 0.49069503580557566, + "learning_rate": 0.003, + "loss": 4.1633, + "step": 4504 + }, + { + "epoch": 0.04505, + "grad_norm": 0.5316897314034865, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 4505 + }, + { + "epoch": 0.04506, + "grad_norm": 0.485539206424938, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 4506 + }, + { + "epoch": 0.04507, + "grad_norm": 0.4651162125504691, + "learning_rate": 0.003, + "loss": 4.109, + "step": 4507 + }, + { + "epoch": 0.04508, + "grad_norm": 0.47136511588197955, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 4508 + }, + { + "epoch": 0.04509, + "grad_norm": 0.48078179210487143, + "learning_rate": 0.003, + "loss": 4.153, + "step": 4509 + }, + { + "epoch": 0.0451, + "grad_norm": 0.5697241941596817, + "learning_rate": 0.003, + "loss": 4.1586, + "step": 4510 + }, + { + "epoch": 0.04511, + "grad_norm": 0.6691571245585908, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 4511 + }, + { + "epoch": 0.04512, + "grad_norm": 0.7515863901593889, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 4512 + }, + { + "epoch": 0.04513, + "grad_norm": 0.7483616185624625, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 4513 + }, + { + "epoch": 0.04514, + "grad_norm": 0.703461108276439, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 4514 + }, + { + "epoch": 0.04515, + "grad_norm": 0.7246314655645058, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 4515 + }, + { + "epoch": 0.04516, + "grad_norm": 0.6368515120508723, + "learning_rate": 0.003, + "loss": 4.1523, + "step": 4516 + }, + { + "epoch": 0.04517, + "grad_norm": 0.5222304173800686, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 4517 + }, + { + "epoch": 0.04518, + "grad_norm": 0.48504759929098307, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 4518 + }, + { + "epoch": 0.04519, + "grad_norm": 0.43996749102619037, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 4519 + }, + { + "epoch": 0.0452, + "grad_norm": 0.4756958760047127, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 4520 + }, + { + "epoch": 0.04521, + "grad_norm": 0.49681842219333233, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 4521 + }, + { + "epoch": 0.04522, + "grad_norm": 0.5177557892182135, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 4522 + }, + { + "epoch": 0.04523, + "grad_norm": 0.4834475000365292, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4523 + }, + { + "epoch": 0.04524, + "grad_norm": 0.4294821611706401, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 4524 + }, + { + "epoch": 0.04525, + "grad_norm": 0.4784372227541117, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 4525 + }, + { + "epoch": 0.04526, + "grad_norm": 0.5155297039314984, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 4526 + }, + { + "epoch": 0.04527, + "grad_norm": 0.6810222903041584, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 4527 + }, + { + "epoch": 0.04528, + "grad_norm": 0.8822750025167144, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 4528 + }, + { + "epoch": 0.04529, + "grad_norm": 0.8919184923159118, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 4529 + }, + { + "epoch": 0.0453, + "grad_norm": 0.8374148469158906, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 4530 + }, + { + "epoch": 0.04531, + "grad_norm": 0.7926240119332635, + "learning_rate": 0.003, + "loss": 4.1315, + "step": 4531 + }, + { + "epoch": 0.04532, + "grad_norm": 0.8568328376834813, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 4532 + }, + { + "epoch": 0.04533, + "grad_norm": 1.0159060122619772, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 4533 + }, + { + "epoch": 0.04534, + "grad_norm": 0.9536103887238125, + "learning_rate": 0.003, + "loss": 4.1827, + "step": 4534 + }, + { + "epoch": 0.04535, + "grad_norm": 0.9201117164869018, + "learning_rate": 0.003, + "loss": 4.2131, + "step": 4535 + }, + { + "epoch": 0.04536, + "grad_norm": 1.0425882626135783, + "learning_rate": 0.003, + "loss": 4.2164, + "step": 4536 + }, + { + "epoch": 0.04537, + "grad_norm": 1.1171942484625808, + "learning_rate": 0.003, + "loss": 4.2062, + "step": 4537 + }, + { + "epoch": 0.04538, + "grad_norm": 1.095605747573551, + "learning_rate": 0.003, + "loss": 4.2133, + "step": 4538 + }, + { + "epoch": 0.04539, + "grad_norm": 0.9260638010898388, + "learning_rate": 0.003, + "loss": 4.1705, + "step": 4539 + }, + { + "epoch": 0.0454, + "grad_norm": 0.7768806609844044, + "learning_rate": 0.003, + "loss": 4.175, + "step": 4540 + }, + { + "epoch": 0.04541, + "grad_norm": 0.819877783723442, + "learning_rate": 0.003, + "loss": 4.2224, + "step": 4541 + }, + { + "epoch": 0.04542, + "grad_norm": 0.8805829563806863, + "learning_rate": 0.003, + "loss": 4.1752, + "step": 4542 + }, + { + "epoch": 0.04543, + "grad_norm": 0.7923901917703942, + "learning_rate": 0.003, + "loss": 4.207, + "step": 4543 + }, + { + "epoch": 0.04544, + "grad_norm": 0.5941044825763567, + "learning_rate": 0.003, + "loss": 4.1737, + "step": 4544 + }, + { + "epoch": 0.04545, + "grad_norm": 0.6408411687252461, + "learning_rate": 0.003, + "loss": 4.1822, + "step": 4545 + }, + { + "epoch": 0.04546, + "grad_norm": 0.8237308204937738, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 4546 + }, + { + "epoch": 0.04547, + "grad_norm": 0.9235544857511309, + "learning_rate": 0.003, + "loss": 4.1822, + "step": 4547 + }, + { + "epoch": 0.04548, + "grad_norm": 0.7235487434091521, + "learning_rate": 0.003, + "loss": 4.1901, + "step": 4548 + }, + { + "epoch": 0.04549, + "grad_norm": 0.6422371993419722, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 4549 + }, + { + "epoch": 0.0455, + "grad_norm": 0.5600832762072884, + "learning_rate": 0.003, + "loss": 4.1988, + "step": 4550 + }, + { + "epoch": 0.04551, + "grad_norm": 0.5228348539847921, + "learning_rate": 0.003, + "loss": 4.206, + "step": 4551 + }, + { + "epoch": 0.04552, + "grad_norm": 0.4730508563524038, + "learning_rate": 0.003, + "loss": 4.1829, + "step": 4552 + }, + { + "epoch": 0.04553, + "grad_norm": 0.5620543138219484, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 4553 + }, + { + "epoch": 0.04554, + "grad_norm": 0.5765966592222326, + "learning_rate": 0.003, + "loss": 4.1553, + "step": 4554 + }, + { + "epoch": 0.04555, + "grad_norm": 0.577781369783007, + "learning_rate": 0.003, + "loss": 4.1637, + "step": 4555 + }, + { + "epoch": 0.04556, + "grad_norm": 0.5035922467137508, + "learning_rate": 0.003, + "loss": 4.1622, + "step": 4556 + }, + { + "epoch": 0.04557, + "grad_norm": 0.5559102933120925, + "learning_rate": 0.003, + "loss": 4.1442, + "step": 4557 + }, + { + "epoch": 0.04558, + "grad_norm": 0.7462315222900177, + "learning_rate": 0.003, + "loss": 4.1815, + "step": 4558 + }, + { + "epoch": 0.04559, + "grad_norm": 0.7835969334333466, + "learning_rate": 0.003, + "loss": 4.1966, + "step": 4559 + }, + { + "epoch": 0.0456, + "grad_norm": 0.6591837955511258, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 4560 + }, + { + "epoch": 0.04561, + "grad_norm": 0.7187360930779703, + "learning_rate": 0.003, + "loss": 4.1604, + "step": 4561 + }, + { + "epoch": 0.04562, + "grad_norm": 0.7184968771307666, + "learning_rate": 0.003, + "loss": 4.1834, + "step": 4562 + }, + { + "epoch": 0.04563, + "grad_norm": 0.7171157380489706, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 4563 + }, + { + "epoch": 0.04564, + "grad_norm": 0.6631256505733477, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 4564 + }, + { + "epoch": 0.04565, + "grad_norm": 0.5733198686410141, + "learning_rate": 0.003, + "loss": 4.128, + "step": 4565 + }, + { + "epoch": 0.04566, + "grad_norm": 0.4813774167549479, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 4566 + }, + { + "epoch": 0.04567, + "grad_norm": 0.40853415838936963, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 4567 + }, + { + "epoch": 0.04568, + "grad_norm": 0.37364202822845655, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 4568 + }, + { + "epoch": 0.04569, + "grad_norm": 0.3618276444120114, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 4569 + }, + { + "epoch": 0.0457, + "grad_norm": 0.3609968139572239, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 4570 + }, + { + "epoch": 0.04571, + "grad_norm": 0.37747195577489445, + "learning_rate": 0.003, + "loss": 4.1615, + "step": 4571 + }, + { + "epoch": 0.04572, + "grad_norm": 0.3874865960286465, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 4572 + }, + { + "epoch": 0.04573, + "grad_norm": 0.4109072807516651, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 4573 + }, + { + "epoch": 0.04574, + "grad_norm": 0.39179890606589207, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 4574 + }, + { + "epoch": 0.04575, + "grad_norm": 0.4386907207682461, + "learning_rate": 0.003, + "loss": 4.1446, + "step": 4575 + }, + { + "epoch": 0.04576, + "grad_norm": 0.5065967670777235, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 4576 + }, + { + "epoch": 0.04577, + "grad_norm": 0.6143902378642073, + "learning_rate": 0.003, + "loss": 4.1597, + "step": 4577 + }, + { + "epoch": 0.04578, + "grad_norm": 0.6827140793497046, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 4578 + }, + { + "epoch": 0.04579, + "grad_norm": 0.6929677517815375, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 4579 + }, + { + "epoch": 0.0458, + "grad_norm": 0.7109361429436778, + "learning_rate": 0.003, + "loss": 4.15, + "step": 4580 + }, + { + "epoch": 0.04581, + "grad_norm": 0.5770733016902231, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 4581 + }, + { + "epoch": 0.04582, + "grad_norm": 0.4702915512162915, + "learning_rate": 0.003, + "loss": 4.1561, + "step": 4582 + }, + { + "epoch": 0.04583, + "grad_norm": 0.45277658262029463, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 4583 + }, + { + "epoch": 0.04584, + "grad_norm": 0.4519208543100193, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 4584 + }, + { + "epoch": 0.04585, + "grad_norm": 0.4665143329624495, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 4585 + }, + { + "epoch": 0.04586, + "grad_norm": 0.5475838831331661, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 4586 + }, + { + "epoch": 0.04587, + "grad_norm": 0.6898223965554029, + "learning_rate": 0.003, + "loss": 4.1575, + "step": 4587 + }, + { + "epoch": 0.04588, + "grad_norm": 0.7350712744563861, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 4588 + }, + { + "epoch": 0.04589, + "grad_norm": 0.6924903850485442, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 4589 + }, + { + "epoch": 0.0459, + "grad_norm": 0.6678707302029508, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 4590 + }, + { + "epoch": 0.04591, + "grad_norm": 0.6499473571050832, + "learning_rate": 0.003, + "loss": 4.117, + "step": 4591 + }, + { + "epoch": 0.04592, + "grad_norm": 0.6817526881561883, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4592 + }, + { + "epoch": 0.04593, + "grad_norm": 0.6518777120562186, + "learning_rate": 0.003, + "loss": 4.1644, + "step": 4593 + }, + { + "epoch": 0.04594, + "grad_norm": 0.5747202505666615, + "learning_rate": 0.003, + "loss": 4.1544, + "step": 4594 + }, + { + "epoch": 0.04595, + "grad_norm": 0.5397845618052467, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 4595 + }, + { + "epoch": 0.04596, + "grad_norm": 0.510854997128848, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 4596 + }, + { + "epoch": 0.04597, + "grad_norm": 0.49627082518670596, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4597 + }, + { + "epoch": 0.04598, + "grad_norm": 0.530586170374032, + "learning_rate": 0.003, + "loss": 4.1793, + "step": 4598 + }, + { + "epoch": 0.04599, + "grad_norm": 0.48072108059262636, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 4599 + }, + { + "epoch": 0.046, + "grad_norm": 0.4162968401651856, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 4600 + }, + { + "epoch": 0.04601, + "grad_norm": 0.4060240320878838, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 4601 + }, + { + "epoch": 0.04602, + "grad_norm": 0.4911601212276765, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 4602 + }, + { + "epoch": 0.04603, + "grad_norm": 0.5112302821705641, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 4603 + }, + { + "epoch": 0.04604, + "grad_norm": 0.5642182423159343, + "learning_rate": 0.003, + "loss": 4.146, + "step": 4604 + }, + { + "epoch": 0.04605, + "grad_norm": 0.6193357979940537, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 4605 + }, + { + "epoch": 0.04606, + "grad_norm": 0.7047357474337432, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 4606 + }, + { + "epoch": 0.04607, + "grad_norm": 0.782876195240902, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 4607 + }, + { + "epoch": 0.04608, + "grad_norm": 0.6985321303741415, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 4608 + }, + { + "epoch": 0.04609, + "grad_norm": 0.49949454925317704, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4609 + }, + { + "epoch": 0.0461, + "grad_norm": 0.4635494280797984, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 4610 + }, + { + "epoch": 0.04611, + "grad_norm": 0.4523573678413289, + "learning_rate": 0.003, + "loss": 4.1611, + "step": 4611 + }, + { + "epoch": 0.04612, + "grad_norm": 0.5042320328946598, + "learning_rate": 0.003, + "loss": 4.143, + "step": 4612 + }, + { + "epoch": 0.04613, + "grad_norm": 0.5210913810336651, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 4613 + }, + { + "epoch": 0.04614, + "grad_norm": 0.6245444112274343, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 4614 + }, + { + "epoch": 0.04615, + "grad_norm": 0.6535761170229347, + "learning_rate": 0.003, + "loss": 4.138, + "step": 4615 + }, + { + "epoch": 0.04616, + "grad_norm": 0.6420802226733096, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 4616 + }, + { + "epoch": 0.04617, + "grad_norm": 0.5687953058470812, + "learning_rate": 0.003, + "loss": 4.1442, + "step": 4617 + }, + { + "epoch": 0.04618, + "grad_norm": 0.5083532285672069, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 4618 + }, + { + "epoch": 0.04619, + "grad_norm": 0.5265907169405639, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 4619 + }, + { + "epoch": 0.0462, + "grad_norm": 0.5890366123582629, + "learning_rate": 0.003, + "loss": 4.155, + "step": 4620 + }, + { + "epoch": 0.04621, + "grad_norm": 0.6504323601577833, + "learning_rate": 0.003, + "loss": 4.165, + "step": 4621 + }, + { + "epoch": 0.04622, + "grad_norm": 0.6707598130254062, + "learning_rate": 0.003, + "loss": 4.1612, + "step": 4622 + }, + { + "epoch": 0.04623, + "grad_norm": 0.6302847440276245, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 4623 + }, + { + "epoch": 0.04624, + "grad_norm": 0.6466984839849552, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 4624 + }, + { + "epoch": 0.04625, + "grad_norm": 0.6909336873679446, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 4625 + }, + { + "epoch": 0.04626, + "grad_norm": 0.6100952148809866, + "learning_rate": 0.003, + "loss": 4.146, + "step": 4626 + }, + { + "epoch": 0.04627, + "grad_norm": 0.54274307554158, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 4627 + }, + { + "epoch": 0.04628, + "grad_norm": 0.5405464195031551, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4628 + }, + { + "epoch": 0.04629, + "grad_norm": 0.5457077621265701, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 4629 + }, + { + "epoch": 0.0463, + "grad_norm": 0.5985608016616293, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 4630 + }, + { + "epoch": 0.04631, + "grad_norm": 0.6821487067722284, + "learning_rate": 0.003, + "loss": 4.1345, + "step": 4631 + }, + { + "epoch": 0.04632, + "grad_norm": 0.7372265744416058, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 4632 + }, + { + "epoch": 0.04633, + "grad_norm": 0.8415550301042956, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 4633 + }, + { + "epoch": 0.04634, + "grad_norm": 0.8373989629700342, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 4634 + }, + { + "epoch": 0.04635, + "grad_norm": 0.7581438335563392, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 4635 + }, + { + "epoch": 0.04636, + "grad_norm": 0.7125351605332876, + "learning_rate": 0.003, + "loss": 4.183, + "step": 4636 + }, + { + "epoch": 0.04637, + "grad_norm": 0.7232757442184159, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 4637 + }, + { + "epoch": 0.04638, + "grad_norm": 0.6757057573333891, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 4638 + }, + { + "epoch": 0.04639, + "grad_norm": 0.719912413703624, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 4639 + }, + { + "epoch": 0.0464, + "grad_norm": 0.7428965743218653, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 4640 + }, + { + "epoch": 0.04641, + "grad_norm": 0.8598211728393256, + "learning_rate": 0.003, + "loss": 4.1798, + "step": 4641 + }, + { + "epoch": 0.04642, + "grad_norm": 0.8907386217362441, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 4642 + }, + { + "epoch": 0.04643, + "grad_norm": 0.7795644135221422, + "learning_rate": 0.003, + "loss": 4.152, + "step": 4643 + }, + { + "epoch": 0.04644, + "grad_norm": 0.5620211951242756, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4644 + }, + { + "epoch": 0.04645, + "grad_norm": 0.5137991034790863, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 4645 + }, + { + "epoch": 0.04646, + "grad_norm": 0.5506672519917369, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 4646 + }, + { + "epoch": 0.04647, + "grad_norm": 0.545899380374656, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4647 + }, + { + "epoch": 0.04648, + "grad_norm": 0.5926193733697508, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 4648 + }, + { + "epoch": 0.04649, + "grad_norm": 0.6418478720978149, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 4649 + }, + { + "epoch": 0.0465, + "grad_norm": 0.6971029121589252, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 4650 + }, + { + "epoch": 0.04651, + "grad_norm": 0.7128225140702158, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 4651 + }, + { + "epoch": 0.04652, + "grad_norm": 0.6817984613215538, + "learning_rate": 0.003, + "loss": 4.1556, + "step": 4652 + }, + { + "epoch": 0.04653, + "grad_norm": 0.6586749661533192, + "learning_rate": 0.003, + "loss": 4.157, + "step": 4653 + }, + { + "epoch": 0.04654, + "grad_norm": 0.5413982579807713, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 4654 + }, + { + "epoch": 0.04655, + "grad_norm": 0.4819381736457758, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 4655 + }, + { + "epoch": 0.04656, + "grad_norm": 0.5090740776020435, + "learning_rate": 0.003, + "loss": 4.12, + "step": 4656 + }, + { + "epoch": 0.04657, + "grad_norm": 0.4529973940302403, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 4657 + }, + { + "epoch": 0.04658, + "grad_norm": 0.3863158282687969, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 4658 + }, + { + "epoch": 0.04659, + "grad_norm": 0.3846174175293322, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 4659 + }, + { + "epoch": 0.0466, + "grad_norm": 0.33629970185853586, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 4660 + }, + { + "epoch": 0.04661, + "grad_norm": 0.3733824492556284, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 4661 + }, + { + "epoch": 0.04662, + "grad_norm": 0.40706146799826654, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 4662 + }, + { + "epoch": 0.04663, + "grad_norm": 0.4624049284667894, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 4663 + }, + { + "epoch": 0.04664, + "grad_norm": 0.6204385926386629, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 4664 + }, + { + "epoch": 0.04665, + "grad_norm": 0.8486297396587145, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 4665 + }, + { + "epoch": 0.04666, + "grad_norm": 0.8396332728949535, + "learning_rate": 0.003, + "loss": 4.128, + "step": 4666 + }, + { + "epoch": 0.04667, + "grad_norm": 0.604029857762682, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 4667 + }, + { + "epoch": 0.04668, + "grad_norm": 0.6187443793003241, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 4668 + }, + { + "epoch": 0.04669, + "grad_norm": 0.671068656091709, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 4669 + }, + { + "epoch": 0.0467, + "grad_norm": 0.582982047763638, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 4670 + }, + { + "epoch": 0.04671, + "grad_norm": 0.5791042512401672, + "learning_rate": 0.003, + "loss": 4.128, + "step": 4671 + }, + { + "epoch": 0.04672, + "grad_norm": 0.5822833904953063, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 4672 + }, + { + "epoch": 0.04673, + "grad_norm": 0.6690185777600851, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 4673 + }, + { + "epoch": 0.04674, + "grad_norm": 0.6501772820492476, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 4674 + }, + { + "epoch": 0.04675, + "grad_norm": 0.5619140983596763, + "learning_rate": 0.003, + "loss": 4.143, + "step": 4675 + }, + { + "epoch": 0.04676, + "grad_norm": 0.4844274130103253, + "learning_rate": 0.003, + "loss": 4.145, + "step": 4676 + }, + { + "epoch": 0.04677, + "grad_norm": 0.46020296554025397, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 4677 + }, + { + "epoch": 0.04678, + "grad_norm": 0.4560378554438905, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 4678 + }, + { + "epoch": 0.04679, + "grad_norm": 0.48237840297163587, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 4679 + }, + { + "epoch": 0.0468, + "grad_norm": 0.545076677673247, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 4680 + }, + { + "epoch": 0.04681, + "grad_norm": 0.6224973549149077, + "learning_rate": 0.003, + "loss": 4.1596, + "step": 4681 + }, + { + "epoch": 0.04682, + "grad_norm": 0.7719478560198243, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 4682 + }, + { + "epoch": 0.04683, + "grad_norm": 0.8064115060114956, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 4683 + }, + { + "epoch": 0.04684, + "grad_norm": 0.8855175249633898, + "learning_rate": 0.003, + "loss": 4.1577, + "step": 4684 + }, + { + "epoch": 0.04685, + "grad_norm": 0.8534062578922067, + "learning_rate": 0.003, + "loss": 4.145, + "step": 4685 + }, + { + "epoch": 0.04686, + "grad_norm": 0.797573707115956, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 4686 + }, + { + "epoch": 0.04687, + "grad_norm": 0.713270606155539, + "learning_rate": 0.003, + "loss": 4.181, + "step": 4687 + }, + { + "epoch": 0.04688, + "grad_norm": 0.6241645872789137, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4688 + }, + { + "epoch": 0.04689, + "grad_norm": 0.559937415479369, + "learning_rate": 0.003, + "loss": 4.1712, + "step": 4689 + }, + { + "epoch": 0.0469, + "grad_norm": 0.5404635622147993, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 4690 + }, + { + "epoch": 0.04691, + "grad_norm": 0.486653564258482, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 4691 + }, + { + "epoch": 0.04692, + "grad_norm": 0.481468369732997, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 4692 + }, + { + "epoch": 0.04693, + "grad_norm": 0.5011365510399343, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 4693 + }, + { + "epoch": 0.04694, + "grad_norm": 0.48431784153510343, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 4694 + }, + { + "epoch": 0.04695, + "grad_norm": 0.5590735035376782, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 4695 + }, + { + "epoch": 0.04696, + "grad_norm": 0.5991808838927974, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 4696 + }, + { + "epoch": 0.04697, + "grad_norm": 0.6802689730153363, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 4697 + }, + { + "epoch": 0.04698, + "grad_norm": 0.6842952307392385, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 4698 + }, + { + "epoch": 0.04699, + "grad_norm": 0.6034505759619246, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 4699 + }, + { + "epoch": 0.047, + "grad_norm": 0.5907548018053599, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 4700 + }, + { + "epoch": 0.04701, + "grad_norm": 0.5241329554797072, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 4701 + }, + { + "epoch": 0.04702, + "grad_norm": 0.5399206512572382, + "learning_rate": 0.003, + "loss": 4.1446, + "step": 4702 + }, + { + "epoch": 0.04703, + "grad_norm": 0.5629949677153792, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 4703 + }, + { + "epoch": 0.04704, + "grad_norm": 0.4982163977250413, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 4704 + }, + { + "epoch": 0.04705, + "grad_norm": 0.5165399833831362, + "learning_rate": 0.003, + "loss": 4.1559, + "step": 4705 + }, + { + "epoch": 0.04706, + "grad_norm": 0.5588859123289356, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 4706 + }, + { + "epoch": 0.04707, + "grad_norm": 0.6493712508280153, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 4707 + }, + { + "epoch": 0.04708, + "grad_norm": 0.7592099124000022, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 4708 + }, + { + "epoch": 0.04709, + "grad_norm": 0.7643143840544172, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4709 + }, + { + "epoch": 0.0471, + "grad_norm": 0.7117650865961848, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 4710 + }, + { + "epoch": 0.04711, + "grad_norm": 0.6222639694626878, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 4711 + }, + { + "epoch": 0.04712, + "grad_norm": 0.5676066371017507, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 4712 + }, + { + "epoch": 0.04713, + "grad_norm": 0.5770424755113991, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 4713 + }, + { + "epoch": 0.04714, + "grad_norm": 0.5617675778124301, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 4714 + }, + { + "epoch": 0.04715, + "grad_norm": 0.5635879231709099, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 4715 + }, + { + "epoch": 0.04716, + "grad_norm": 0.5756399495553071, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 4716 + }, + { + "epoch": 0.04717, + "grad_norm": 0.5516676091531315, + "learning_rate": 0.003, + "loss": 4.146, + "step": 4717 + }, + { + "epoch": 0.04718, + "grad_norm": 0.5287651595406879, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 4718 + }, + { + "epoch": 0.04719, + "grad_norm": 0.5253847377704106, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 4719 + }, + { + "epoch": 0.0472, + "grad_norm": 0.5350874385832446, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 4720 + }, + { + "epoch": 0.04721, + "grad_norm": 0.6004531613918229, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 4721 + }, + { + "epoch": 0.04722, + "grad_norm": 0.7164746962025985, + "learning_rate": 0.003, + "loss": 4.1357, + "step": 4722 + }, + { + "epoch": 0.04723, + "grad_norm": 0.7305093269163504, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 4723 + }, + { + "epoch": 0.04724, + "grad_norm": 0.6386395939306334, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 4724 + }, + { + "epoch": 0.04725, + "grad_norm": 0.5477055183445729, + "learning_rate": 0.003, + "loss": 4.1373, + "step": 4725 + }, + { + "epoch": 0.04726, + "grad_norm": 0.6698851087894558, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 4726 + }, + { + "epoch": 0.04727, + "grad_norm": 0.6952010909817607, + "learning_rate": 0.003, + "loss": 4.121, + "step": 4727 + }, + { + "epoch": 0.04728, + "grad_norm": 0.6237561354472855, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 4728 + }, + { + "epoch": 0.04729, + "grad_norm": 0.5824726583084653, + "learning_rate": 0.003, + "loss": 4.1416, + "step": 4729 + }, + { + "epoch": 0.0473, + "grad_norm": 0.6234484457938285, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 4730 + }, + { + "epoch": 0.04731, + "grad_norm": 0.7040984462948884, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 4731 + }, + { + "epoch": 0.04732, + "grad_norm": 0.8060281611564052, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 4732 + }, + { + "epoch": 0.04733, + "grad_norm": 0.8908771747222946, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 4733 + }, + { + "epoch": 0.04734, + "grad_norm": 0.8915755383205525, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 4734 + }, + { + "epoch": 0.04735, + "grad_norm": 0.8597675990151213, + "learning_rate": 0.003, + "loss": 4.1801, + "step": 4735 + }, + { + "epoch": 0.04736, + "grad_norm": 0.9421924904802637, + "learning_rate": 0.003, + "loss": 4.1714, + "step": 4736 + }, + { + "epoch": 0.04737, + "grad_norm": 0.9723514275784639, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 4737 + }, + { + "epoch": 0.04738, + "grad_norm": 0.8765529889726066, + "learning_rate": 0.003, + "loss": 4.1853, + "step": 4738 + }, + { + "epoch": 0.04739, + "grad_norm": 0.6891707628164019, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 4739 + }, + { + "epoch": 0.0474, + "grad_norm": 0.6320172894482899, + "learning_rate": 0.003, + "loss": 4.1783, + "step": 4740 + }, + { + "epoch": 0.04741, + "grad_norm": 0.6185420901083515, + "learning_rate": 0.003, + "loss": 4.1667, + "step": 4741 + }, + { + "epoch": 0.04742, + "grad_norm": 0.5743911871453854, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 4742 + }, + { + "epoch": 0.04743, + "grad_norm": 0.5220289188801445, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 4743 + }, + { + "epoch": 0.04744, + "grad_norm": 0.509105103411712, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 4744 + }, + { + "epoch": 0.04745, + "grad_norm": 0.47978510769114535, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 4745 + }, + { + "epoch": 0.04746, + "grad_norm": 0.5254663892408653, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 4746 + }, + { + "epoch": 0.04747, + "grad_norm": 0.5939317340288737, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 4747 + }, + { + "epoch": 0.04748, + "grad_norm": 0.5634019114968404, + "learning_rate": 0.003, + "loss": 4.1662, + "step": 4748 + }, + { + "epoch": 0.04749, + "grad_norm": 0.5497840563348653, + "learning_rate": 0.003, + "loss": 4.1719, + "step": 4749 + }, + { + "epoch": 0.0475, + "grad_norm": 0.5714725339017903, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 4750 + }, + { + "epoch": 0.04751, + "grad_norm": 0.6084741881095581, + "learning_rate": 0.003, + "loss": 4.103, + "step": 4751 + }, + { + "epoch": 0.04752, + "grad_norm": 0.6608722355986122, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 4752 + }, + { + "epoch": 0.04753, + "grad_norm": 0.7109251785941167, + "learning_rate": 0.003, + "loss": 4.1449, + "step": 4753 + }, + { + "epoch": 0.04754, + "grad_norm": 0.6628914079263116, + "learning_rate": 0.003, + "loss": 4.1659, + "step": 4754 + }, + { + "epoch": 0.04755, + "grad_norm": 0.5644082779422211, + "learning_rate": 0.003, + "loss": 4.139, + "step": 4755 + }, + { + "epoch": 0.04756, + "grad_norm": 0.5287140638812969, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 4756 + }, + { + "epoch": 0.04757, + "grad_norm": 0.49259530652550815, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 4757 + }, + { + "epoch": 0.04758, + "grad_norm": 0.5614284037719612, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 4758 + }, + { + "epoch": 0.04759, + "grad_norm": 0.5959601501901447, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 4759 + }, + { + "epoch": 0.0476, + "grad_norm": 0.6904403583130148, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 4760 + }, + { + "epoch": 0.04761, + "grad_norm": 0.7246423353646352, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 4761 + }, + { + "epoch": 0.04762, + "grad_norm": 0.6915319453958364, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 4762 + }, + { + "epoch": 0.04763, + "grad_norm": 0.6256527340482957, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 4763 + }, + { + "epoch": 0.04764, + "grad_norm": 0.6439794931734228, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 4764 + }, + { + "epoch": 0.04765, + "grad_norm": 0.6077799243331904, + "learning_rate": 0.003, + "loss": 4.1654, + "step": 4765 + }, + { + "epoch": 0.04766, + "grad_norm": 0.5987824824691682, + "learning_rate": 0.003, + "loss": 4.1407, + "step": 4766 + }, + { + "epoch": 0.04767, + "grad_norm": 0.5323630132045762, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 4767 + }, + { + "epoch": 0.04768, + "grad_norm": 0.49097893919403107, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 4768 + }, + { + "epoch": 0.04769, + "grad_norm": 0.41538834375199724, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 4769 + }, + { + "epoch": 0.0477, + "grad_norm": 0.4062286279109976, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 4770 + }, + { + "epoch": 0.04771, + "grad_norm": 0.3969385937952626, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 4771 + }, + { + "epoch": 0.04772, + "grad_norm": 0.47881496679265567, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 4772 + }, + { + "epoch": 0.04773, + "grad_norm": 0.5857337921957999, + "learning_rate": 0.003, + "loss": 4.1524, + "step": 4773 + }, + { + "epoch": 0.04774, + "grad_norm": 0.6545046565097566, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 4774 + }, + { + "epoch": 0.04775, + "grad_norm": 0.6909582502342425, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 4775 + }, + { + "epoch": 0.04776, + "grad_norm": 0.6830425397433746, + "learning_rate": 0.003, + "loss": 4.1266, + "step": 4776 + }, + { + "epoch": 0.04777, + "grad_norm": 0.5407062720120711, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 4777 + }, + { + "epoch": 0.04778, + "grad_norm": 0.4928003344996406, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 4778 + }, + { + "epoch": 0.04779, + "grad_norm": 0.5540563562005282, + "learning_rate": 0.003, + "loss": 4.137, + "step": 4779 + }, + { + "epoch": 0.0478, + "grad_norm": 0.5919870365085376, + "learning_rate": 0.003, + "loss": 4.141, + "step": 4780 + }, + { + "epoch": 0.04781, + "grad_norm": 0.6502125533927232, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 4781 + }, + { + "epoch": 0.04782, + "grad_norm": 0.5583168452423385, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 4782 + }, + { + "epoch": 0.04783, + "grad_norm": 0.49694295687820944, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 4783 + }, + { + "epoch": 0.04784, + "grad_norm": 0.4979847074086822, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4784 + }, + { + "epoch": 0.04785, + "grad_norm": 0.5392917734790091, + "learning_rate": 0.003, + "loss": 4.1386, + "step": 4785 + }, + { + "epoch": 0.04786, + "grad_norm": 0.5426680689275899, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 4786 + }, + { + "epoch": 0.04787, + "grad_norm": 0.5518443356995324, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 4787 + }, + { + "epoch": 0.04788, + "grad_norm": 0.6334713170503672, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 4788 + }, + { + "epoch": 0.04789, + "grad_norm": 0.6443157992564054, + "learning_rate": 0.003, + "loss": 4.109, + "step": 4789 + }, + { + "epoch": 0.0479, + "grad_norm": 0.6616151594742191, + "learning_rate": 0.003, + "loss": 4.127, + "step": 4790 + }, + { + "epoch": 0.04791, + "grad_norm": 0.7664115436248651, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4791 + }, + { + "epoch": 0.04792, + "grad_norm": 0.8885911114335623, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 4792 + }, + { + "epoch": 0.04793, + "grad_norm": 1.0730053893316884, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 4793 + }, + { + "epoch": 0.04794, + "grad_norm": 0.7779447364165427, + "learning_rate": 0.003, + "loss": 4.1567, + "step": 4794 + }, + { + "epoch": 0.04795, + "grad_norm": 0.588536399495119, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4795 + }, + { + "epoch": 0.04796, + "grad_norm": 0.68568663004306, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 4796 + }, + { + "epoch": 0.04797, + "grad_norm": 0.681720094933613, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 4797 + }, + { + "epoch": 0.04798, + "grad_norm": 0.7679505695308819, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 4798 + }, + { + "epoch": 0.04799, + "grad_norm": 0.7317665379567332, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 4799 + }, + { + "epoch": 0.048, + "grad_norm": 0.6100825656074705, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 4800 + }, + { + "epoch": 0.04801, + "grad_norm": 0.5012414047123679, + "learning_rate": 0.003, + "loss": 4.1386, + "step": 4801 + }, + { + "epoch": 0.04802, + "grad_norm": 0.4479983336578663, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 4802 + }, + { + "epoch": 0.04803, + "grad_norm": 0.4184616004474593, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 4803 + }, + { + "epoch": 0.04804, + "grad_norm": 0.41305580441153505, + "learning_rate": 0.003, + "loss": 4.147, + "step": 4804 + }, + { + "epoch": 0.04805, + "grad_norm": 0.4398073604549804, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 4805 + }, + { + "epoch": 0.04806, + "grad_norm": 0.42743235225805976, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 4806 + }, + { + "epoch": 0.04807, + "grad_norm": 0.4727115258160579, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 4807 + }, + { + "epoch": 0.04808, + "grad_norm": 0.5919755301104812, + "learning_rate": 0.003, + "loss": 4.126, + "step": 4808 + }, + { + "epoch": 0.04809, + "grad_norm": 0.7289693894093642, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 4809 + }, + { + "epoch": 0.0481, + "grad_norm": 0.7839859175085296, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 4810 + }, + { + "epoch": 0.04811, + "grad_norm": 0.6884295236179085, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 4811 + }, + { + "epoch": 0.04812, + "grad_norm": 0.7352119188470826, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 4812 + }, + { + "epoch": 0.04813, + "grad_norm": 0.695166473264558, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 4813 + }, + { + "epoch": 0.04814, + "grad_norm": 0.6553813909063907, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 4814 + }, + { + "epoch": 0.04815, + "grad_norm": 0.5872101085364063, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 4815 + }, + { + "epoch": 0.04816, + "grad_norm": 0.6233113378649032, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 4816 + }, + { + "epoch": 0.04817, + "grad_norm": 0.6150064661388728, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 4817 + }, + { + "epoch": 0.04818, + "grad_norm": 0.5836692065444768, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 4818 + }, + { + "epoch": 0.04819, + "grad_norm": 0.66525230376135, + "learning_rate": 0.003, + "loss": 4.138, + "step": 4819 + }, + { + "epoch": 0.0482, + "grad_norm": 0.734508856599843, + "learning_rate": 0.003, + "loss": 4.131, + "step": 4820 + }, + { + "epoch": 0.04821, + "grad_norm": 0.7602706927514201, + "learning_rate": 0.003, + "loss": 4.1636, + "step": 4821 + }, + { + "epoch": 0.04822, + "grad_norm": 0.7876820121443141, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 4822 + }, + { + "epoch": 0.04823, + "grad_norm": 0.7863376722170511, + "learning_rate": 0.003, + "loss": 4.1376, + "step": 4823 + }, + { + "epoch": 0.04824, + "grad_norm": 0.8763060513910201, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 4824 + }, + { + "epoch": 0.04825, + "grad_norm": 0.8932088904498784, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 4825 + }, + { + "epoch": 0.04826, + "grad_norm": 0.7974932345630039, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 4826 + }, + { + "epoch": 0.04827, + "grad_norm": 0.787269582726668, + "learning_rate": 0.003, + "loss": 4.1569, + "step": 4827 + }, + { + "epoch": 0.04828, + "grad_norm": 0.8334258319646332, + "learning_rate": 0.003, + "loss": 4.1713, + "step": 4828 + }, + { + "epoch": 0.04829, + "grad_norm": 0.8348604948088247, + "learning_rate": 0.003, + "loss": 4.1556, + "step": 4829 + }, + { + "epoch": 0.0483, + "grad_norm": 0.8624019255671944, + "learning_rate": 0.003, + "loss": 4.1718, + "step": 4830 + }, + { + "epoch": 0.04831, + "grad_norm": 0.9982066722626229, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 4831 + }, + { + "epoch": 0.04832, + "grad_norm": 0.7923904912144631, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 4832 + }, + { + "epoch": 0.04833, + "grad_norm": 0.7754468674602506, + "learning_rate": 0.003, + "loss": 4.173, + "step": 4833 + }, + { + "epoch": 0.04834, + "grad_norm": 0.6518452811577893, + "learning_rate": 0.003, + "loss": 4.134, + "step": 4834 + }, + { + "epoch": 0.04835, + "grad_norm": 0.7302318791564857, + "learning_rate": 0.003, + "loss": 4.1796, + "step": 4835 + }, + { + "epoch": 0.04836, + "grad_norm": 0.6773722956545634, + "learning_rate": 0.003, + "loss": 4.146, + "step": 4836 + }, + { + "epoch": 0.04837, + "grad_norm": 0.5824847865414027, + "learning_rate": 0.003, + "loss": 4.1626, + "step": 4837 + }, + { + "epoch": 0.04838, + "grad_norm": 0.5995229116547709, + "learning_rate": 0.003, + "loss": 4.1738, + "step": 4838 + }, + { + "epoch": 0.04839, + "grad_norm": 0.6433667604247222, + "learning_rate": 0.003, + "loss": 4.1595, + "step": 4839 + }, + { + "epoch": 0.0484, + "grad_norm": 0.6015686841024611, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 4840 + }, + { + "epoch": 0.04841, + "grad_norm": 0.513992549739537, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 4841 + }, + { + "epoch": 0.04842, + "grad_norm": 0.5245975343951695, + "learning_rate": 0.003, + "loss": 4.168, + "step": 4842 + }, + { + "epoch": 0.04843, + "grad_norm": 0.48743719353902437, + "learning_rate": 0.003, + "loss": 4.119, + "step": 4843 + }, + { + "epoch": 0.04844, + "grad_norm": 0.4307867246160189, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 4844 + }, + { + "epoch": 0.04845, + "grad_norm": 0.3751006510902314, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 4845 + }, + { + "epoch": 0.04846, + "grad_norm": 0.39953129595847076, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 4846 + }, + { + "epoch": 0.04847, + "grad_norm": 0.36070808802461135, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 4847 + }, + { + "epoch": 0.04848, + "grad_norm": 0.39594455872525836, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 4848 + }, + { + "epoch": 0.04849, + "grad_norm": 0.38000994045173514, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4849 + }, + { + "epoch": 0.0485, + "grad_norm": 0.4205813881723009, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 4850 + }, + { + "epoch": 0.04851, + "grad_norm": 0.4045018073402591, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 4851 + }, + { + "epoch": 0.04852, + "grad_norm": 0.37568412955203784, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 4852 + }, + { + "epoch": 0.04853, + "grad_norm": 0.3455978343249278, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 4853 + }, + { + "epoch": 0.04854, + "grad_norm": 0.3952113128712618, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 4854 + }, + { + "epoch": 0.04855, + "grad_norm": 0.5622316469812096, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 4855 + }, + { + "epoch": 0.04856, + "grad_norm": 0.8797178488644849, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 4856 + }, + { + "epoch": 0.04857, + "grad_norm": 1.1335123213922318, + "learning_rate": 0.003, + "loss": 4.1859, + "step": 4857 + }, + { + "epoch": 0.04858, + "grad_norm": 0.6147110414360265, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 4858 + }, + { + "epoch": 0.04859, + "grad_norm": 0.6973892953945595, + "learning_rate": 0.003, + "loss": 4.1734, + "step": 4859 + }, + { + "epoch": 0.0486, + "grad_norm": 0.7043154871004238, + "learning_rate": 0.003, + "loss": 4.1524, + "step": 4860 + }, + { + "epoch": 0.04861, + "grad_norm": 0.7327036767884083, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 4861 + }, + { + "epoch": 0.04862, + "grad_norm": 0.712112403189955, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 4862 + }, + { + "epoch": 0.04863, + "grad_norm": 0.7121663551182991, + "learning_rate": 0.003, + "loss": 4.1673, + "step": 4863 + }, + { + "epoch": 0.04864, + "grad_norm": 0.6310157347751136, + "learning_rate": 0.003, + "loss": 4.1417, + "step": 4864 + }, + { + "epoch": 0.04865, + "grad_norm": 0.6082126733812362, + "learning_rate": 0.003, + "loss": 4.1569, + "step": 4865 + }, + { + "epoch": 0.04866, + "grad_norm": 0.569379004286016, + "learning_rate": 0.003, + "loss": 4.1621, + "step": 4866 + }, + { + "epoch": 0.04867, + "grad_norm": 0.543390213497262, + "learning_rate": 0.003, + "loss": 4.1275, + "step": 4867 + }, + { + "epoch": 0.04868, + "grad_norm": 0.5636989030319737, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 4868 + }, + { + "epoch": 0.04869, + "grad_norm": 0.5250393396952697, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 4869 + }, + { + "epoch": 0.0487, + "grad_norm": 0.4712102787061369, + "learning_rate": 0.003, + "loss": 4.1341, + "step": 4870 + }, + { + "epoch": 0.04871, + "grad_norm": 0.4137409461601053, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 4871 + }, + { + "epoch": 0.04872, + "grad_norm": 0.34190339505881295, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 4872 + }, + { + "epoch": 0.04873, + "grad_norm": 0.3786358077114681, + "learning_rate": 0.003, + "loss": 4.138, + "step": 4873 + }, + { + "epoch": 0.04874, + "grad_norm": 0.3569458423955174, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 4874 + }, + { + "epoch": 0.04875, + "grad_norm": 0.34447042337429823, + "learning_rate": 0.003, + "loss": 4.119, + "step": 4875 + }, + { + "epoch": 0.04876, + "grad_norm": 0.40030916663729343, + "learning_rate": 0.003, + "loss": 4.121, + "step": 4876 + }, + { + "epoch": 0.04877, + "grad_norm": 0.4592360630373684, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 4877 + }, + { + "epoch": 0.04878, + "grad_norm": 0.47334555855094895, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 4878 + }, + { + "epoch": 0.04879, + "grad_norm": 0.4994212755863587, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 4879 + }, + { + "epoch": 0.0488, + "grad_norm": 0.5513466708644019, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 4880 + }, + { + "epoch": 0.04881, + "grad_norm": 0.6373315380482077, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 4881 + }, + { + "epoch": 0.04882, + "grad_norm": 0.6684216804530342, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 4882 + }, + { + "epoch": 0.04883, + "grad_norm": 0.5368064523422058, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 4883 + }, + { + "epoch": 0.04884, + "grad_norm": 0.5972330760546537, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 4884 + }, + { + "epoch": 0.04885, + "grad_norm": 0.8933715601802283, + "learning_rate": 0.003, + "loss": 4.1427, + "step": 4885 + }, + { + "epoch": 0.04886, + "grad_norm": 0.898355015821335, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 4886 + }, + { + "epoch": 0.04887, + "grad_norm": 0.7173276903905438, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 4887 + }, + { + "epoch": 0.04888, + "grad_norm": 0.6726150274910551, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 4888 + }, + { + "epoch": 0.04889, + "grad_norm": 0.6876961916754174, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 4889 + }, + { + "epoch": 0.0489, + "grad_norm": 0.6584901279572161, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 4890 + }, + { + "epoch": 0.04891, + "grad_norm": 0.5884964182869652, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 4891 + }, + { + "epoch": 0.04892, + "grad_norm": 0.7007131660081303, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 4892 + }, + { + "epoch": 0.04893, + "grad_norm": 0.856434726166467, + "learning_rate": 0.003, + "loss": 4.1573, + "step": 4893 + }, + { + "epoch": 0.04894, + "grad_norm": 0.8623969209321906, + "learning_rate": 0.003, + "loss": 4.1724, + "step": 4894 + }, + { + "epoch": 0.04895, + "grad_norm": 0.9386453675933166, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 4895 + }, + { + "epoch": 0.04896, + "grad_norm": 1.0225765758614995, + "learning_rate": 0.003, + "loss": 4.174, + "step": 4896 + }, + { + "epoch": 0.04897, + "grad_norm": 1.0592162822847404, + "learning_rate": 0.003, + "loss": 4.1956, + "step": 4897 + }, + { + "epoch": 0.04898, + "grad_norm": 0.9806770790608599, + "learning_rate": 0.003, + "loss": 4.1844, + "step": 4898 + }, + { + "epoch": 0.04899, + "grad_norm": 1.0087204735328514, + "learning_rate": 0.003, + "loss": 4.1796, + "step": 4899 + }, + { + "epoch": 0.049, + "grad_norm": 0.9807645706509096, + "learning_rate": 0.003, + "loss": 4.2071, + "step": 4900 + }, + { + "epoch": 0.04901, + "grad_norm": 0.8655711245910307, + "learning_rate": 0.003, + "loss": 4.1672, + "step": 4901 + }, + { + "epoch": 0.04902, + "grad_norm": 0.7265751482705578, + "learning_rate": 0.003, + "loss": 4.1956, + "step": 4902 + }, + { + "epoch": 0.04903, + "grad_norm": 0.7751913689828299, + "learning_rate": 0.003, + "loss": 4.1967, + "step": 4903 + }, + { + "epoch": 0.04904, + "grad_norm": 0.791355963573371, + "learning_rate": 0.003, + "loss": 4.1646, + "step": 4904 + }, + { + "epoch": 0.04905, + "grad_norm": 0.7148922976648996, + "learning_rate": 0.003, + "loss": 4.1585, + "step": 4905 + }, + { + "epoch": 0.04906, + "grad_norm": 0.6244001153297727, + "learning_rate": 0.003, + "loss": 4.1667, + "step": 4906 + }, + { + "epoch": 0.04907, + "grad_norm": 0.766532703122461, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 4907 + }, + { + "epoch": 0.04908, + "grad_norm": 0.9239125390916663, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 4908 + }, + { + "epoch": 0.04909, + "grad_norm": 0.8062598058153201, + "learning_rate": 0.003, + "loss": 4.1762, + "step": 4909 + }, + { + "epoch": 0.0491, + "grad_norm": 0.6645500078640559, + "learning_rate": 0.003, + "loss": 4.1636, + "step": 4910 + }, + { + "epoch": 0.04911, + "grad_norm": 0.6285244715158727, + "learning_rate": 0.003, + "loss": 4.1564, + "step": 4911 + }, + { + "epoch": 0.04912, + "grad_norm": 0.6953556617834721, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 4912 + }, + { + "epoch": 0.04913, + "grad_norm": 0.7284223880047193, + "learning_rate": 0.003, + "loss": 4.1577, + "step": 4913 + }, + { + "epoch": 0.04914, + "grad_norm": 0.7207287527598001, + "learning_rate": 0.003, + "loss": 4.165, + "step": 4914 + }, + { + "epoch": 0.04915, + "grad_norm": 0.598886194111445, + "learning_rate": 0.003, + "loss": 4.1661, + "step": 4915 + }, + { + "epoch": 0.04916, + "grad_norm": 0.575625916272145, + "learning_rate": 0.003, + "loss": 4.1602, + "step": 4916 + }, + { + "epoch": 0.04917, + "grad_norm": 0.6136059699261334, + "learning_rate": 0.003, + "loss": 4.117, + "step": 4917 + }, + { + "epoch": 0.04918, + "grad_norm": 0.5509547477979617, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 4918 + }, + { + "epoch": 0.04919, + "grad_norm": 0.4500315004245685, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4919 + }, + { + "epoch": 0.0492, + "grad_norm": 0.40527549449786815, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 4920 + }, + { + "epoch": 0.04921, + "grad_norm": 0.42108054279033674, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 4921 + }, + { + "epoch": 0.04922, + "grad_norm": 0.4404322304412163, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 4922 + }, + { + "epoch": 0.04923, + "grad_norm": 0.47502761296305623, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 4923 + }, + { + "epoch": 0.04924, + "grad_norm": 0.6308445729709988, + "learning_rate": 0.003, + "loss": 4.1681, + "step": 4924 + }, + { + "epoch": 0.04925, + "grad_norm": 0.8223519029190577, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 4925 + }, + { + "epoch": 0.04926, + "grad_norm": 0.7978760513652026, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 4926 + }, + { + "epoch": 0.04927, + "grad_norm": 0.5978087975120926, + "learning_rate": 0.003, + "loss": 4.1601, + "step": 4927 + }, + { + "epoch": 0.04928, + "grad_norm": 0.6116960538491134, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 4928 + }, + { + "epoch": 0.04929, + "grad_norm": 0.6956029366529681, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 4929 + }, + { + "epoch": 0.0493, + "grad_norm": 0.6193307396741099, + "learning_rate": 0.003, + "loss": 4.1285, + "step": 4930 + }, + { + "epoch": 0.04931, + "grad_norm": 0.5548956756238017, + "learning_rate": 0.003, + "loss": 4.11, + "step": 4931 + }, + { + "epoch": 0.04932, + "grad_norm": 0.5257796103914479, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 4932 + }, + { + "epoch": 0.04933, + "grad_norm": 0.525537631722876, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 4933 + }, + { + "epoch": 0.04934, + "grad_norm": 0.5597601882963287, + "learning_rate": 0.003, + "loss": 4.1608, + "step": 4934 + }, + { + "epoch": 0.04935, + "grad_norm": 0.5183399428365818, + "learning_rate": 0.003, + "loss": 4.155, + "step": 4935 + }, + { + "epoch": 0.04936, + "grad_norm": 0.45976573800777254, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 4936 + }, + { + "epoch": 0.04937, + "grad_norm": 0.4718016863136529, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 4937 + }, + { + "epoch": 0.04938, + "grad_norm": 0.5145237878577654, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 4938 + }, + { + "epoch": 0.04939, + "grad_norm": 0.49376291714918813, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 4939 + }, + { + "epoch": 0.0494, + "grad_norm": 0.40835265558787853, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 4940 + }, + { + "epoch": 0.04941, + "grad_norm": 0.39634154664022514, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 4941 + }, + { + "epoch": 0.04942, + "grad_norm": 0.3697169007385673, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 4942 + }, + { + "epoch": 0.04943, + "grad_norm": 0.3331096726429694, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 4943 + }, + { + "epoch": 0.04944, + "grad_norm": 0.35453569789946104, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 4944 + }, + { + "epoch": 0.04945, + "grad_norm": 0.3239762249913837, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 4945 + }, + { + "epoch": 0.04946, + "grad_norm": 0.31174772171665677, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 4946 + }, + { + "epoch": 0.04947, + "grad_norm": 0.2863836162320785, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 4947 + }, + { + "epoch": 0.04948, + "grad_norm": 0.2929298130193931, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 4948 + }, + { + "epoch": 0.04949, + "grad_norm": 0.3283619905331677, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 4949 + }, + { + "epoch": 0.0495, + "grad_norm": 0.35583655037221473, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 4950 + }, + { + "epoch": 0.04951, + "grad_norm": 0.4306179937269439, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 4951 + }, + { + "epoch": 0.04952, + "grad_norm": 0.5484424313628553, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 4952 + }, + { + "epoch": 0.04953, + "grad_norm": 0.683881072985923, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 4953 + }, + { + "epoch": 0.04954, + "grad_norm": 0.8387871275000114, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 4954 + }, + { + "epoch": 0.04955, + "grad_norm": 1.0719745768190279, + "learning_rate": 0.003, + "loss": 4.142, + "step": 4955 + }, + { + "epoch": 0.04956, + "grad_norm": 1.0142567681805608, + "learning_rate": 0.003, + "loss": 4.1641, + "step": 4956 + }, + { + "epoch": 0.04957, + "grad_norm": 0.7860187886136664, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 4957 + }, + { + "epoch": 0.04958, + "grad_norm": 0.7666806226784963, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 4958 + }, + { + "epoch": 0.04959, + "grad_norm": 0.7024610691029676, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 4959 + }, + { + "epoch": 0.0496, + "grad_norm": 0.634345285618044, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 4960 + }, + { + "epoch": 0.04961, + "grad_norm": 0.5794913569394096, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 4961 + }, + { + "epoch": 0.04962, + "grad_norm": 0.6259340689173926, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 4962 + }, + { + "epoch": 0.04963, + "grad_norm": 0.6609688779347547, + "learning_rate": 0.003, + "loss": 4.176, + "step": 4963 + }, + { + "epoch": 0.04964, + "grad_norm": 0.6719506081449138, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 4964 + }, + { + "epoch": 0.04965, + "grad_norm": 0.6471802630352936, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 4965 + }, + { + "epoch": 0.04966, + "grad_norm": 0.7401257014237659, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 4966 + }, + { + "epoch": 0.04967, + "grad_norm": 0.7264688238953103, + "learning_rate": 0.003, + "loss": 4.1502, + "step": 4967 + }, + { + "epoch": 0.04968, + "grad_norm": 0.6850630441933709, + "learning_rate": 0.003, + "loss": 4.1738, + "step": 4968 + }, + { + "epoch": 0.04969, + "grad_norm": 0.7103603969004639, + "learning_rate": 0.003, + "loss": 4.148, + "step": 4969 + }, + { + "epoch": 0.0497, + "grad_norm": 0.7119697504065976, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 4970 + }, + { + "epoch": 0.04971, + "grad_norm": 0.6691773043773404, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 4971 + }, + { + "epoch": 0.04972, + "grad_norm": 0.7584471656947329, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 4972 + }, + { + "epoch": 0.04973, + "grad_norm": 0.7074430853606097, + "learning_rate": 0.003, + "loss": 4.1896, + "step": 4973 + }, + { + "epoch": 0.04974, + "grad_norm": 0.5713006659555607, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 4974 + }, + { + "epoch": 0.04975, + "grad_norm": 0.5130707295349296, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 4975 + }, + { + "epoch": 0.04976, + "grad_norm": 0.476744226744805, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 4976 + }, + { + "epoch": 0.04977, + "grad_norm": 0.436532472120347, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 4977 + }, + { + "epoch": 0.04978, + "grad_norm": 0.4481149086483841, + "learning_rate": 0.003, + "loss": 4.1405, + "step": 4978 + }, + { + "epoch": 0.04979, + "grad_norm": 0.4658196526981218, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 4979 + }, + { + "epoch": 0.0498, + "grad_norm": 0.48184153753418996, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 4980 + }, + { + "epoch": 0.04981, + "grad_norm": 0.5711200331385257, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 4981 + }, + { + "epoch": 0.04982, + "grad_norm": 0.7811982054881438, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 4982 + }, + { + "epoch": 0.04983, + "grad_norm": 0.9119195104356123, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 4983 + }, + { + "epoch": 0.04984, + "grad_norm": 0.809114284515937, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 4984 + }, + { + "epoch": 0.04985, + "grad_norm": 0.6537212838716787, + "learning_rate": 0.003, + "loss": 4.1368, + "step": 4985 + }, + { + "epoch": 0.04986, + "grad_norm": 0.6869403529206495, + "learning_rate": 0.003, + "loss": 4.1589, + "step": 4986 + }, + { + "epoch": 0.04987, + "grad_norm": 0.613841094604171, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 4987 + }, + { + "epoch": 0.04988, + "grad_norm": 0.6618484956538617, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 4988 + }, + { + "epoch": 0.04989, + "grad_norm": 0.5936176795447469, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 4989 + }, + { + "epoch": 0.0499, + "grad_norm": 0.5392404418710565, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 4990 + }, + { + "epoch": 0.04991, + "grad_norm": 0.621246461554044, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 4991 + }, + { + "epoch": 0.04992, + "grad_norm": 0.670783622770409, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 4992 + }, + { + "epoch": 0.04993, + "grad_norm": 0.6045345128052574, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 4993 + }, + { + "epoch": 0.04994, + "grad_norm": 0.6101486114599416, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 4994 + }, + { + "epoch": 0.04995, + "grad_norm": 0.55705863270894, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 4995 + }, + { + "epoch": 0.04996, + "grad_norm": 0.44788775188911484, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 4996 + }, + { + "epoch": 0.04997, + "grad_norm": 0.4309530448591752, + "learning_rate": 0.003, + "loss": 4.13, + "step": 4997 + }, + { + "epoch": 0.04998, + "grad_norm": 0.428484725286708, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 4998 + }, + { + "epoch": 0.04999, + "grad_norm": 0.41683300725155314, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 4999 + }, + { + "epoch": 0.05, + "grad_norm": 0.3880481560630285, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 5000 + }, + { + "epoch": 0.05001, + "grad_norm": 0.44002892670528426, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 5001 + }, + { + "epoch": 0.05002, + "grad_norm": 0.5267413563943687, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5002 + }, + { + "epoch": 0.05003, + "grad_norm": 0.6974673629709173, + "learning_rate": 0.003, + "loss": 4.14, + "step": 5003 + }, + { + "epoch": 0.05004, + "grad_norm": 0.858937261820554, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 5004 + }, + { + "epoch": 0.05005, + "grad_norm": 0.9141406787634108, + "learning_rate": 0.003, + "loss": 4.1638, + "step": 5005 + }, + { + "epoch": 0.05006, + "grad_norm": 0.687864263963555, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 5006 + }, + { + "epoch": 0.05007, + "grad_norm": 0.5773822019308112, + "learning_rate": 0.003, + "loss": 4.147, + "step": 5007 + }, + { + "epoch": 0.05008, + "grad_norm": 0.5547957033613983, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 5008 + }, + { + "epoch": 0.05009, + "grad_norm": 0.5411686344353427, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 5009 + }, + { + "epoch": 0.0501, + "grad_norm": 0.489119317455081, + "learning_rate": 0.003, + "loss": 4.134, + "step": 5010 + }, + { + "epoch": 0.05011, + "grad_norm": 0.464838598175237, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 5011 + }, + { + "epoch": 0.05012, + "grad_norm": 0.4580943694467521, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 5012 + }, + { + "epoch": 0.05013, + "grad_norm": 0.4808521248590666, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 5013 + }, + { + "epoch": 0.05014, + "grad_norm": 0.5229887414294506, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 5014 + }, + { + "epoch": 0.05015, + "grad_norm": 0.5777151908604863, + "learning_rate": 0.003, + "loss": 4.14, + "step": 5015 + }, + { + "epoch": 0.05016, + "grad_norm": 0.5235213366994899, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 5016 + }, + { + "epoch": 0.05017, + "grad_norm": 0.445246255748685, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 5017 + }, + { + "epoch": 0.05018, + "grad_norm": 0.45246478410906943, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 5018 + }, + { + "epoch": 0.05019, + "grad_norm": 0.5231762403732283, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 5019 + }, + { + "epoch": 0.0502, + "grad_norm": 0.6594471849420648, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 5020 + }, + { + "epoch": 0.05021, + "grad_norm": 0.6904442298729323, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 5021 + }, + { + "epoch": 0.05022, + "grad_norm": 0.5909951697887482, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 5022 + }, + { + "epoch": 0.05023, + "grad_norm": 0.577685062094026, + "learning_rate": 0.003, + "loss": 4.1484, + "step": 5023 + }, + { + "epoch": 0.05024, + "grad_norm": 0.5728157983655138, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 5024 + }, + { + "epoch": 0.05025, + "grad_norm": 0.5379702885678885, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 5025 + }, + { + "epoch": 0.05026, + "grad_norm": 0.6155371625202312, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 5026 + }, + { + "epoch": 0.05027, + "grad_norm": 0.7519388420060837, + "learning_rate": 0.003, + "loss": 4.1457, + "step": 5027 + }, + { + "epoch": 0.05028, + "grad_norm": 0.9311839881105047, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 5028 + }, + { + "epoch": 0.05029, + "grad_norm": 0.9260888417245792, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 5029 + }, + { + "epoch": 0.0503, + "grad_norm": 0.7728492641666489, + "learning_rate": 0.003, + "loss": 4.123, + "step": 5030 + }, + { + "epoch": 0.05031, + "grad_norm": 0.777930269535098, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 5031 + }, + { + "epoch": 0.05032, + "grad_norm": 0.7350784470721563, + "learning_rate": 0.003, + "loss": 4.145, + "step": 5032 + }, + { + "epoch": 0.05033, + "grad_norm": 0.5291553968829333, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 5033 + }, + { + "epoch": 0.05034, + "grad_norm": 0.4919036368843819, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 5034 + }, + { + "epoch": 0.05035, + "grad_norm": 0.4975664864391519, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 5035 + }, + { + "epoch": 0.05036, + "grad_norm": 0.5163959831953332, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 5036 + }, + { + "epoch": 0.05037, + "grad_norm": 0.5569175575619792, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 5037 + }, + { + "epoch": 0.05038, + "grad_norm": 0.5227477080467972, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 5038 + }, + { + "epoch": 0.05039, + "grad_norm": 0.5597860102419677, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 5039 + }, + { + "epoch": 0.0504, + "grad_norm": 0.6074226279213635, + "learning_rate": 0.003, + "loss": 4.1376, + "step": 5040 + }, + { + "epoch": 0.05041, + "grad_norm": 0.6238557653084295, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 5041 + }, + { + "epoch": 0.05042, + "grad_norm": 0.6153407136204087, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 5042 + }, + { + "epoch": 0.05043, + "grad_norm": 0.6514614471239298, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 5043 + }, + { + "epoch": 0.05044, + "grad_norm": 0.6930287002961413, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 5044 + }, + { + "epoch": 0.05045, + "grad_norm": 0.8304592547990185, + "learning_rate": 0.003, + "loss": 4.1731, + "step": 5045 + }, + { + "epoch": 0.05046, + "grad_norm": 0.8026034540044529, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 5046 + }, + { + "epoch": 0.05047, + "grad_norm": 0.9245681532952602, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 5047 + }, + { + "epoch": 0.05048, + "grad_norm": 0.8960724676621618, + "learning_rate": 0.003, + "loss": 4.1587, + "step": 5048 + }, + { + "epoch": 0.05049, + "grad_norm": 1.0151937627560343, + "learning_rate": 0.003, + "loss": 4.1266, + "step": 5049 + }, + { + "epoch": 0.0505, + "grad_norm": 0.884162859927295, + "learning_rate": 0.003, + "loss": 4.2198, + "step": 5050 + }, + { + "epoch": 0.05051, + "grad_norm": 0.8307433200396237, + "learning_rate": 0.003, + "loss": 4.1792, + "step": 5051 + }, + { + "epoch": 0.05052, + "grad_norm": 0.7384842172109713, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 5052 + }, + { + "epoch": 0.05053, + "grad_norm": 0.7203012020157923, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 5053 + }, + { + "epoch": 0.05054, + "grad_norm": 0.7327188596143545, + "learning_rate": 0.003, + "loss": 4.1775, + "step": 5054 + }, + { + "epoch": 0.05055, + "grad_norm": 0.7559923263710384, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 5055 + }, + { + "epoch": 0.05056, + "grad_norm": 0.6831499776250408, + "learning_rate": 0.003, + "loss": 4.1645, + "step": 5056 + }, + { + "epoch": 0.05057, + "grad_norm": 0.6878997827607911, + "learning_rate": 0.003, + "loss": 4.169, + "step": 5057 + }, + { + "epoch": 0.05058, + "grad_norm": 0.6963511057251066, + "learning_rate": 0.003, + "loss": 4.1689, + "step": 5058 + }, + { + "epoch": 0.05059, + "grad_norm": 0.7218382309881082, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 5059 + }, + { + "epoch": 0.0506, + "grad_norm": 0.7641404073482049, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 5060 + }, + { + "epoch": 0.05061, + "grad_norm": 0.7492054114953671, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 5061 + }, + { + "epoch": 0.05062, + "grad_norm": 0.580002983697803, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 5062 + }, + { + "epoch": 0.05063, + "grad_norm": 0.465857319923329, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 5063 + }, + { + "epoch": 0.05064, + "grad_norm": 0.4639475584360793, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 5064 + }, + { + "epoch": 0.05065, + "grad_norm": 0.4316878203187786, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 5065 + }, + { + "epoch": 0.05066, + "grad_norm": 0.4129716036113381, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 5066 + }, + { + "epoch": 0.05067, + "grad_norm": 0.3545337277972109, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 5067 + }, + { + "epoch": 0.05068, + "grad_norm": 0.35000428560320695, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 5068 + }, + { + "epoch": 0.05069, + "grad_norm": 0.28475691607233394, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 5069 + }, + { + "epoch": 0.0507, + "grad_norm": 0.3211180881977199, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 5070 + }, + { + "epoch": 0.05071, + "grad_norm": 0.31520195677598756, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 5071 + }, + { + "epoch": 0.05072, + "grad_norm": 0.33671460211119836, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 5072 + }, + { + "epoch": 0.05073, + "grad_norm": 0.33769028368475607, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 5073 + }, + { + "epoch": 0.05074, + "grad_norm": 0.3448787140101478, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 5074 + }, + { + "epoch": 0.05075, + "grad_norm": 0.3957609850918836, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 5075 + }, + { + "epoch": 0.05076, + "grad_norm": 0.48241503749103687, + "learning_rate": 0.003, + "loss": 4.139, + "step": 5076 + }, + { + "epoch": 0.05077, + "grad_norm": 0.6132233951932243, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 5077 + }, + { + "epoch": 0.05078, + "grad_norm": 0.7745174445055403, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 5078 + }, + { + "epoch": 0.05079, + "grad_norm": 0.8279381574948442, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5079 + }, + { + "epoch": 0.0508, + "grad_norm": 0.8263557958238591, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 5080 + }, + { + "epoch": 0.05081, + "grad_norm": 0.8277412011902472, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 5081 + }, + { + "epoch": 0.05082, + "grad_norm": 0.7915182287182168, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 5082 + }, + { + "epoch": 0.05083, + "grad_norm": 0.7357570092313975, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 5083 + }, + { + "epoch": 0.05084, + "grad_norm": 0.9633741172415596, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 5084 + }, + { + "epoch": 0.05085, + "grad_norm": 1.0990777684274788, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 5085 + }, + { + "epoch": 0.05086, + "grad_norm": 0.7846549440538764, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 5086 + }, + { + "epoch": 0.05087, + "grad_norm": 0.6846851816877524, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 5087 + }, + { + "epoch": 0.05088, + "grad_norm": 0.6318100802689848, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 5088 + }, + { + "epoch": 0.05089, + "grad_norm": 0.6520703017626633, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 5089 + }, + { + "epoch": 0.0509, + "grad_norm": 0.5565734582358871, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 5090 + }, + { + "epoch": 0.05091, + "grad_norm": 0.5349654586172464, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 5091 + }, + { + "epoch": 0.05092, + "grad_norm": 0.5297741402897067, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 5092 + }, + { + "epoch": 0.05093, + "grad_norm": 0.5575684104199916, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 5093 + }, + { + "epoch": 0.05094, + "grad_norm": 0.5605888909754495, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 5094 + }, + { + "epoch": 0.05095, + "grad_norm": 0.5343018695121468, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 5095 + }, + { + "epoch": 0.05096, + "grad_norm": 0.43949601788526343, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 5096 + }, + { + "epoch": 0.05097, + "grad_norm": 0.4810979086570852, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 5097 + }, + { + "epoch": 0.05098, + "grad_norm": 0.4654228949280776, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 5098 + }, + { + "epoch": 0.05099, + "grad_norm": 0.40501669029112214, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 5099 + }, + { + "epoch": 0.051, + "grad_norm": 0.4223377546263004, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 5100 + }, + { + "epoch": 0.05101, + "grad_norm": 0.44525788101780217, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 5101 + }, + { + "epoch": 0.05102, + "grad_norm": 0.4871872595505277, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 5102 + }, + { + "epoch": 0.05103, + "grad_norm": 0.5447347242656629, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 5103 + }, + { + "epoch": 0.05104, + "grad_norm": 0.5964115151305719, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 5104 + }, + { + "epoch": 0.05105, + "grad_norm": 0.5867716136824593, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 5105 + }, + { + "epoch": 0.05106, + "grad_norm": 0.5562979835891493, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 5106 + }, + { + "epoch": 0.05107, + "grad_norm": 0.5181396244592503, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 5107 + }, + { + "epoch": 0.05108, + "grad_norm": 0.5440116113449289, + "learning_rate": 0.003, + "loss": 4.1537, + "step": 5108 + }, + { + "epoch": 0.05109, + "grad_norm": 0.5961075613550932, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 5109 + }, + { + "epoch": 0.0511, + "grad_norm": 0.533390436132316, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 5110 + }, + { + "epoch": 0.05111, + "grad_norm": 0.48024251864431305, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 5111 + }, + { + "epoch": 0.05112, + "grad_norm": 0.4598495045627523, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 5112 + }, + { + "epoch": 0.05113, + "grad_norm": 0.5112726037297928, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 5113 + }, + { + "epoch": 0.05114, + "grad_norm": 0.6162499535834111, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 5114 + }, + { + "epoch": 0.05115, + "grad_norm": 0.6330435450833568, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 5115 + }, + { + "epoch": 0.05116, + "grad_norm": 0.7922157375423148, + "learning_rate": 0.003, + "loss": 4.1422, + "step": 5116 + }, + { + "epoch": 0.05117, + "grad_norm": 0.8777485197768994, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 5117 + }, + { + "epoch": 0.05118, + "grad_norm": 0.8487101677130807, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 5118 + }, + { + "epoch": 0.05119, + "grad_norm": 0.6672995585854586, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 5119 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5347949292529923, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 5120 + }, + { + "epoch": 0.05121, + "grad_norm": 0.47651085513877506, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 5121 + }, + { + "epoch": 0.05122, + "grad_norm": 0.5586400651301906, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 5122 + }, + { + "epoch": 0.05123, + "grad_norm": 0.6300817408188087, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 5123 + }, + { + "epoch": 0.05124, + "grad_norm": 0.6701205648572075, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 5124 + }, + { + "epoch": 0.05125, + "grad_norm": 0.6468239470621561, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 5125 + }, + { + "epoch": 0.05126, + "grad_norm": 0.5445267421148194, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 5126 + }, + { + "epoch": 0.05127, + "grad_norm": 0.4799763707457368, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 5127 + }, + { + "epoch": 0.05128, + "grad_norm": 0.5803016836080052, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 5128 + }, + { + "epoch": 0.05129, + "grad_norm": 0.678952357079139, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 5129 + }, + { + "epoch": 0.0513, + "grad_norm": 0.8039176919182028, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 5130 + }, + { + "epoch": 0.05131, + "grad_norm": 0.8445292736195356, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 5131 + }, + { + "epoch": 0.05132, + "grad_norm": 0.8088539903803569, + "learning_rate": 0.003, + "loss": 4.131, + "step": 5132 + }, + { + "epoch": 0.05133, + "grad_norm": 0.8452742411739833, + "learning_rate": 0.003, + "loss": 4.1457, + "step": 5133 + }, + { + "epoch": 0.05134, + "grad_norm": 0.7211547370019534, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 5134 + }, + { + "epoch": 0.05135, + "grad_norm": 0.7552143125752953, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5135 + }, + { + "epoch": 0.05136, + "grad_norm": 0.8178560583882517, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 5136 + }, + { + "epoch": 0.05137, + "grad_norm": 0.7937255401431907, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 5137 + }, + { + "epoch": 0.05138, + "grad_norm": 0.7029385951002983, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 5138 + }, + { + "epoch": 0.05139, + "grad_norm": 0.755399264561879, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 5139 + }, + { + "epoch": 0.0514, + "grad_norm": 0.6372525763339177, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 5140 + }, + { + "epoch": 0.05141, + "grad_norm": 0.6168576930336179, + "learning_rate": 0.003, + "loss": 4.1854, + "step": 5141 + }, + { + "epoch": 0.05142, + "grad_norm": 0.6036281974051024, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 5142 + }, + { + "epoch": 0.05143, + "grad_norm": 0.5036860750600824, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 5143 + }, + { + "epoch": 0.05144, + "grad_norm": 0.46857974439093014, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 5144 + }, + { + "epoch": 0.05145, + "grad_norm": 0.42494497123142294, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 5145 + }, + { + "epoch": 0.05146, + "grad_norm": 0.44380638929392596, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 5146 + }, + { + "epoch": 0.05147, + "grad_norm": 0.40327298093540914, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 5147 + }, + { + "epoch": 0.05148, + "grad_norm": 0.3953841068063316, + "learning_rate": 0.003, + "loss": 4.128, + "step": 5148 + }, + { + "epoch": 0.05149, + "grad_norm": 0.4012036518868883, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 5149 + }, + { + "epoch": 0.0515, + "grad_norm": 0.39949323869855685, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 5150 + }, + { + "epoch": 0.05151, + "grad_norm": 0.3876338989691726, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 5151 + }, + { + "epoch": 0.05152, + "grad_norm": 0.43621474925166587, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 5152 + }, + { + "epoch": 0.05153, + "grad_norm": 0.5301350008384232, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 5153 + }, + { + "epoch": 0.05154, + "grad_norm": 0.7035426773318153, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 5154 + }, + { + "epoch": 0.05155, + "grad_norm": 0.9094366977526885, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 5155 + }, + { + "epoch": 0.05156, + "grad_norm": 0.8124683719101861, + "learning_rate": 0.003, + "loss": 4.1683, + "step": 5156 + }, + { + "epoch": 0.05157, + "grad_norm": 0.6963497652538516, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 5157 + }, + { + "epoch": 0.05158, + "grad_norm": 0.7426530943881816, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 5158 + }, + { + "epoch": 0.05159, + "grad_norm": 0.6705321816606814, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 5159 + }, + { + "epoch": 0.0516, + "grad_norm": 0.6026791060312885, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 5160 + }, + { + "epoch": 0.05161, + "grad_norm": 0.6460386669653281, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 5161 + }, + { + "epoch": 0.05162, + "grad_norm": 0.7274049490439595, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 5162 + }, + { + "epoch": 0.05163, + "grad_norm": 0.6827610438042071, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 5163 + }, + { + "epoch": 0.05164, + "grad_norm": 0.6633511077303998, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 5164 + }, + { + "epoch": 0.05165, + "grad_norm": 0.8069016890146392, + "learning_rate": 0.003, + "loss": 4.1464, + "step": 5165 + }, + { + "epoch": 0.05166, + "grad_norm": 0.7564185954506445, + "learning_rate": 0.003, + "loss": 4.1466, + "step": 5166 + }, + { + "epoch": 0.05167, + "grad_norm": 0.6729290579348203, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 5167 + }, + { + "epoch": 0.05168, + "grad_norm": 0.681973653519437, + "learning_rate": 0.003, + "loss": 4.1784, + "step": 5168 + }, + { + "epoch": 0.05169, + "grad_norm": 0.6894995787523694, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 5169 + }, + { + "epoch": 0.0517, + "grad_norm": 0.6251406068036489, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 5170 + }, + { + "epoch": 0.05171, + "grad_norm": 0.6022206459158862, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 5171 + }, + { + "epoch": 0.05172, + "grad_norm": 0.5547021446284994, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 5172 + }, + { + "epoch": 0.05173, + "grad_norm": 0.5511637086768453, + "learning_rate": 0.003, + "loss": 4.1536, + "step": 5173 + }, + { + "epoch": 0.05174, + "grad_norm": 0.549834258038828, + "learning_rate": 0.003, + "loss": 4.1405, + "step": 5174 + }, + { + "epoch": 0.05175, + "grad_norm": 0.5143957250446763, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 5175 + }, + { + "epoch": 0.05176, + "grad_norm": 0.5016512485210078, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 5176 + }, + { + "epoch": 0.05177, + "grad_norm": 0.5010375401941776, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 5177 + }, + { + "epoch": 0.05178, + "grad_norm": 0.45886355153966824, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 5178 + }, + { + "epoch": 0.05179, + "grad_norm": 0.4774614456980692, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 5179 + }, + { + "epoch": 0.0518, + "grad_norm": 0.5343856749000508, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 5180 + }, + { + "epoch": 0.05181, + "grad_norm": 0.5847156904354145, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 5181 + }, + { + "epoch": 0.05182, + "grad_norm": 0.7281346718853275, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 5182 + }, + { + "epoch": 0.05183, + "grad_norm": 0.7959120436672125, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 5183 + }, + { + "epoch": 0.05184, + "grad_norm": 0.7708701874016135, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 5184 + }, + { + "epoch": 0.05185, + "grad_norm": 0.7830980202404018, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 5185 + }, + { + "epoch": 0.05186, + "grad_norm": 0.6793457202900599, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 5186 + }, + { + "epoch": 0.05187, + "grad_norm": 0.6855736266075582, + "learning_rate": 0.003, + "loss": 4.171, + "step": 5187 + }, + { + "epoch": 0.05188, + "grad_norm": 0.760859985233772, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 5188 + }, + { + "epoch": 0.05189, + "grad_norm": 0.7264115896262829, + "learning_rate": 0.003, + "loss": 4.124, + "step": 5189 + }, + { + "epoch": 0.0519, + "grad_norm": 0.7175290962540748, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5190 + }, + { + "epoch": 0.05191, + "grad_norm": 0.6496658085565145, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 5191 + }, + { + "epoch": 0.05192, + "grad_norm": 0.6267055244498101, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 5192 + }, + { + "epoch": 0.05193, + "grad_norm": 0.5727047155848178, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 5193 + }, + { + "epoch": 0.05194, + "grad_norm": 0.5491895800888712, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 5194 + }, + { + "epoch": 0.05195, + "grad_norm": 0.5312113948934261, + "learning_rate": 0.003, + "loss": 4.1422, + "step": 5195 + }, + { + "epoch": 0.05196, + "grad_norm": 0.5838275239944962, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 5196 + }, + { + "epoch": 0.05197, + "grad_norm": 0.5319117287619517, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 5197 + }, + { + "epoch": 0.05198, + "grad_norm": 0.474231170755562, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 5198 + }, + { + "epoch": 0.05199, + "grad_norm": 0.5339097503934678, + "learning_rate": 0.003, + "loss": 4.098, + "step": 5199 + }, + { + "epoch": 0.052, + "grad_norm": 0.552513644020886, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 5200 + }, + { + "epoch": 0.05201, + "grad_norm": 0.5499912684018807, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 5201 + }, + { + "epoch": 0.05202, + "grad_norm": 0.5884166479851186, + "learning_rate": 0.003, + "loss": 4.114, + "step": 5202 + }, + { + "epoch": 0.05203, + "grad_norm": 0.6234832076161475, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 5203 + }, + { + "epoch": 0.05204, + "grad_norm": 0.6370950178244867, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 5204 + }, + { + "epoch": 0.05205, + "grad_norm": 0.7268995146225012, + "learning_rate": 0.003, + "loss": 4.1446, + "step": 5205 + }, + { + "epoch": 0.05206, + "grad_norm": 0.8429209951426253, + "learning_rate": 0.003, + "loss": 4.1341, + "step": 5206 + }, + { + "epoch": 0.05207, + "grad_norm": 0.891067417785423, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 5207 + }, + { + "epoch": 0.05208, + "grad_norm": 0.880801827049288, + "learning_rate": 0.003, + "loss": 4.1419, + "step": 5208 + }, + { + "epoch": 0.05209, + "grad_norm": 0.743628891207245, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 5209 + }, + { + "epoch": 0.0521, + "grad_norm": 0.6585381210736347, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 5210 + }, + { + "epoch": 0.05211, + "grad_norm": 0.6590374105413944, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5211 + }, + { + "epoch": 0.05212, + "grad_norm": 0.5838327606908597, + "learning_rate": 0.003, + "loss": 4.1734, + "step": 5212 + }, + { + "epoch": 0.05213, + "grad_norm": 0.5577967457265012, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 5213 + }, + { + "epoch": 0.05214, + "grad_norm": 0.5831733721175694, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 5214 + }, + { + "epoch": 0.05215, + "grad_norm": 0.5947055436999698, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 5215 + }, + { + "epoch": 0.05216, + "grad_norm": 0.5162752873889312, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 5216 + }, + { + "epoch": 0.05217, + "grad_norm": 0.5215914347147623, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 5217 + }, + { + "epoch": 0.05218, + "grad_norm": 0.495811759691527, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 5218 + }, + { + "epoch": 0.05219, + "grad_norm": 0.5616102027479555, + "learning_rate": 0.003, + "loss": 4.1402, + "step": 5219 + }, + { + "epoch": 0.0522, + "grad_norm": 0.596993154601823, + "learning_rate": 0.003, + "loss": 4.107, + "step": 5220 + }, + { + "epoch": 0.05221, + "grad_norm": 0.6135656226590566, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 5221 + }, + { + "epoch": 0.05222, + "grad_norm": 0.6651818813368179, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 5222 + }, + { + "epoch": 0.05223, + "grad_norm": 0.693814397802243, + "learning_rate": 0.003, + "loss": 4.1454, + "step": 5223 + }, + { + "epoch": 0.05224, + "grad_norm": 0.6753245461392549, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5224 + }, + { + "epoch": 0.05225, + "grad_norm": 0.5962298095800613, + "learning_rate": 0.003, + "loss": 4.145, + "step": 5225 + }, + { + "epoch": 0.05226, + "grad_norm": 0.5681174514487727, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 5226 + }, + { + "epoch": 0.05227, + "grad_norm": 0.6355387480928073, + "learning_rate": 0.003, + "loss": 4.1595, + "step": 5227 + }, + { + "epoch": 0.05228, + "grad_norm": 0.63070568269053, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 5228 + }, + { + "epoch": 0.05229, + "grad_norm": 0.6541251553608277, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 5229 + }, + { + "epoch": 0.0523, + "grad_norm": 0.5976720272605445, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 5230 + }, + { + "epoch": 0.05231, + "grad_norm": 0.5666985347278075, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5231 + }, + { + "epoch": 0.05232, + "grad_norm": 0.5817937128133905, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 5232 + }, + { + "epoch": 0.05233, + "grad_norm": 0.5938688559238984, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 5233 + }, + { + "epoch": 0.05234, + "grad_norm": 0.5954347096748437, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 5234 + }, + { + "epoch": 0.05235, + "grad_norm": 0.5732073934324781, + "learning_rate": 0.003, + "loss": 4.127, + "step": 5235 + }, + { + "epoch": 0.05236, + "grad_norm": 0.5696284586443667, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 5236 + }, + { + "epoch": 0.05237, + "grad_norm": 0.5556858968355276, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 5237 + }, + { + "epoch": 0.05238, + "grad_norm": 0.5512655986372486, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 5238 + }, + { + "epoch": 0.05239, + "grad_norm": 0.6544063071682276, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 5239 + }, + { + "epoch": 0.0524, + "grad_norm": 0.7356500161192003, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 5240 + }, + { + "epoch": 0.05241, + "grad_norm": 0.88373689710533, + "learning_rate": 0.003, + "loss": 4.1526, + "step": 5241 + }, + { + "epoch": 0.05242, + "grad_norm": 0.811345798439545, + "learning_rate": 0.003, + "loss": 4.1503, + "step": 5242 + }, + { + "epoch": 0.05243, + "grad_norm": 0.7905544586858798, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 5243 + }, + { + "epoch": 0.05244, + "grad_norm": 0.7298808104661723, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 5244 + }, + { + "epoch": 0.05245, + "grad_norm": 0.7094573896220808, + "learning_rate": 0.003, + "loss": 4.1281, + "step": 5245 + }, + { + "epoch": 0.05246, + "grad_norm": 0.7963225003446655, + "learning_rate": 0.003, + "loss": 4.134, + "step": 5246 + }, + { + "epoch": 0.05247, + "grad_norm": 0.902516557295206, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 5247 + }, + { + "epoch": 0.05248, + "grad_norm": 0.7878245147008873, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 5248 + }, + { + "epoch": 0.05249, + "grad_norm": 0.6945422313261818, + "learning_rate": 0.003, + "loss": 4.1562, + "step": 5249 + }, + { + "epoch": 0.0525, + "grad_norm": 0.622441402060283, + "learning_rate": 0.003, + "loss": 4.146, + "step": 5250 + }, + { + "epoch": 0.05251, + "grad_norm": 0.5779397363926364, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 5251 + }, + { + "epoch": 0.05252, + "grad_norm": 0.577099244293535, + "learning_rate": 0.003, + "loss": 4.1569, + "step": 5252 + }, + { + "epoch": 0.05253, + "grad_norm": 0.5742421525793271, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5253 + }, + { + "epoch": 0.05254, + "grad_norm": 0.5856615794186555, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 5254 + }, + { + "epoch": 0.05255, + "grad_norm": 0.5988982637518733, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 5255 + }, + { + "epoch": 0.05256, + "grad_norm": 0.6516682432466444, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 5256 + }, + { + "epoch": 0.05257, + "grad_norm": 0.7521657888490443, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 5257 + }, + { + "epoch": 0.05258, + "grad_norm": 0.7752727754327018, + "learning_rate": 0.003, + "loss": 4.138, + "step": 5258 + }, + { + "epoch": 0.05259, + "grad_norm": 0.7507817009513204, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 5259 + }, + { + "epoch": 0.0526, + "grad_norm": 0.8283788110748729, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 5260 + }, + { + "epoch": 0.05261, + "grad_norm": 0.9839152508479165, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 5261 + }, + { + "epoch": 0.05262, + "grad_norm": 0.8181134341889587, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 5262 + }, + { + "epoch": 0.05263, + "grad_norm": 0.8179493296972696, + "learning_rate": 0.003, + "loss": 4.1756, + "step": 5263 + }, + { + "epoch": 0.05264, + "grad_norm": 0.8507075051576984, + "learning_rate": 0.003, + "loss": 4.1758, + "step": 5264 + }, + { + "epoch": 0.05265, + "grad_norm": 0.7875040331282762, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 5265 + }, + { + "epoch": 0.05266, + "grad_norm": 0.7660878019896183, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 5266 + }, + { + "epoch": 0.05267, + "grad_norm": 0.7381530659889034, + "learning_rate": 0.003, + "loss": 4.1753, + "step": 5267 + }, + { + "epoch": 0.05268, + "grad_norm": 0.7516718782373394, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 5268 + }, + { + "epoch": 0.05269, + "grad_norm": 0.7754342018650271, + "learning_rate": 0.003, + "loss": 4.117, + "step": 5269 + }, + { + "epoch": 0.0527, + "grad_norm": 0.7814902853941849, + "learning_rate": 0.003, + "loss": 4.1535, + "step": 5270 + }, + { + "epoch": 0.05271, + "grad_norm": 0.7175519140149296, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 5271 + }, + { + "epoch": 0.05272, + "grad_norm": 0.6180907761221967, + "learning_rate": 0.003, + "loss": 4.166, + "step": 5272 + }, + { + "epoch": 0.05273, + "grad_norm": 0.5866907243765208, + "learning_rate": 0.003, + "loss": 4.1508, + "step": 5273 + }, + { + "epoch": 0.05274, + "grad_norm": 0.7212098565817949, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 5274 + }, + { + "epoch": 0.05275, + "grad_norm": 0.745726351447074, + "learning_rate": 0.003, + "loss": 4.1373, + "step": 5275 + }, + { + "epoch": 0.05276, + "grad_norm": 0.6574672838634651, + "learning_rate": 0.003, + "loss": 4.124, + "step": 5276 + }, + { + "epoch": 0.05277, + "grad_norm": 0.6169430231237321, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 5277 + }, + { + "epoch": 0.05278, + "grad_norm": 0.6357928590591313, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 5278 + }, + { + "epoch": 0.05279, + "grad_norm": 0.6803536948880607, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5279 + }, + { + "epoch": 0.0528, + "grad_norm": 0.6367797725535698, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 5280 + }, + { + "epoch": 0.05281, + "grad_norm": 0.5426507335239904, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 5281 + }, + { + "epoch": 0.05282, + "grad_norm": 0.47424037246159206, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 5282 + }, + { + "epoch": 0.05283, + "grad_norm": 0.3930494949754586, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 5283 + }, + { + "epoch": 0.05284, + "grad_norm": 0.3934936059529288, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 5284 + }, + { + "epoch": 0.05285, + "grad_norm": 0.4015179406110254, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 5285 + }, + { + "epoch": 0.05286, + "grad_norm": 0.4480587464365292, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 5286 + }, + { + "epoch": 0.05287, + "grad_norm": 0.5070013234550016, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 5287 + }, + { + "epoch": 0.05288, + "grad_norm": 0.4813637426002363, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 5288 + }, + { + "epoch": 0.05289, + "grad_norm": 0.4573299309440364, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 5289 + }, + { + "epoch": 0.0529, + "grad_norm": 0.4657267342352843, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 5290 + }, + { + "epoch": 0.05291, + "grad_norm": 0.44753235119040924, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 5291 + }, + { + "epoch": 0.05292, + "grad_norm": 0.46480492942670476, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 5292 + }, + { + "epoch": 0.05293, + "grad_norm": 0.45730668556713944, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 5293 + }, + { + "epoch": 0.05294, + "grad_norm": 0.4237029704444932, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 5294 + }, + { + "epoch": 0.05295, + "grad_norm": 0.4482914371249565, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 5295 + }, + { + "epoch": 0.05296, + "grad_norm": 0.5493021494866288, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 5296 + }, + { + "epoch": 0.05297, + "grad_norm": 0.8031816093672703, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 5297 + }, + { + "epoch": 0.05298, + "grad_norm": 1.0196685307893005, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 5298 + }, + { + "epoch": 0.05299, + "grad_norm": 0.9550231337660582, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 5299 + }, + { + "epoch": 0.053, + "grad_norm": 0.8307857387808788, + "learning_rate": 0.003, + "loss": 4.1466, + "step": 5300 + }, + { + "epoch": 0.05301, + "grad_norm": 0.7866950778572048, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 5301 + }, + { + "epoch": 0.05302, + "grad_norm": 0.6761619636450853, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 5302 + }, + { + "epoch": 0.05303, + "grad_norm": 0.665701232744064, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 5303 + }, + { + "epoch": 0.05304, + "grad_norm": 0.6461488983224262, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5304 + }, + { + "epoch": 0.05305, + "grad_norm": 0.5665397238554841, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 5305 + }, + { + "epoch": 0.05306, + "grad_norm": 0.5309591176214379, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 5306 + }, + { + "epoch": 0.05307, + "grad_norm": 0.527472903799991, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 5307 + }, + { + "epoch": 0.05308, + "grad_norm": 0.44640273254720836, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 5308 + }, + { + "epoch": 0.05309, + "grad_norm": 0.4752479060075969, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 5309 + }, + { + "epoch": 0.0531, + "grad_norm": 0.4497310787484318, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 5310 + }, + { + "epoch": 0.05311, + "grad_norm": 0.47563156496846587, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 5311 + }, + { + "epoch": 0.05312, + "grad_norm": 0.508319969010842, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 5312 + }, + { + "epoch": 0.05313, + "grad_norm": 0.5258317632514994, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 5313 + }, + { + "epoch": 0.05314, + "grad_norm": 0.5419446505340356, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 5314 + }, + { + "epoch": 0.05315, + "grad_norm": 0.5617206811761004, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 5315 + }, + { + "epoch": 0.05316, + "grad_norm": 0.6150928093152392, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 5316 + }, + { + "epoch": 0.05317, + "grad_norm": 0.6470618227813498, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 5317 + }, + { + "epoch": 0.05318, + "grad_norm": 0.6497630105833105, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 5318 + }, + { + "epoch": 0.05319, + "grad_norm": 0.6589138797229622, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 5319 + }, + { + "epoch": 0.0532, + "grad_norm": 0.7898491657801758, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 5320 + }, + { + "epoch": 0.05321, + "grad_norm": 0.8933026805238407, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 5321 + }, + { + "epoch": 0.05322, + "grad_norm": 1.0398114885738736, + "learning_rate": 0.003, + "loss": 4.1639, + "step": 5322 + }, + { + "epoch": 0.05323, + "grad_norm": 0.9043069047668215, + "learning_rate": 0.003, + "loss": 4.192, + "step": 5323 + }, + { + "epoch": 0.05324, + "grad_norm": 0.7778016743549974, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 5324 + }, + { + "epoch": 0.05325, + "grad_norm": 0.6980624923718645, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 5325 + }, + { + "epoch": 0.05326, + "grad_norm": 0.6326121552557883, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 5326 + }, + { + "epoch": 0.05327, + "grad_norm": 0.5930605167320887, + "learning_rate": 0.003, + "loss": 4.1424, + "step": 5327 + }, + { + "epoch": 0.05328, + "grad_norm": 0.5642732817761132, + "learning_rate": 0.003, + "loss": 4.1462, + "step": 5328 + }, + { + "epoch": 0.05329, + "grad_norm": 0.6305127051813995, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 5329 + }, + { + "epoch": 0.0533, + "grad_norm": 0.7052704472572536, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 5330 + }, + { + "epoch": 0.05331, + "grad_norm": 0.7552985640042235, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 5331 + }, + { + "epoch": 0.05332, + "grad_norm": 0.6810355156436282, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 5332 + }, + { + "epoch": 0.05333, + "grad_norm": 0.6053097534106456, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 5333 + }, + { + "epoch": 0.05334, + "grad_norm": 0.8017924702361167, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 5334 + }, + { + "epoch": 0.05335, + "grad_norm": 0.8242316752152679, + "learning_rate": 0.003, + "loss": 4.1559, + "step": 5335 + }, + { + "epoch": 0.05336, + "grad_norm": 0.6320025894823955, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 5336 + }, + { + "epoch": 0.05337, + "grad_norm": 0.5224549291900125, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 5337 + }, + { + "epoch": 0.05338, + "grad_norm": 0.5296562283032598, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 5338 + }, + { + "epoch": 0.05339, + "grad_norm": 0.5274630915708725, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 5339 + }, + { + "epoch": 0.0534, + "grad_norm": 0.5537136834473111, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 5340 + }, + { + "epoch": 0.05341, + "grad_norm": 0.5459467303012407, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 5341 + }, + { + "epoch": 0.05342, + "grad_norm": 0.5994941640281616, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 5342 + }, + { + "epoch": 0.05343, + "grad_norm": 0.6071352996442535, + "learning_rate": 0.003, + "loss": 4.137, + "step": 5343 + }, + { + "epoch": 0.05344, + "grad_norm": 0.5345329157634405, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 5344 + }, + { + "epoch": 0.05345, + "grad_norm": 0.5116910157465104, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 5345 + }, + { + "epoch": 0.05346, + "grad_norm": 0.46468708763144895, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 5346 + }, + { + "epoch": 0.05347, + "grad_norm": 0.4409453577269599, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 5347 + }, + { + "epoch": 0.05348, + "grad_norm": 0.4414471535354094, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 5348 + }, + { + "epoch": 0.05349, + "grad_norm": 0.4595368190725746, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5349 + }, + { + "epoch": 0.0535, + "grad_norm": 0.4867635564262319, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 5350 + }, + { + "epoch": 0.05351, + "grad_norm": 0.5386138190961993, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 5351 + }, + { + "epoch": 0.05352, + "grad_norm": 0.6349645099056564, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 5352 + }, + { + "epoch": 0.05353, + "grad_norm": 0.7163335707083324, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 5353 + }, + { + "epoch": 0.05354, + "grad_norm": 0.7284832017243276, + "learning_rate": 0.003, + "loss": 4.111, + "step": 5354 + }, + { + "epoch": 0.05355, + "grad_norm": 0.6051924935759244, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 5355 + }, + { + "epoch": 0.05356, + "grad_norm": 0.524816428418513, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 5356 + }, + { + "epoch": 0.05357, + "grad_norm": 0.5816829726245226, + "learning_rate": 0.003, + "loss": 4.125, + "step": 5357 + }, + { + "epoch": 0.05358, + "grad_norm": 0.5634033510447515, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 5358 + }, + { + "epoch": 0.05359, + "grad_norm": 0.6119701681573797, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 5359 + }, + { + "epoch": 0.0536, + "grad_norm": 0.7787011760550901, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 5360 + }, + { + "epoch": 0.05361, + "grad_norm": 0.8465182748674693, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 5361 + }, + { + "epoch": 0.05362, + "grad_norm": 0.8841512568835068, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 5362 + }, + { + "epoch": 0.05363, + "grad_norm": 0.7654054247828137, + "learning_rate": 0.003, + "loss": 4.1618, + "step": 5363 + }, + { + "epoch": 0.05364, + "grad_norm": 0.6371159121774179, + "learning_rate": 0.003, + "loss": 4.1499, + "step": 5364 + }, + { + "epoch": 0.05365, + "grad_norm": 0.6825354254162556, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 5365 + }, + { + "epoch": 0.05366, + "grad_norm": 0.8101026860948977, + "learning_rate": 0.003, + "loss": 4.1341, + "step": 5366 + }, + { + "epoch": 0.05367, + "grad_norm": 0.8291532467226016, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 5367 + }, + { + "epoch": 0.05368, + "grad_norm": 0.8056447950741361, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 5368 + }, + { + "epoch": 0.05369, + "grad_norm": 0.8735378267002016, + "learning_rate": 0.003, + "loss": 4.1428, + "step": 5369 + }, + { + "epoch": 0.0537, + "grad_norm": 0.9391702508573958, + "learning_rate": 0.003, + "loss": 4.15, + "step": 5370 + }, + { + "epoch": 0.05371, + "grad_norm": 0.8524226126886991, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 5371 + }, + { + "epoch": 0.05372, + "grad_norm": 0.803705266443803, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 5372 + }, + { + "epoch": 0.05373, + "grad_norm": 0.7560843151563093, + "learning_rate": 0.003, + "loss": 4.1657, + "step": 5373 + }, + { + "epoch": 0.05374, + "grad_norm": 0.6930845720174758, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 5374 + }, + { + "epoch": 0.05375, + "grad_norm": 0.6546702545023289, + "learning_rate": 0.003, + "loss": 4.133, + "step": 5375 + }, + { + "epoch": 0.05376, + "grad_norm": 0.6000524634204305, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 5376 + }, + { + "epoch": 0.05377, + "grad_norm": 0.5834504831622397, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 5377 + }, + { + "epoch": 0.05378, + "grad_norm": 0.49712005322114705, + "learning_rate": 0.003, + "loss": 4.1435, + "step": 5378 + }, + { + "epoch": 0.05379, + "grad_norm": 0.45036859975016763, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 5379 + }, + { + "epoch": 0.0538, + "grad_norm": 0.43310401465589693, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 5380 + }, + { + "epoch": 0.05381, + "grad_norm": 0.43118280545438875, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5381 + }, + { + "epoch": 0.05382, + "grad_norm": 0.4147511601446657, + "learning_rate": 0.003, + "loss": 4.1407, + "step": 5382 + }, + { + "epoch": 0.05383, + "grad_norm": 0.41428955644326865, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 5383 + }, + { + "epoch": 0.05384, + "grad_norm": 0.46974708785278296, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 5384 + }, + { + "epoch": 0.05385, + "grad_norm": 0.4804439317498876, + "learning_rate": 0.003, + "loss": 4.143, + "step": 5385 + }, + { + "epoch": 0.05386, + "grad_norm": 0.4710597747403432, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 5386 + }, + { + "epoch": 0.05387, + "grad_norm": 0.5345471173023906, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 5387 + }, + { + "epoch": 0.05388, + "grad_norm": 0.527734554101038, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 5388 + }, + { + "epoch": 0.05389, + "grad_norm": 0.5157286475163666, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 5389 + }, + { + "epoch": 0.0539, + "grad_norm": 0.5289054154287074, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 5390 + }, + { + "epoch": 0.05391, + "grad_norm": 0.6349600394290038, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 5391 + }, + { + "epoch": 0.05392, + "grad_norm": 0.7287707209258841, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5392 + }, + { + "epoch": 0.05393, + "grad_norm": 0.7525205292581727, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 5393 + }, + { + "epoch": 0.05394, + "grad_norm": 0.6536130075343365, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 5394 + }, + { + "epoch": 0.05395, + "grad_norm": 0.6426311365984038, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 5395 + }, + { + "epoch": 0.05396, + "grad_norm": 0.7277902979564174, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 5396 + }, + { + "epoch": 0.05397, + "grad_norm": 0.7417685288107195, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 5397 + }, + { + "epoch": 0.05398, + "grad_norm": 0.7086944915328566, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 5398 + }, + { + "epoch": 0.05399, + "grad_norm": 0.714762262058065, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 5399 + }, + { + "epoch": 0.054, + "grad_norm": 0.6803456287574434, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 5400 + }, + { + "epoch": 0.05401, + "grad_norm": 0.7115679618160242, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 5401 + }, + { + "epoch": 0.05402, + "grad_norm": 0.6178118046647656, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 5402 + }, + { + "epoch": 0.05403, + "grad_norm": 0.717834549171867, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 5403 + }, + { + "epoch": 0.05404, + "grad_norm": 0.7093324185375305, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 5404 + }, + { + "epoch": 0.05405, + "grad_norm": 0.6425829471366494, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 5405 + }, + { + "epoch": 0.05406, + "grad_norm": 0.6272170060637055, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 5406 + }, + { + "epoch": 0.05407, + "grad_norm": 0.599920594553199, + "learning_rate": 0.003, + "loss": 4.123, + "step": 5407 + }, + { + "epoch": 0.05408, + "grad_norm": 0.5820564980718425, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 5408 + }, + { + "epoch": 0.05409, + "grad_norm": 0.5111686298731917, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 5409 + }, + { + "epoch": 0.0541, + "grad_norm": 0.462193558686699, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 5410 + }, + { + "epoch": 0.05411, + "grad_norm": 0.4130402153846359, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 5411 + }, + { + "epoch": 0.05412, + "grad_norm": 0.4056983953065427, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 5412 + }, + { + "epoch": 0.05413, + "grad_norm": 0.37874815302403264, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 5413 + }, + { + "epoch": 0.05414, + "grad_norm": 0.44844074230106556, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 5414 + }, + { + "epoch": 0.05415, + "grad_norm": 0.5721190859806238, + "learning_rate": 0.003, + "loss": 4.13, + "step": 5415 + }, + { + "epoch": 0.05416, + "grad_norm": 0.6739294583356293, + "learning_rate": 0.003, + "loss": 4.091, + "step": 5416 + }, + { + "epoch": 0.05417, + "grad_norm": 0.6903098191439943, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 5417 + }, + { + "epoch": 0.05418, + "grad_norm": 0.63417118564695, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 5418 + }, + { + "epoch": 0.05419, + "grad_norm": 0.7191816724039328, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 5419 + }, + { + "epoch": 0.0542, + "grad_norm": 0.8004777313589135, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 5420 + }, + { + "epoch": 0.05421, + "grad_norm": 0.896985525441762, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 5421 + }, + { + "epoch": 0.05422, + "grad_norm": 0.7798643056239566, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 5422 + }, + { + "epoch": 0.05423, + "grad_norm": 0.7070552144440728, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 5423 + }, + { + "epoch": 0.05424, + "grad_norm": 0.5579018401845559, + "learning_rate": 0.003, + "loss": 4.1542, + "step": 5424 + }, + { + "epoch": 0.05425, + "grad_norm": 0.5286925848297326, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 5425 + }, + { + "epoch": 0.05426, + "grad_norm": 0.5912337560467409, + "learning_rate": 0.003, + "loss": 4.112, + "step": 5426 + }, + { + "epoch": 0.05427, + "grad_norm": 0.5997777842806946, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 5427 + }, + { + "epoch": 0.05428, + "grad_norm": 0.6476094192756237, + "learning_rate": 0.003, + "loss": 4.131, + "step": 5428 + }, + { + "epoch": 0.05429, + "grad_norm": 0.6086875232467213, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 5429 + }, + { + "epoch": 0.0543, + "grad_norm": 0.681356494329053, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 5430 + }, + { + "epoch": 0.05431, + "grad_norm": 0.6615347259505286, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 5431 + }, + { + "epoch": 0.05432, + "grad_norm": 0.6347016018100827, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 5432 + }, + { + "epoch": 0.05433, + "grad_norm": 0.6345480892474629, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 5433 + }, + { + "epoch": 0.05434, + "grad_norm": 0.7111690564402451, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 5434 + }, + { + "epoch": 0.05435, + "grad_norm": 0.7921322712349017, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 5435 + }, + { + "epoch": 0.05436, + "grad_norm": 0.7564267955476854, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 5436 + }, + { + "epoch": 0.05437, + "grad_norm": 0.6706007148323503, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 5437 + }, + { + "epoch": 0.05438, + "grad_norm": 0.6450156634525958, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 5438 + }, + { + "epoch": 0.05439, + "grad_norm": 0.7387880792670661, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 5439 + }, + { + "epoch": 0.0544, + "grad_norm": 0.8351260726939442, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 5440 + }, + { + "epoch": 0.05441, + "grad_norm": 0.8490525937722112, + "learning_rate": 0.003, + "loss": 4.1633, + "step": 5441 + }, + { + "epoch": 0.05442, + "grad_norm": 0.674030590021176, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 5442 + }, + { + "epoch": 0.05443, + "grad_norm": 0.6505791851985235, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 5443 + }, + { + "epoch": 0.05444, + "grad_norm": 0.6106003708383616, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 5444 + }, + { + "epoch": 0.05445, + "grad_norm": 0.5632553032978198, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 5445 + }, + { + "epoch": 0.05446, + "grad_norm": 0.578137023972088, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 5446 + }, + { + "epoch": 0.05447, + "grad_norm": 0.6233499651264109, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 5447 + }, + { + "epoch": 0.05448, + "grad_norm": 0.7025863451489226, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 5448 + }, + { + "epoch": 0.05449, + "grad_norm": 0.6674289251362188, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 5449 + }, + { + "epoch": 0.0545, + "grad_norm": 0.68187200764662, + "learning_rate": 0.003, + "loss": 4.141, + "step": 5450 + }, + { + "epoch": 0.05451, + "grad_norm": 0.9009771762961888, + "learning_rate": 0.003, + "loss": 4.1742, + "step": 5451 + }, + { + "epoch": 0.05452, + "grad_norm": 0.9180522733634838, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 5452 + }, + { + "epoch": 0.05453, + "grad_norm": 0.8476324692508597, + "learning_rate": 0.003, + "loss": 4.1392, + "step": 5453 + }, + { + "epoch": 0.05454, + "grad_norm": 0.7252925157254209, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 5454 + }, + { + "epoch": 0.05455, + "grad_norm": 0.7663075676162417, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 5455 + }, + { + "epoch": 0.05456, + "grad_norm": 0.8409176235376956, + "learning_rate": 0.003, + "loss": 4.1417, + "step": 5456 + }, + { + "epoch": 0.05457, + "grad_norm": 0.8283166405126581, + "learning_rate": 0.003, + "loss": 4.1679, + "step": 5457 + }, + { + "epoch": 0.05458, + "grad_norm": 0.7230997777624161, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 5458 + }, + { + "epoch": 0.05459, + "grad_norm": 0.6649973615570582, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 5459 + }, + { + "epoch": 0.0546, + "grad_norm": 0.528365614611949, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 5460 + }, + { + "epoch": 0.05461, + "grad_norm": 0.5520584167184678, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 5461 + }, + { + "epoch": 0.05462, + "grad_norm": 0.5391041370594502, + "learning_rate": 0.003, + "loss": 4.1598, + "step": 5462 + }, + { + "epoch": 0.05463, + "grad_norm": 0.5846088271091963, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 5463 + }, + { + "epoch": 0.05464, + "grad_norm": 0.7597079518484714, + "learning_rate": 0.003, + "loss": 4.1623, + "step": 5464 + }, + { + "epoch": 0.05465, + "grad_norm": 0.8362666805772095, + "learning_rate": 0.003, + "loss": 4.163, + "step": 5465 + }, + { + "epoch": 0.05466, + "grad_norm": 0.8383727125474473, + "learning_rate": 0.003, + "loss": 4.1732, + "step": 5466 + }, + { + "epoch": 0.05467, + "grad_norm": 0.8306791527668079, + "learning_rate": 0.003, + "loss": 4.156, + "step": 5467 + }, + { + "epoch": 0.05468, + "grad_norm": 0.6902496457265671, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 5468 + }, + { + "epoch": 0.05469, + "grad_norm": 0.6940477329883477, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 5469 + }, + { + "epoch": 0.0547, + "grad_norm": 0.6500396166893858, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 5470 + }, + { + "epoch": 0.05471, + "grad_norm": 0.5274542761046104, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5471 + }, + { + "epoch": 0.05472, + "grad_norm": 0.5294637164309897, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 5472 + }, + { + "epoch": 0.05473, + "grad_norm": 0.5195706474802417, + "learning_rate": 0.003, + "loss": 4.1386, + "step": 5473 + }, + { + "epoch": 0.05474, + "grad_norm": 0.5283189608781912, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 5474 + }, + { + "epoch": 0.05475, + "grad_norm": 0.5025729117153097, + "learning_rate": 0.003, + "loss": 4.169, + "step": 5475 + }, + { + "epoch": 0.05476, + "grad_norm": 0.4403020126149031, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 5476 + }, + { + "epoch": 0.05477, + "grad_norm": 0.4121449611240764, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 5477 + }, + { + "epoch": 0.05478, + "grad_norm": 0.39409300706271555, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 5478 + }, + { + "epoch": 0.05479, + "grad_norm": 0.4508711810401754, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 5479 + }, + { + "epoch": 0.0548, + "grad_norm": 0.5644355789014077, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 5480 + }, + { + "epoch": 0.05481, + "grad_norm": 0.6849613721636778, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 5481 + }, + { + "epoch": 0.05482, + "grad_norm": 0.7328991559368986, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 5482 + }, + { + "epoch": 0.05483, + "grad_norm": 0.6360361089895654, + "learning_rate": 0.003, + "loss": 4.114, + "step": 5483 + }, + { + "epoch": 0.05484, + "grad_norm": 0.5276598733333204, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 5484 + }, + { + "epoch": 0.05485, + "grad_norm": 0.43134255974381286, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 5485 + }, + { + "epoch": 0.05486, + "grad_norm": 0.49995198292073406, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5486 + }, + { + "epoch": 0.05487, + "grad_norm": 0.5218900481549134, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 5487 + }, + { + "epoch": 0.05488, + "grad_norm": 0.6346419704768723, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5488 + }, + { + "epoch": 0.05489, + "grad_norm": 0.7535739958723238, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 5489 + }, + { + "epoch": 0.0549, + "grad_norm": 0.7585637473534005, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 5490 + }, + { + "epoch": 0.05491, + "grad_norm": 0.6170687734506143, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 5491 + }, + { + "epoch": 0.05492, + "grad_norm": 0.5792989631670724, + "learning_rate": 0.003, + "loss": 4.141, + "step": 5492 + }, + { + "epoch": 0.05493, + "grad_norm": 0.5665640509165365, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 5493 + }, + { + "epoch": 0.05494, + "grad_norm": 0.6724865699209109, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 5494 + }, + { + "epoch": 0.05495, + "grad_norm": 0.6552527317872832, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 5495 + }, + { + "epoch": 0.05496, + "grad_norm": 0.5714103826811914, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 5496 + }, + { + "epoch": 0.05497, + "grad_norm": 0.5522465075739208, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 5497 + }, + { + "epoch": 0.05498, + "grad_norm": 0.5012005070964024, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 5498 + }, + { + "epoch": 0.05499, + "grad_norm": 0.4511204649500133, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 5499 + }, + { + "epoch": 0.055, + "grad_norm": 0.5056201476680349, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 5500 + }, + { + "epoch": 0.05501, + "grad_norm": 0.5484274718087955, + "learning_rate": 0.003, + "loss": 4.113, + "step": 5501 + }, + { + "epoch": 0.05502, + "grad_norm": 0.5221083178277119, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 5502 + }, + { + "epoch": 0.05503, + "grad_norm": 0.54151812556379, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 5503 + }, + { + "epoch": 0.05504, + "grad_norm": 0.5431761618729547, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 5504 + }, + { + "epoch": 0.05505, + "grad_norm": 0.677049395087336, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 5505 + }, + { + "epoch": 0.05506, + "grad_norm": 0.9158252562242566, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 5506 + }, + { + "epoch": 0.05507, + "grad_norm": 1.1004119550566662, + "learning_rate": 0.003, + "loss": 4.1704, + "step": 5507 + }, + { + "epoch": 0.05508, + "grad_norm": 0.7690899803645341, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 5508 + }, + { + "epoch": 0.05509, + "grad_norm": 0.7409889310481907, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 5509 + }, + { + "epoch": 0.0551, + "grad_norm": 0.8413828044451898, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 5510 + }, + { + "epoch": 0.05511, + "grad_norm": 0.974031304462407, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 5511 + }, + { + "epoch": 0.05512, + "grad_norm": 0.9261519351358342, + "learning_rate": 0.003, + "loss": 4.1574, + "step": 5512 + }, + { + "epoch": 0.05513, + "grad_norm": 0.7536753305180286, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 5513 + }, + { + "epoch": 0.05514, + "grad_norm": 0.682629393213208, + "learning_rate": 0.003, + "loss": 4.1465, + "step": 5514 + }, + { + "epoch": 0.05515, + "grad_norm": 0.6438639256486987, + "learning_rate": 0.003, + "loss": 4.124, + "step": 5515 + }, + { + "epoch": 0.05516, + "grad_norm": 0.6370628235963453, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 5516 + }, + { + "epoch": 0.05517, + "grad_norm": 0.6380637575916892, + "learning_rate": 0.003, + "loss": 4.117, + "step": 5517 + }, + { + "epoch": 0.05518, + "grad_norm": 0.6722144275625301, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 5518 + }, + { + "epoch": 0.05519, + "grad_norm": 0.6613958013501796, + "learning_rate": 0.003, + "loss": 4.1658, + "step": 5519 + }, + { + "epoch": 0.0552, + "grad_norm": 0.5988204576269331, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 5520 + }, + { + "epoch": 0.05521, + "grad_norm": 0.6572663799653967, + "learning_rate": 0.003, + "loss": 4.1529, + "step": 5521 + }, + { + "epoch": 0.05522, + "grad_norm": 0.6243740084612656, + "learning_rate": 0.003, + "loss": 4.1628, + "step": 5522 + }, + { + "epoch": 0.05523, + "grad_norm": 0.561465995263533, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 5523 + }, + { + "epoch": 0.05524, + "grad_norm": 0.5486658522043749, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 5524 + }, + { + "epoch": 0.05525, + "grad_norm": 0.5695350014121137, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5525 + }, + { + "epoch": 0.05526, + "grad_norm": 0.5346598523218599, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 5526 + }, + { + "epoch": 0.05527, + "grad_norm": 0.4551658263055205, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 5527 + }, + { + "epoch": 0.05528, + "grad_norm": 0.4843464050850517, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 5528 + }, + { + "epoch": 0.05529, + "grad_norm": 0.585373677911223, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 5529 + }, + { + "epoch": 0.0553, + "grad_norm": 0.6825180764380449, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 5530 + }, + { + "epoch": 0.05531, + "grad_norm": 0.8045417801010512, + "learning_rate": 0.003, + "loss": 4.1428, + "step": 5531 + }, + { + "epoch": 0.05532, + "grad_norm": 0.7195866881344775, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 5532 + }, + { + "epoch": 0.05533, + "grad_norm": 0.6162799867320136, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 5533 + }, + { + "epoch": 0.05534, + "grad_norm": 0.7904779000302894, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 5534 + }, + { + "epoch": 0.05535, + "grad_norm": 0.8644528986049678, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 5535 + }, + { + "epoch": 0.05536, + "grad_norm": 0.7766441044677856, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 5536 + }, + { + "epoch": 0.05537, + "grad_norm": 0.7239299173431871, + "learning_rate": 0.003, + "loss": 4.1377, + "step": 5537 + }, + { + "epoch": 0.05538, + "grad_norm": 0.7001732822471084, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 5538 + }, + { + "epoch": 0.05539, + "grad_norm": 0.6623943623113974, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5539 + }, + { + "epoch": 0.0554, + "grad_norm": 0.6471550442576808, + "learning_rate": 0.003, + "loss": 4.1748, + "step": 5540 + }, + { + "epoch": 0.05541, + "grad_norm": 0.5868454097526585, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 5541 + }, + { + "epoch": 0.05542, + "grad_norm": 0.5047287366024358, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 5542 + }, + { + "epoch": 0.05543, + "grad_norm": 0.47745524075484136, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 5543 + }, + { + "epoch": 0.05544, + "grad_norm": 0.4985694332258811, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 5544 + }, + { + "epoch": 0.05545, + "grad_norm": 0.5073634801765811, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 5545 + }, + { + "epoch": 0.05546, + "grad_norm": 0.5237047792569028, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 5546 + }, + { + "epoch": 0.05547, + "grad_norm": 0.5914205238128549, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 5547 + }, + { + "epoch": 0.05548, + "grad_norm": 0.6165645927936478, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 5548 + }, + { + "epoch": 0.05549, + "grad_norm": 0.5898830700152652, + "learning_rate": 0.003, + "loss": 4.143, + "step": 5549 + }, + { + "epoch": 0.0555, + "grad_norm": 0.45665075102292346, + "learning_rate": 0.003, + "loss": 4.115, + "step": 5550 + }, + { + "epoch": 0.05551, + "grad_norm": 0.41856537923447845, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 5551 + }, + { + "epoch": 0.05552, + "grad_norm": 0.4227890713436711, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 5552 + }, + { + "epoch": 0.05553, + "grad_norm": 0.40800302608685, + "learning_rate": 0.003, + "loss": 4.1221, + "step": 5553 + }, + { + "epoch": 0.05554, + "grad_norm": 0.4343242066304794, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 5554 + }, + { + "epoch": 0.05555, + "grad_norm": 0.49222919452357977, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 5555 + }, + { + "epoch": 0.05556, + "grad_norm": 0.5727061854119558, + "learning_rate": 0.003, + "loss": 4.102, + "step": 5556 + }, + { + "epoch": 0.05557, + "grad_norm": 0.6430410979228637, + "learning_rate": 0.003, + "loss": 4.1312, + "step": 5557 + }, + { + "epoch": 0.05558, + "grad_norm": 0.7465449755259148, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 5558 + }, + { + "epoch": 0.05559, + "grad_norm": 0.8571034449603531, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 5559 + }, + { + "epoch": 0.0556, + "grad_norm": 0.9603542514944866, + "learning_rate": 0.003, + "loss": 4.1606, + "step": 5560 + }, + { + "epoch": 0.05561, + "grad_norm": 1.1384407616016659, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 5561 + }, + { + "epoch": 0.05562, + "grad_norm": 0.8542948215220858, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 5562 + }, + { + "epoch": 0.05563, + "grad_norm": 0.6440539340616278, + "learning_rate": 0.003, + "loss": 4.1376, + "step": 5563 + }, + { + "epoch": 0.05564, + "grad_norm": 0.5983189844345748, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5564 + }, + { + "epoch": 0.05565, + "grad_norm": 0.6928412710848559, + "learning_rate": 0.003, + "loss": 4.1628, + "step": 5565 + }, + { + "epoch": 0.05566, + "grad_norm": 0.7458788234271931, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 5566 + }, + { + "epoch": 0.05567, + "grad_norm": 0.6975211809831088, + "learning_rate": 0.003, + "loss": 4.1221, + "step": 5567 + }, + { + "epoch": 0.05568, + "grad_norm": 0.7797367230472155, + "learning_rate": 0.003, + "loss": 4.1575, + "step": 5568 + }, + { + "epoch": 0.05569, + "grad_norm": 0.8587086222029046, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 5569 + }, + { + "epoch": 0.0557, + "grad_norm": 0.842041289187777, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 5570 + }, + { + "epoch": 0.05571, + "grad_norm": 0.7876741380103197, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 5571 + }, + { + "epoch": 0.05572, + "grad_norm": 0.6444705359029274, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 5572 + }, + { + "epoch": 0.05573, + "grad_norm": 0.549050192063969, + "learning_rate": 0.003, + "loss": 4.1563, + "step": 5573 + }, + { + "epoch": 0.05574, + "grad_norm": 0.5333806190349942, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 5574 + }, + { + "epoch": 0.05575, + "grad_norm": 0.5225404420588966, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 5575 + }, + { + "epoch": 0.05576, + "grad_norm": 0.5429894944172927, + "learning_rate": 0.003, + "loss": 4.098, + "step": 5576 + }, + { + "epoch": 0.05577, + "grad_norm": 0.5490427675347196, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 5577 + }, + { + "epoch": 0.05578, + "grad_norm": 0.6083016266092716, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 5578 + }, + { + "epoch": 0.05579, + "grad_norm": 0.6186932276076856, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 5579 + }, + { + "epoch": 0.0558, + "grad_norm": 0.5780512131591367, + "learning_rate": 0.003, + "loss": 4.129, + "step": 5580 + }, + { + "epoch": 0.05581, + "grad_norm": 0.5980239436138541, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 5581 + }, + { + "epoch": 0.05582, + "grad_norm": 0.5392719008352066, + "learning_rate": 0.003, + "loss": 4.1521, + "step": 5582 + }, + { + "epoch": 0.05583, + "grad_norm": 0.5546273095860996, + "learning_rate": 0.003, + "loss": 4.125, + "step": 5583 + }, + { + "epoch": 0.05584, + "grad_norm": 0.6705838265732573, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 5584 + }, + { + "epoch": 0.05585, + "grad_norm": 0.8217396804436106, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 5585 + }, + { + "epoch": 0.05586, + "grad_norm": 0.9042169206281825, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 5586 + }, + { + "epoch": 0.05587, + "grad_norm": 0.9227619224934434, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 5587 + }, + { + "epoch": 0.05588, + "grad_norm": 0.7777466990352064, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5588 + }, + { + "epoch": 0.05589, + "grad_norm": 0.6603780366654983, + "learning_rate": 0.003, + "loss": 4.1357, + "step": 5589 + }, + { + "epoch": 0.0559, + "grad_norm": 0.6528519374382725, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 5590 + }, + { + "epoch": 0.05591, + "grad_norm": 0.6471996561956648, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 5591 + }, + { + "epoch": 0.05592, + "grad_norm": 0.6641363904608714, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 5592 + }, + { + "epoch": 0.05593, + "grad_norm": 0.6321306805343749, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 5593 + }, + { + "epoch": 0.05594, + "grad_norm": 0.5652755839521619, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 5594 + }, + { + "epoch": 0.05595, + "grad_norm": 0.5052820748237068, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 5595 + }, + { + "epoch": 0.05596, + "grad_norm": 0.47170632908810217, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 5596 + }, + { + "epoch": 0.05597, + "grad_norm": 0.4937678505983004, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 5597 + }, + { + "epoch": 0.05598, + "grad_norm": 0.5635649117022338, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 5598 + }, + { + "epoch": 0.05599, + "grad_norm": 0.6174104628038847, + "learning_rate": 0.003, + "loss": 4.1489, + "step": 5599 + }, + { + "epoch": 0.056, + "grad_norm": 0.6539159328500083, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 5600 + }, + { + "epoch": 0.05601, + "grad_norm": 0.6679699951089212, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 5601 + }, + { + "epoch": 0.05602, + "grad_norm": 0.6546413158566504, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 5602 + }, + { + "epoch": 0.05603, + "grad_norm": 0.6946743527150302, + "learning_rate": 0.003, + "loss": 4.115, + "step": 5603 + }, + { + "epoch": 0.05604, + "grad_norm": 0.7552162101007626, + "learning_rate": 0.003, + "loss": 4.1547, + "step": 5604 + }, + { + "epoch": 0.05605, + "grad_norm": 0.6696007064120673, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 5605 + }, + { + "epoch": 0.05606, + "grad_norm": 0.59562089846795, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 5606 + }, + { + "epoch": 0.05607, + "grad_norm": 0.6269393532345678, + "learning_rate": 0.003, + "loss": 4.1513, + "step": 5607 + }, + { + "epoch": 0.05608, + "grad_norm": 0.6816772543279809, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 5608 + }, + { + "epoch": 0.05609, + "grad_norm": 0.6707354321971271, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 5609 + }, + { + "epoch": 0.0561, + "grad_norm": 0.6411808035499755, + "learning_rate": 0.003, + "loss": 4.1566, + "step": 5610 + }, + { + "epoch": 0.05611, + "grad_norm": 0.6367039596675031, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 5611 + }, + { + "epoch": 0.05612, + "grad_norm": 0.6159883542316671, + "learning_rate": 0.003, + "loss": 4.117, + "step": 5612 + }, + { + "epoch": 0.05613, + "grad_norm": 0.5688379331389054, + "learning_rate": 0.003, + "loss": 4.1407, + "step": 5613 + }, + { + "epoch": 0.05614, + "grad_norm": 0.5623023558521582, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5614 + }, + { + "epoch": 0.05615, + "grad_norm": 0.5064939834311122, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 5615 + }, + { + "epoch": 0.05616, + "grad_norm": 0.5555368115916227, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 5616 + }, + { + "epoch": 0.05617, + "grad_norm": 0.6951229882403026, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 5617 + }, + { + "epoch": 0.05618, + "grad_norm": 0.7979399037113596, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5618 + }, + { + "epoch": 0.05619, + "grad_norm": 0.7776125407615552, + "learning_rate": 0.003, + "loss": 4.1785, + "step": 5619 + }, + { + "epoch": 0.0562, + "grad_norm": 0.6817596297069429, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 5620 + }, + { + "epoch": 0.05621, + "grad_norm": 0.6549397635610529, + "learning_rate": 0.003, + "loss": 4.139, + "step": 5621 + }, + { + "epoch": 0.05622, + "grad_norm": 0.7329313978126628, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 5622 + }, + { + "epoch": 0.05623, + "grad_norm": 0.732086571615983, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 5623 + }, + { + "epoch": 0.05624, + "grad_norm": 0.6771095843567011, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 5624 + }, + { + "epoch": 0.05625, + "grad_norm": 0.6183411314364045, + "learning_rate": 0.003, + "loss": 4.15, + "step": 5625 + }, + { + "epoch": 0.05626, + "grad_norm": 0.6570365199694389, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 5626 + }, + { + "epoch": 0.05627, + "grad_norm": 0.6639084179326356, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5627 + }, + { + "epoch": 0.05628, + "grad_norm": 0.6844026433487909, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 5628 + }, + { + "epoch": 0.05629, + "grad_norm": 0.82713889536583, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 5629 + }, + { + "epoch": 0.0563, + "grad_norm": 0.6942930662060884, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 5630 + }, + { + "epoch": 0.05631, + "grad_norm": 0.5294373134589537, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 5631 + }, + { + "epoch": 0.05632, + "grad_norm": 0.49155296329990794, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 5632 + }, + { + "epoch": 0.05633, + "grad_norm": 0.4835048319394899, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 5633 + }, + { + "epoch": 0.05634, + "grad_norm": 0.4557319202795549, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 5634 + }, + { + "epoch": 0.05635, + "grad_norm": 0.4698037555374509, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5635 + }, + { + "epoch": 0.05636, + "grad_norm": 0.5988222861693672, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 5636 + }, + { + "epoch": 0.05637, + "grad_norm": 0.6571880685992473, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 5637 + }, + { + "epoch": 0.05638, + "grad_norm": 0.589395853147304, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 5638 + }, + { + "epoch": 0.05639, + "grad_norm": 0.5433844345616452, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 5639 + }, + { + "epoch": 0.0564, + "grad_norm": 0.5170019835368442, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 5640 + }, + { + "epoch": 0.05641, + "grad_norm": 0.4891316043455271, + "learning_rate": 0.003, + "loss": 4.071, + "step": 5641 + }, + { + "epoch": 0.05642, + "grad_norm": 0.4491321561168964, + "learning_rate": 0.003, + "loss": 4.139, + "step": 5642 + }, + { + "epoch": 0.05643, + "grad_norm": 0.4830092535839702, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 5643 + }, + { + "epoch": 0.05644, + "grad_norm": 0.5232904059467851, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 5644 + }, + { + "epoch": 0.05645, + "grad_norm": 0.567111969651976, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 5645 + }, + { + "epoch": 0.05646, + "grad_norm": 0.6631370135003563, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 5646 + }, + { + "epoch": 0.05647, + "grad_norm": 0.8154565355116417, + "learning_rate": 0.003, + "loss": 4.125, + "step": 5647 + }, + { + "epoch": 0.05648, + "grad_norm": 0.7782448973936725, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 5648 + }, + { + "epoch": 0.05649, + "grad_norm": 0.6294974249907404, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 5649 + }, + { + "epoch": 0.0565, + "grad_norm": 0.658532379840979, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 5650 + }, + { + "epoch": 0.05651, + "grad_norm": 0.5881790419238977, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 5651 + }, + { + "epoch": 0.05652, + "grad_norm": 0.6487361989272737, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 5652 + }, + { + "epoch": 0.05653, + "grad_norm": 0.890297904598085, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 5653 + }, + { + "epoch": 0.05654, + "grad_norm": 1.122891362935015, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 5654 + }, + { + "epoch": 0.05655, + "grad_norm": 0.7784475530833843, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 5655 + }, + { + "epoch": 0.05656, + "grad_norm": 0.7039018333153184, + "learning_rate": 0.003, + "loss": 4.1449, + "step": 5656 + }, + { + "epoch": 0.05657, + "grad_norm": 0.7611663979262661, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 5657 + }, + { + "epoch": 0.05658, + "grad_norm": 0.6301443417394809, + "learning_rate": 0.003, + "loss": 4.128, + "step": 5658 + }, + { + "epoch": 0.05659, + "grad_norm": 0.5799574664434455, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 5659 + }, + { + "epoch": 0.0566, + "grad_norm": 0.5398797700270416, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 5660 + }, + { + "epoch": 0.05661, + "grad_norm": 0.5607006474067805, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 5661 + }, + { + "epoch": 0.05662, + "grad_norm": 0.6312047749179707, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 5662 + }, + { + "epoch": 0.05663, + "grad_norm": 0.6691547203899587, + "learning_rate": 0.003, + "loss": 4.1338, + "step": 5663 + }, + { + "epoch": 0.05664, + "grad_norm": 0.6839990390423485, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 5664 + }, + { + "epoch": 0.05665, + "grad_norm": 0.6483304454614884, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 5665 + }, + { + "epoch": 0.05666, + "grad_norm": 0.7336745114259869, + "learning_rate": 0.003, + "loss": 4.136, + "step": 5666 + }, + { + "epoch": 0.05667, + "grad_norm": 0.6601921754069777, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 5667 + }, + { + "epoch": 0.05668, + "grad_norm": 0.608008716479813, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 5668 + }, + { + "epoch": 0.05669, + "grad_norm": 0.6089029787701348, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 5669 + }, + { + "epoch": 0.0567, + "grad_norm": 0.5002563109306051, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 5670 + }, + { + "epoch": 0.05671, + "grad_norm": 0.5193596424869418, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5671 + }, + { + "epoch": 0.05672, + "grad_norm": 0.6518666148309528, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 5672 + }, + { + "epoch": 0.05673, + "grad_norm": 0.7728626458240908, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 5673 + }, + { + "epoch": 0.05674, + "grad_norm": 0.8332874518712882, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 5674 + }, + { + "epoch": 0.05675, + "grad_norm": 0.8323964040847633, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 5675 + }, + { + "epoch": 0.05676, + "grad_norm": 0.7595324774497408, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 5676 + }, + { + "epoch": 0.05677, + "grad_norm": 0.6740603533153058, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 5677 + }, + { + "epoch": 0.05678, + "grad_norm": 0.5713494747040236, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 5678 + }, + { + "epoch": 0.05679, + "grad_norm": 0.6253195899590996, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 5679 + }, + { + "epoch": 0.0568, + "grad_norm": 0.6100710085093579, + "learning_rate": 0.003, + "loss": 4.117, + "step": 5680 + }, + { + "epoch": 0.05681, + "grad_norm": 0.6830054586678898, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 5681 + }, + { + "epoch": 0.05682, + "grad_norm": 0.7108282874400138, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 5682 + }, + { + "epoch": 0.05683, + "grad_norm": 0.5883840169923861, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 5683 + }, + { + "epoch": 0.05684, + "grad_norm": 0.6507038187659987, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 5684 + }, + { + "epoch": 0.05685, + "grad_norm": 0.6756833252452829, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 5685 + }, + { + "epoch": 0.05686, + "grad_norm": 0.583645658916334, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 5686 + }, + { + "epoch": 0.05687, + "grad_norm": 0.5953213292181491, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 5687 + }, + { + "epoch": 0.05688, + "grad_norm": 0.5475876848073131, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 5688 + }, + { + "epoch": 0.05689, + "grad_norm": 0.5025790725419514, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 5689 + }, + { + "epoch": 0.0569, + "grad_norm": 0.45433112969189393, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5690 + }, + { + "epoch": 0.05691, + "grad_norm": 0.4140962723522923, + "learning_rate": 0.003, + "loss": 4.1404, + "step": 5691 + }, + { + "epoch": 0.05692, + "grad_norm": 0.4662306842775959, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 5692 + }, + { + "epoch": 0.05693, + "grad_norm": 0.5891875363708755, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 5693 + }, + { + "epoch": 0.05694, + "grad_norm": 0.8176890746618, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 5694 + }, + { + "epoch": 0.05695, + "grad_norm": 1.0722082901443937, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 5695 + }, + { + "epoch": 0.05696, + "grad_norm": 0.9384436343645547, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 5696 + }, + { + "epoch": 0.05697, + "grad_norm": 0.7353450806650836, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 5697 + }, + { + "epoch": 0.05698, + "grad_norm": 0.6318836782831939, + "learning_rate": 0.003, + "loss": 4.156, + "step": 5698 + }, + { + "epoch": 0.05699, + "grad_norm": 0.5916316530186301, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 5699 + }, + { + "epoch": 0.057, + "grad_norm": 0.6535089788186025, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 5700 + }, + { + "epoch": 0.05701, + "grad_norm": 0.6546023694581979, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 5701 + }, + { + "epoch": 0.05702, + "grad_norm": 0.5084570882781324, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5702 + }, + { + "epoch": 0.05703, + "grad_norm": 0.5133248015349259, + "learning_rate": 0.003, + "loss": 4.1582, + "step": 5703 + }, + { + "epoch": 0.05704, + "grad_norm": 0.5358288934883745, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 5704 + }, + { + "epoch": 0.05705, + "grad_norm": 0.5900408511498093, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 5705 + }, + { + "epoch": 0.05706, + "grad_norm": 0.6627882566353759, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 5706 + }, + { + "epoch": 0.05707, + "grad_norm": 0.7192441694975358, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 5707 + }, + { + "epoch": 0.05708, + "grad_norm": 0.7445828354995511, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 5708 + }, + { + "epoch": 0.05709, + "grad_norm": 0.7483624354012154, + "learning_rate": 0.003, + "loss": 4.1312, + "step": 5709 + }, + { + "epoch": 0.0571, + "grad_norm": 0.725767615807694, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 5710 + }, + { + "epoch": 0.05711, + "grad_norm": 0.7397045380207842, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 5711 + }, + { + "epoch": 0.05712, + "grad_norm": 0.7189079122457795, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 5712 + }, + { + "epoch": 0.05713, + "grad_norm": 0.7060174681207148, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 5713 + }, + { + "epoch": 0.05714, + "grad_norm": 0.6921393568332758, + "learning_rate": 0.003, + "loss": 4.1728, + "step": 5714 + }, + { + "epoch": 0.05715, + "grad_norm": 0.6750383401887041, + "learning_rate": 0.003, + "loss": 4.1752, + "step": 5715 + }, + { + "epoch": 0.05716, + "grad_norm": 0.7085886694681629, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 5716 + }, + { + "epoch": 0.05717, + "grad_norm": 0.6663173016626173, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 5717 + }, + { + "epoch": 0.05718, + "grad_norm": 0.7941387320047305, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 5718 + }, + { + "epoch": 0.05719, + "grad_norm": 0.9177092410441985, + "learning_rate": 0.003, + "loss": 4.1523, + "step": 5719 + }, + { + "epoch": 0.0572, + "grad_norm": 0.8971727958952642, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 5720 + }, + { + "epoch": 0.05721, + "grad_norm": 0.926040651918975, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 5721 + }, + { + "epoch": 0.05722, + "grad_norm": 0.899008075973923, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 5722 + }, + { + "epoch": 0.05723, + "grad_norm": 0.8257100704628738, + "learning_rate": 0.003, + "loss": 4.1442, + "step": 5723 + }, + { + "epoch": 0.05724, + "grad_norm": 0.7778923288997804, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 5724 + }, + { + "epoch": 0.05725, + "grad_norm": 0.6413770657071006, + "learning_rate": 0.003, + "loss": 4.1505, + "step": 5725 + }, + { + "epoch": 0.05726, + "grad_norm": 0.6037306221880084, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 5726 + }, + { + "epoch": 0.05727, + "grad_norm": 0.5200633077018988, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 5727 + }, + { + "epoch": 0.05728, + "grad_norm": 0.5036743411797538, + "learning_rate": 0.003, + "loss": 4.15, + "step": 5728 + }, + { + "epoch": 0.05729, + "grad_norm": 0.4985411262542399, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5729 + }, + { + "epoch": 0.0573, + "grad_norm": 0.4584917863448276, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 5730 + }, + { + "epoch": 0.05731, + "grad_norm": 0.5438247899724311, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 5731 + }, + { + "epoch": 0.05732, + "grad_norm": 0.5879044540226291, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 5732 + }, + { + "epoch": 0.05733, + "grad_norm": 0.5930153226482116, + "learning_rate": 0.003, + "loss": 4.1533, + "step": 5733 + }, + { + "epoch": 0.05734, + "grad_norm": 0.551401918593653, + "learning_rate": 0.003, + "loss": 4.146, + "step": 5734 + }, + { + "epoch": 0.05735, + "grad_norm": 0.44849752852507807, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 5735 + }, + { + "epoch": 0.05736, + "grad_norm": 0.44905168397046813, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 5736 + }, + { + "epoch": 0.05737, + "grad_norm": 0.5268320731054086, + "learning_rate": 0.003, + "loss": 4.124, + "step": 5737 + }, + { + "epoch": 0.05738, + "grad_norm": 0.5127400458979028, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 5738 + }, + { + "epoch": 0.05739, + "grad_norm": 0.4584064056524342, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 5739 + }, + { + "epoch": 0.0574, + "grad_norm": 0.43315600343740956, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 5740 + }, + { + "epoch": 0.05741, + "grad_norm": 0.435296931858689, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 5741 + }, + { + "epoch": 0.05742, + "grad_norm": 0.4223546407108866, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 5742 + }, + { + "epoch": 0.05743, + "grad_norm": 0.3829337526851232, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 5743 + }, + { + "epoch": 0.05744, + "grad_norm": 0.42582439817855655, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 5744 + }, + { + "epoch": 0.05745, + "grad_norm": 0.5032675064997798, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 5745 + }, + { + "epoch": 0.05746, + "grad_norm": 0.5371812025082381, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 5746 + }, + { + "epoch": 0.05747, + "grad_norm": 0.5691353438016382, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 5747 + }, + { + "epoch": 0.05748, + "grad_norm": 0.5609944240328593, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 5748 + }, + { + "epoch": 0.05749, + "grad_norm": 0.5812063114829625, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 5749 + }, + { + "epoch": 0.0575, + "grad_norm": 0.59242400709736, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 5750 + }, + { + "epoch": 0.05751, + "grad_norm": 0.5892062367000988, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 5751 + }, + { + "epoch": 0.05752, + "grad_norm": 0.6411889904390121, + "learning_rate": 0.003, + "loss": 4.125, + "step": 5752 + }, + { + "epoch": 0.05753, + "grad_norm": 0.7623604527296114, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 5753 + }, + { + "epoch": 0.05754, + "grad_norm": 0.8190982577351572, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 5754 + }, + { + "epoch": 0.05755, + "grad_norm": 0.8129781285023648, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 5755 + }, + { + "epoch": 0.05756, + "grad_norm": 0.9645409102735437, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 5756 + }, + { + "epoch": 0.05757, + "grad_norm": 1.0613767929848879, + "learning_rate": 0.003, + "loss": 4.1515, + "step": 5757 + }, + { + "epoch": 0.05758, + "grad_norm": 0.8340493918370546, + "learning_rate": 0.003, + "loss": 4.141, + "step": 5758 + }, + { + "epoch": 0.05759, + "grad_norm": 0.7501058117672132, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 5759 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6678453188221255, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 5760 + }, + { + "epoch": 0.05761, + "grad_norm": 0.7222477111017501, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 5761 + }, + { + "epoch": 0.05762, + "grad_norm": 0.8358884638373766, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 5762 + }, + { + "epoch": 0.05763, + "grad_norm": 0.9017240989057173, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 5763 + }, + { + "epoch": 0.05764, + "grad_norm": 0.9525671514720745, + "learning_rate": 0.003, + "loss": 4.1454, + "step": 5764 + }, + { + "epoch": 0.05765, + "grad_norm": 0.903840898655556, + "learning_rate": 0.003, + "loss": 4.1594, + "step": 5765 + }, + { + "epoch": 0.05766, + "grad_norm": 0.841117793582558, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 5766 + }, + { + "epoch": 0.05767, + "grad_norm": 0.8184231215193009, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 5767 + }, + { + "epoch": 0.05768, + "grad_norm": 0.7953651176604322, + "learning_rate": 0.003, + "loss": 4.1647, + "step": 5768 + }, + { + "epoch": 0.05769, + "grad_norm": 0.8641222338040627, + "learning_rate": 0.003, + "loss": 4.1594, + "step": 5769 + }, + { + "epoch": 0.0577, + "grad_norm": 0.8267946717518797, + "learning_rate": 0.003, + "loss": 4.1629, + "step": 5770 + }, + { + "epoch": 0.05771, + "grad_norm": 0.7549731798996204, + "learning_rate": 0.003, + "loss": 4.1637, + "step": 5771 + }, + { + "epoch": 0.05772, + "grad_norm": 0.8471888716859771, + "learning_rate": 0.003, + "loss": 4.174, + "step": 5772 + }, + { + "epoch": 0.05773, + "grad_norm": 0.8004119994019282, + "learning_rate": 0.003, + "loss": 4.1698, + "step": 5773 + }, + { + "epoch": 0.05774, + "grad_norm": 0.6795800808077224, + "learning_rate": 0.003, + "loss": 4.136, + "step": 5774 + }, + { + "epoch": 0.05775, + "grad_norm": 0.6426254130546161, + "learning_rate": 0.003, + "loss": 4.14, + "step": 5775 + }, + { + "epoch": 0.05776, + "grad_norm": 0.5457159419232533, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 5776 + }, + { + "epoch": 0.05777, + "grad_norm": 0.5193032203296611, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 5777 + }, + { + "epoch": 0.05778, + "grad_norm": 0.5430365890766845, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 5778 + }, + { + "epoch": 0.05779, + "grad_norm": 0.5514106031103018, + "learning_rate": 0.003, + "loss": 4.1516, + "step": 5779 + }, + { + "epoch": 0.0578, + "grad_norm": 0.5595076136345223, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 5780 + }, + { + "epoch": 0.05781, + "grad_norm": 0.6358454172899635, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 5781 + }, + { + "epoch": 0.05782, + "grad_norm": 0.7067244036587653, + "learning_rate": 0.003, + "loss": 4.1522, + "step": 5782 + }, + { + "epoch": 0.05783, + "grad_norm": 0.6579510175517927, + "learning_rate": 0.003, + "loss": 4.094, + "step": 5783 + }, + { + "epoch": 0.05784, + "grad_norm": 0.4615349795672166, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 5784 + }, + { + "epoch": 0.05785, + "grad_norm": 0.40423330402508734, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 5785 + }, + { + "epoch": 0.05786, + "grad_norm": 0.4219551145978536, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 5786 + }, + { + "epoch": 0.05787, + "grad_norm": 0.47190796765263787, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 5787 + }, + { + "epoch": 0.05788, + "grad_norm": 0.48647627634592133, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 5788 + }, + { + "epoch": 0.05789, + "grad_norm": 0.45916564143367755, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 5789 + }, + { + "epoch": 0.0579, + "grad_norm": 0.38438157656491667, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 5790 + }, + { + "epoch": 0.05791, + "grad_norm": 0.380767444456616, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 5791 + }, + { + "epoch": 0.05792, + "grad_norm": 0.39494151370656566, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 5792 + }, + { + "epoch": 0.05793, + "grad_norm": 0.4521836450575782, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 5793 + }, + { + "epoch": 0.05794, + "grad_norm": 0.5585408775581093, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 5794 + }, + { + "epoch": 0.05795, + "grad_norm": 0.7037387927731176, + "learning_rate": 0.003, + "loss": 4.1517, + "step": 5795 + }, + { + "epoch": 0.05796, + "grad_norm": 0.7512006570045856, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 5796 + }, + { + "epoch": 0.05797, + "grad_norm": 0.6139515253192398, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5797 + }, + { + "epoch": 0.05798, + "grad_norm": 0.42741457828347207, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 5798 + }, + { + "epoch": 0.05799, + "grad_norm": 0.4697070878309648, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 5799 + }, + { + "epoch": 0.058, + "grad_norm": 0.5718953135900179, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 5800 + }, + { + "epoch": 0.05801, + "grad_norm": 0.6101112337124837, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 5801 + }, + { + "epoch": 0.05802, + "grad_norm": 0.518362856862478, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 5802 + }, + { + "epoch": 0.05803, + "grad_norm": 0.5319512529198976, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 5803 + }, + { + "epoch": 0.05804, + "grad_norm": 0.583888596545647, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 5804 + }, + { + "epoch": 0.05805, + "grad_norm": 0.6332538337272945, + "learning_rate": 0.003, + "loss": 4.086, + "step": 5805 + }, + { + "epoch": 0.05806, + "grad_norm": 0.6591146413634413, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 5806 + }, + { + "epoch": 0.05807, + "grad_norm": 0.5648403140301973, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 5807 + }, + { + "epoch": 0.05808, + "grad_norm": 0.5082305082390789, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 5808 + }, + { + "epoch": 0.05809, + "grad_norm": 0.45869354878914337, + "learning_rate": 0.003, + "loss": 4.136, + "step": 5809 + }, + { + "epoch": 0.0581, + "grad_norm": 0.5128621779633227, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 5810 + }, + { + "epoch": 0.05811, + "grad_norm": 0.4634974372718931, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 5811 + }, + { + "epoch": 0.05812, + "grad_norm": 0.5415369741433366, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 5812 + }, + { + "epoch": 0.05813, + "grad_norm": 0.6045041038302872, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 5813 + }, + { + "epoch": 0.05814, + "grad_norm": 0.781348878698672, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 5814 + }, + { + "epoch": 0.05815, + "grad_norm": 0.9049868579609852, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 5815 + }, + { + "epoch": 0.05816, + "grad_norm": 0.8496790014644907, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 5816 + }, + { + "epoch": 0.05817, + "grad_norm": 0.6848842285781443, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 5817 + }, + { + "epoch": 0.05818, + "grad_norm": 0.6972808127885899, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 5818 + }, + { + "epoch": 0.05819, + "grad_norm": 0.7928565409685948, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 5819 + }, + { + "epoch": 0.0582, + "grad_norm": 0.8531050086329389, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 5820 + }, + { + "epoch": 0.05821, + "grad_norm": 0.8181521585974338, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 5821 + }, + { + "epoch": 0.05822, + "grad_norm": 0.6685069393486631, + "learning_rate": 0.003, + "loss": 4.103, + "step": 5822 + }, + { + "epoch": 0.05823, + "grad_norm": 0.7178511889820098, + "learning_rate": 0.003, + "loss": 4.138, + "step": 5823 + }, + { + "epoch": 0.05824, + "grad_norm": 0.7316138274504331, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 5824 + }, + { + "epoch": 0.05825, + "grad_norm": 0.7577488477174734, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 5825 + }, + { + "epoch": 0.05826, + "grad_norm": 0.7352824443915027, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 5826 + }, + { + "epoch": 0.05827, + "grad_norm": 0.737466448111901, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 5827 + }, + { + "epoch": 0.05828, + "grad_norm": 0.5756572073483629, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 5828 + }, + { + "epoch": 0.05829, + "grad_norm": 0.5700830450607922, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 5829 + }, + { + "epoch": 0.0583, + "grad_norm": 0.6090643389167884, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 5830 + }, + { + "epoch": 0.05831, + "grad_norm": 0.6598337613053546, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 5831 + }, + { + "epoch": 0.05832, + "grad_norm": 0.7236800212652943, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 5832 + }, + { + "epoch": 0.05833, + "grad_norm": 0.8312662391240169, + "learning_rate": 0.003, + "loss": 4.1348, + "step": 5833 + }, + { + "epoch": 0.05834, + "grad_norm": 0.8570982085297809, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 5834 + }, + { + "epoch": 0.05835, + "grad_norm": 0.7634242055631113, + "learning_rate": 0.003, + "loss": 4.1634, + "step": 5835 + }, + { + "epoch": 0.05836, + "grad_norm": 0.8129339709531613, + "learning_rate": 0.003, + "loss": 4.1529, + "step": 5836 + }, + { + "epoch": 0.05837, + "grad_norm": 0.7114562019507951, + "learning_rate": 0.003, + "loss": 4.1424, + "step": 5837 + }, + { + "epoch": 0.05838, + "grad_norm": 0.7146446360282164, + "learning_rate": 0.003, + "loss": 4.144, + "step": 5838 + }, + { + "epoch": 0.05839, + "grad_norm": 0.6826153272700018, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 5839 + }, + { + "epoch": 0.0584, + "grad_norm": 0.6296145264005895, + "learning_rate": 0.003, + "loss": 4.1591, + "step": 5840 + }, + { + "epoch": 0.05841, + "grad_norm": 0.6400001378596765, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 5841 + }, + { + "epoch": 0.05842, + "grad_norm": 0.6745147859755771, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 5842 + }, + { + "epoch": 0.05843, + "grad_norm": 0.7204687011170773, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 5843 + }, + { + "epoch": 0.05844, + "grad_norm": 0.6692907850570987, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 5844 + }, + { + "epoch": 0.05845, + "grad_norm": 0.6464847846170348, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 5845 + }, + { + "epoch": 0.05846, + "grad_norm": 0.5903179636360006, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 5846 + }, + { + "epoch": 0.05847, + "grad_norm": 0.5715707353163612, + "learning_rate": 0.003, + "loss": 4.147, + "step": 5847 + }, + { + "epoch": 0.05848, + "grad_norm": 0.6188778057459394, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 5848 + }, + { + "epoch": 0.05849, + "grad_norm": 0.679788170958821, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 5849 + }, + { + "epoch": 0.0585, + "grad_norm": 0.6866570136156571, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 5850 + }, + { + "epoch": 0.05851, + "grad_norm": 0.6035947467507179, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 5851 + }, + { + "epoch": 0.05852, + "grad_norm": 0.6646374634051921, + "learning_rate": 0.003, + "loss": 4.114, + "step": 5852 + }, + { + "epoch": 0.05853, + "grad_norm": 0.6845982684639399, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 5853 + }, + { + "epoch": 0.05854, + "grad_norm": 0.6479252813025695, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 5854 + }, + { + "epoch": 0.05855, + "grad_norm": 0.729313342535791, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 5855 + }, + { + "epoch": 0.05856, + "grad_norm": 0.8374311171433904, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 5856 + }, + { + "epoch": 0.05857, + "grad_norm": 0.8450077468218878, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 5857 + }, + { + "epoch": 0.05858, + "grad_norm": 0.7554133376301935, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 5858 + }, + { + "epoch": 0.05859, + "grad_norm": 0.6288985709272776, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 5859 + }, + { + "epoch": 0.0586, + "grad_norm": 0.6807746349622874, + "learning_rate": 0.003, + "loss": 4.1361, + "step": 5860 + }, + { + "epoch": 0.05861, + "grad_norm": 0.6685473536685101, + "learning_rate": 0.003, + "loss": 4.1524, + "step": 5861 + }, + { + "epoch": 0.05862, + "grad_norm": 0.5947123918251042, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 5862 + }, + { + "epoch": 0.05863, + "grad_norm": 0.6529519804727131, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 5863 + }, + { + "epoch": 0.05864, + "grad_norm": 0.7095951129146904, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 5864 + }, + { + "epoch": 0.05865, + "grad_norm": 0.69339783930652, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 5865 + }, + { + "epoch": 0.05866, + "grad_norm": 0.6690878299187139, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 5866 + }, + { + "epoch": 0.05867, + "grad_norm": 0.7086863427678343, + "learning_rate": 0.003, + "loss": 4.1189, + "step": 5867 + }, + { + "epoch": 0.05868, + "grad_norm": 0.6786171909040684, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 5868 + }, + { + "epoch": 0.05869, + "grad_norm": 0.5232033904488403, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 5869 + }, + { + "epoch": 0.0587, + "grad_norm": 0.47022549323018026, + "learning_rate": 0.003, + "loss": 4.109, + "step": 5870 + }, + { + "epoch": 0.05871, + "grad_norm": 0.44108238286000917, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 5871 + }, + { + "epoch": 0.05872, + "grad_norm": 0.3913734511800551, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 5872 + }, + { + "epoch": 0.05873, + "grad_norm": 0.42138434666249186, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5873 + }, + { + "epoch": 0.05874, + "grad_norm": 0.4288958519233743, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 5874 + }, + { + "epoch": 0.05875, + "grad_norm": 0.44577682626719106, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 5875 + }, + { + "epoch": 0.05876, + "grad_norm": 0.4510573231336446, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5876 + }, + { + "epoch": 0.05877, + "grad_norm": 0.43564108247460703, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 5877 + }, + { + "epoch": 0.05878, + "grad_norm": 0.5419890900941077, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 5878 + }, + { + "epoch": 0.05879, + "grad_norm": 0.6244220397965549, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 5879 + }, + { + "epoch": 0.0588, + "grad_norm": 0.7402086193529186, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 5880 + }, + { + "epoch": 0.05881, + "grad_norm": 0.9386524101489326, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 5881 + }, + { + "epoch": 0.05882, + "grad_norm": 0.9522508870914118, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 5882 + }, + { + "epoch": 0.05883, + "grad_norm": 0.8075223632251547, + "learning_rate": 0.003, + "loss": 4.1392, + "step": 5883 + }, + { + "epoch": 0.05884, + "grad_norm": 0.8799909048791514, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 5884 + }, + { + "epoch": 0.05885, + "grad_norm": 0.8530787571893335, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 5885 + }, + { + "epoch": 0.05886, + "grad_norm": 0.7329103122991582, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 5886 + }, + { + "epoch": 0.05887, + "grad_norm": 0.6748479851298841, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 5887 + }, + { + "epoch": 0.05888, + "grad_norm": 0.6343669394015016, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 5888 + }, + { + "epoch": 0.05889, + "grad_norm": 0.6277512686474859, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 5889 + }, + { + "epoch": 0.0589, + "grad_norm": 0.5871377194165454, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 5890 + }, + { + "epoch": 0.05891, + "grad_norm": 0.6464959924160834, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 5891 + }, + { + "epoch": 0.05892, + "grad_norm": 0.666522134105091, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 5892 + }, + { + "epoch": 0.05893, + "grad_norm": 0.6842219549286493, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 5893 + }, + { + "epoch": 0.05894, + "grad_norm": 0.6497740081031832, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 5894 + }, + { + "epoch": 0.05895, + "grad_norm": 0.6324488507002164, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 5895 + }, + { + "epoch": 0.05896, + "grad_norm": 0.6784791219839371, + "learning_rate": 0.003, + "loss": 4.1405, + "step": 5896 + }, + { + "epoch": 0.05897, + "grad_norm": 0.633556875627449, + "learning_rate": 0.003, + "loss": 4.1416, + "step": 5897 + }, + { + "epoch": 0.05898, + "grad_norm": 0.7611480962202484, + "learning_rate": 0.003, + "loss": 4.1402, + "step": 5898 + }, + { + "epoch": 0.05899, + "grad_norm": 0.8755081899697468, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 5899 + }, + { + "epoch": 0.059, + "grad_norm": 0.7092368456575846, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 5900 + }, + { + "epoch": 0.05901, + "grad_norm": 0.7082980339584151, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 5901 + }, + { + "epoch": 0.05902, + "grad_norm": 0.5794941356444853, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 5902 + }, + { + "epoch": 0.05903, + "grad_norm": 0.6148369105501363, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 5903 + }, + { + "epoch": 0.05904, + "grad_norm": 0.49903767340028915, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 5904 + }, + { + "epoch": 0.05905, + "grad_norm": 0.5199067308874674, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 5905 + }, + { + "epoch": 0.05906, + "grad_norm": 0.555958949304611, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 5906 + }, + { + "epoch": 0.05907, + "grad_norm": 0.6558790760046532, + "learning_rate": 0.003, + "loss": 4.108, + "step": 5907 + }, + { + "epoch": 0.05908, + "grad_norm": 0.7027699977426964, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 5908 + }, + { + "epoch": 0.05909, + "grad_norm": 0.695163190283452, + "learning_rate": 0.003, + "loss": 4.1412, + "step": 5909 + }, + { + "epoch": 0.0591, + "grad_norm": 0.6843743312475742, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 5910 + }, + { + "epoch": 0.05911, + "grad_norm": 0.8025018895941953, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 5911 + }, + { + "epoch": 0.05912, + "grad_norm": 0.7607180090206141, + "learning_rate": 0.003, + "loss": 4.1517, + "step": 5912 + }, + { + "epoch": 0.05913, + "grad_norm": 0.6684077408320309, + "learning_rate": 0.003, + "loss": 4.1703, + "step": 5913 + }, + { + "epoch": 0.05914, + "grad_norm": 0.5315953517109148, + "learning_rate": 0.003, + "loss": 4.1367, + "step": 5914 + }, + { + "epoch": 0.05915, + "grad_norm": 0.5149999875273275, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 5915 + }, + { + "epoch": 0.05916, + "grad_norm": 0.5083952172925276, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 5916 + }, + { + "epoch": 0.05917, + "grad_norm": 0.6049621514969292, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 5917 + }, + { + "epoch": 0.05918, + "grad_norm": 0.6835808900733502, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 5918 + }, + { + "epoch": 0.05919, + "grad_norm": 0.6391964983836171, + "learning_rate": 0.003, + "loss": 4.1145, + "step": 5919 + }, + { + "epoch": 0.0592, + "grad_norm": 0.6143559484021399, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 5920 + }, + { + "epoch": 0.05921, + "grad_norm": 0.5979521195261379, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 5921 + }, + { + "epoch": 0.05922, + "grad_norm": 0.5693281868266068, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 5922 + }, + { + "epoch": 0.05923, + "grad_norm": 0.4849615069695424, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 5923 + }, + { + "epoch": 0.05924, + "grad_norm": 0.46280219229994835, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 5924 + }, + { + "epoch": 0.05925, + "grad_norm": 0.48491396846071155, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 5925 + }, + { + "epoch": 0.05926, + "grad_norm": 0.4958640158739645, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 5926 + }, + { + "epoch": 0.05927, + "grad_norm": 0.4751378170054991, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 5927 + }, + { + "epoch": 0.05928, + "grad_norm": 0.4782244181945609, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 5928 + }, + { + "epoch": 0.05929, + "grad_norm": 0.5398915288946496, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 5929 + }, + { + "epoch": 0.0593, + "grad_norm": 0.5515514495551406, + "learning_rate": 0.003, + "loss": 4.123, + "step": 5930 + }, + { + "epoch": 0.05931, + "grad_norm": 0.5294701484364319, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 5931 + }, + { + "epoch": 0.05932, + "grad_norm": 0.636944020483588, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 5932 + }, + { + "epoch": 0.05933, + "grad_norm": 0.7543213645563572, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 5933 + }, + { + "epoch": 0.05934, + "grad_norm": 0.866274898856157, + "learning_rate": 0.003, + "loss": 4.153, + "step": 5934 + }, + { + "epoch": 0.05935, + "grad_norm": 0.8843523390695053, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 5935 + }, + { + "epoch": 0.05936, + "grad_norm": 0.8578932374136042, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 5936 + }, + { + "epoch": 0.05937, + "grad_norm": 0.9791083661921728, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 5937 + }, + { + "epoch": 0.05938, + "grad_norm": 1.091849551063209, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 5938 + }, + { + "epoch": 0.05939, + "grad_norm": 0.8366778414432169, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 5939 + }, + { + "epoch": 0.0594, + "grad_norm": 0.8548416780890227, + "learning_rate": 0.003, + "loss": 4.1619, + "step": 5940 + }, + { + "epoch": 0.05941, + "grad_norm": 0.8185445127313269, + "learning_rate": 0.003, + "loss": 4.1571, + "step": 5941 + }, + { + "epoch": 0.05942, + "grad_norm": 0.7989954573855615, + "learning_rate": 0.003, + "loss": 4.151, + "step": 5942 + }, + { + "epoch": 0.05943, + "grad_norm": 0.6699215014203319, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 5943 + }, + { + "epoch": 0.05944, + "grad_norm": 0.6793053946810763, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 5944 + }, + { + "epoch": 0.05945, + "grad_norm": 0.594609731292735, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 5945 + }, + { + "epoch": 0.05946, + "grad_norm": 0.6220089630527496, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 5946 + }, + { + "epoch": 0.05947, + "grad_norm": 0.6014765437282273, + "learning_rate": 0.003, + "loss": 4.118, + "step": 5947 + }, + { + "epoch": 0.05948, + "grad_norm": 0.5605950560039095, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 5948 + }, + { + "epoch": 0.05949, + "grad_norm": 0.627701504083057, + "learning_rate": 0.003, + "loss": 4.121, + "step": 5949 + }, + { + "epoch": 0.0595, + "grad_norm": 0.825535661839418, + "learning_rate": 0.003, + "loss": 4.1392, + "step": 5950 + }, + { + "epoch": 0.05951, + "grad_norm": 1.016734954201298, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 5951 + }, + { + "epoch": 0.05952, + "grad_norm": 0.9844508366684808, + "learning_rate": 0.003, + "loss": 4.1538, + "step": 5952 + }, + { + "epoch": 0.05953, + "grad_norm": 0.7676164320052801, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 5953 + }, + { + "epoch": 0.05954, + "grad_norm": 0.6525936684271212, + "learning_rate": 0.003, + "loss": 4.1454, + "step": 5954 + }, + { + "epoch": 0.05955, + "grad_norm": 0.6209927774101155, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 5955 + }, + { + "epoch": 0.05956, + "grad_norm": 0.5271299792696181, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 5956 + }, + { + "epoch": 0.05957, + "grad_norm": 0.4821600312349904, + "learning_rate": 0.003, + "loss": 4.108, + "step": 5957 + }, + { + "epoch": 0.05958, + "grad_norm": 0.4508961273097574, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 5958 + }, + { + "epoch": 0.05959, + "grad_norm": 0.39637824515281267, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 5959 + }, + { + "epoch": 0.0596, + "grad_norm": 0.38874299684571356, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 5960 + }, + { + "epoch": 0.05961, + "grad_norm": 0.3847016903534145, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 5961 + }, + { + "epoch": 0.05962, + "grad_norm": 0.5209058986494142, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 5962 + }, + { + "epoch": 0.05963, + "grad_norm": 0.6475184427595679, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 5963 + }, + { + "epoch": 0.05964, + "grad_norm": 0.8684223155180772, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 5964 + }, + { + "epoch": 0.05965, + "grad_norm": 0.9048617493837324, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 5965 + }, + { + "epoch": 0.05966, + "grad_norm": 0.6837588042230079, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 5966 + }, + { + "epoch": 0.05967, + "grad_norm": 0.7348291814277423, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 5967 + }, + { + "epoch": 0.05968, + "grad_norm": 0.8050791089461226, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 5968 + }, + { + "epoch": 0.05969, + "grad_norm": 0.6869212130630161, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 5969 + }, + { + "epoch": 0.0597, + "grad_norm": 0.6307870695677188, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 5970 + }, + { + "epoch": 0.05971, + "grad_norm": 0.5810377904286976, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 5971 + }, + { + "epoch": 0.05972, + "grad_norm": 0.6021740778839361, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 5972 + }, + { + "epoch": 0.05973, + "grad_norm": 0.6228448900216191, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 5973 + }, + { + "epoch": 0.05974, + "grad_norm": 0.6655393691311872, + "learning_rate": 0.003, + "loss": 4.1, + "step": 5974 + }, + { + "epoch": 0.05975, + "grad_norm": 0.7310132470202124, + "learning_rate": 0.003, + "loss": 4.1415, + "step": 5975 + }, + { + "epoch": 0.05976, + "grad_norm": 0.644416431543167, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 5976 + }, + { + "epoch": 0.05977, + "grad_norm": 0.5496400386930255, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 5977 + }, + { + "epoch": 0.05978, + "grad_norm": 0.5085905535105046, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 5978 + }, + { + "epoch": 0.05979, + "grad_norm": 0.4862541012980576, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 5979 + }, + { + "epoch": 0.0598, + "grad_norm": 0.4826350144452677, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 5980 + }, + { + "epoch": 0.05981, + "grad_norm": 0.4741196340897285, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 5981 + }, + { + "epoch": 0.05982, + "grad_norm": 0.46060363145123095, + "learning_rate": 0.003, + "loss": 4.109, + "step": 5982 + }, + { + "epoch": 0.05983, + "grad_norm": 0.4820067683683615, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 5983 + }, + { + "epoch": 0.05984, + "grad_norm": 0.4691794530046038, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 5984 + }, + { + "epoch": 0.05985, + "grad_norm": 0.43934894841255706, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 5985 + }, + { + "epoch": 0.05986, + "grad_norm": 0.47930129072684313, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 5986 + }, + { + "epoch": 0.05987, + "grad_norm": 0.5100592025533284, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 5987 + }, + { + "epoch": 0.05988, + "grad_norm": 0.4911602197245711, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 5988 + }, + { + "epoch": 0.05989, + "grad_norm": 0.537756575806154, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 5989 + }, + { + "epoch": 0.0599, + "grad_norm": 0.6185246945024333, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 5990 + }, + { + "epoch": 0.05991, + "grad_norm": 0.6911938315101449, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 5991 + }, + { + "epoch": 0.05992, + "grad_norm": 0.7869203480784628, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 5992 + }, + { + "epoch": 0.05993, + "grad_norm": 0.8486504973239845, + "learning_rate": 0.003, + "loss": 4.1285, + "step": 5993 + }, + { + "epoch": 0.05994, + "grad_norm": 0.7035449677364836, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 5994 + }, + { + "epoch": 0.05995, + "grad_norm": 0.6451536905550219, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 5995 + }, + { + "epoch": 0.05996, + "grad_norm": 0.5969532336692085, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 5996 + }, + { + "epoch": 0.05997, + "grad_norm": 0.5888895798721049, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 5997 + }, + { + "epoch": 0.05998, + "grad_norm": 0.5821959514538432, + "learning_rate": 0.003, + "loss": 4.138, + "step": 5998 + }, + { + "epoch": 0.05999, + "grad_norm": 0.6236789871480475, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 5999 + }, + { + "epoch": 0.06, + "grad_norm": 0.6956596076788343, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 6000 + }, + { + "epoch": 0.06001, + "grad_norm": 0.6802540034736559, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 6001 + }, + { + "epoch": 0.06002, + "grad_norm": 0.6479478065752834, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 6002 + }, + { + "epoch": 0.06003, + "grad_norm": 0.6764473408981168, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6003 + }, + { + "epoch": 0.06004, + "grad_norm": 0.7085851864172142, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 6004 + }, + { + "epoch": 0.06005, + "grad_norm": 0.8647819122756384, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 6005 + }, + { + "epoch": 0.06006, + "grad_norm": 0.9638548780117764, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 6006 + }, + { + "epoch": 0.06007, + "grad_norm": 0.8220903209106395, + "learning_rate": 0.003, + "loss": 4.1595, + "step": 6007 + }, + { + "epoch": 0.06008, + "grad_norm": 0.7201290584455559, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 6008 + }, + { + "epoch": 0.06009, + "grad_norm": 0.6481918637846485, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 6009 + }, + { + "epoch": 0.0601, + "grad_norm": 0.6606383295563236, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 6010 + }, + { + "epoch": 0.06011, + "grad_norm": 0.6045842842765333, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6011 + }, + { + "epoch": 0.06012, + "grad_norm": 0.5311879872224512, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 6012 + }, + { + "epoch": 0.06013, + "grad_norm": 0.5661934584861681, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 6013 + }, + { + "epoch": 0.06014, + "grad_norm": 0.5103102427525318, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 6014 + }, + { + "epoch": 0.06015, + "grad_norm": 0.5332466894899932, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 6015 + }, + { + "epoch": 0.06016, + "grad_norm": 0.5814846632342617, + "learning_rate": 0.003, + "loss": 4.128, + "step": 6016 + }, + { + "epoch": 0.06017, + "grad_norm": 0.5584668704349169, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6017 + }, + { + "epoch": 0.06018, + "grad_norm": 0.6487569147119433, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 6018 + }, + { + "epoch": 0.06019, + "grad_norm": 0.8160505827483635, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 6019 + }, + { + "epoch": 0.0602, + "grad_norm": 0.9505286189725469, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 6020 + }, + { + "epoch": 0.06021, + "grad_norm": 0.9008202254968174, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 6021 + }, + { + "epoch": 0.06022, + "grad_norm": 0.8318378400927622, + "learning_rate": 0.003, + "loss": 4.1488, + "step": 6022 + }, + { + "epoch": 0.06023, + "grad_norm": 0.8535039615699999, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 6023 + }, + { + "epoch": 0.06024, + "grad_norm": 0.7266061447147755, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 6024 + }, + { + "epoch": 0.06025, + "grad_norm": 0.6534591173507039, + "learning_rate": 0.003, + "loss": 4.167, + "step": 6025 + }, + { + "epoch": 0.06026, + "grad_norm": 0.6882074560691404, + "learning_rate": 0.003, + "loss": 4.1332, + "step": 6026 + }, + { + "epoch": 0.06027, + "grad_norm": 0.6152833498862915, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6027 + }, + { + "epoch": 0.06028, + "grad_norm": 0.6917892126475053, + "learning_rate": 0.003, + "loss": 4.1526, + "step": 6028 + }, + { + "epoch": 0.06029, + "grad_norm": 0.7744787029514602, + "learning_rate": 0.003, + "loss": 4.1476, + "step": 6029 + }, + { + "epoch": 0.0603, + "grad_norm": 0.7298002661604193, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 6030 + }, + { + "epoch": 0.06031, + "grad_norm": 0.6529802283746298, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6031 + }, + { + "epoch": 0.06032, + "grad_norm": 0.663361567248902, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 6032 + }, + { + "epoch": 0.06033, + "grad_norm": 0.6680068753946955, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 6033 + }, + { + "epoch": 0.06034, + "grad_norm": 0.5886111324153055, + "learning_rate": 0.003, + "loss": 4.1497, + "step": 6034 + }, + { + "epoch": 0.06035, + "grad_norm": 0.5817861329776778, + "learning_rate": 0.003, + "loss": 4.14, + "step": 6035 + }, + { + "epoch": 0.06036, + "grad_norm": 0.5937836039304831, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 6036 + }, + { + "epoch": 0.06037, + "grad_norm": 0.5457734592349858, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 6037 + }, + { + "epoch": 0.06038, + "grad_norm": 0.5606694744948917, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 6038 + }, + { + "epoch": 0.06039, + "grad_norm": 0.5636222880096219, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 6039 + }, + { + "epoch": 0.0604, + "grad_norm": 0.499349110228407, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 6040 + }, + { + "epoch": 0.06041, + "grad_norm": 0.4402271663603966, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6041 + }, + { + "epoch": 0.06042, + "grad_norm": 0.5428448624210946, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 6042 + }, + { + "epoch": 0.06043, + "grad_norm": 0.6553234947154927, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 6043 + }, + { + "epoch": 0.06044, + "grad_norm": 0.8600471908171244, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 6044 + }, + { + "epoch": 0.06045, + "grad_norm": 0.872706540571187, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 6045 + }, + { + "epoch": 0.06046, + "grad_norm": 0.821018845139322, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 6046 + }, + { + "epoch": 0.06047, + "grad_norm": 0.9233882633484479, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 6047 + }, + { + "epoch": 0.06048, + "grad_norm": 0.8442957052670749, + "learning_rate": 0.003, + "loss": 4.117, + "step": 6048 + }, + { + "epoch": 0.06049, + "grad_norm": 0.7302305536562084, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 6049 + }, + { + "epoch": 0.0605, + "grad_norm": 0.7221886690004308, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 6050 + }, + { + "epoch": 0.06051, + "grad_norm": 0.7360382510474752, + "learning_rate": 0.003, + "loss": 4.1511, + "step": 6051 + }, + { + "epoch": 0.06052, + "grad_norm": 0.8032114038858249, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 6052 + }, + { + "epoch": 0.06053, + "grad_norm": 0.902493419850475, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 6053 + }, + { + "epoch": 0.06054, + "grad_norm": 0.806885544934829, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 6054 + }, + { + "epoch": 0.06055, + "grad_norm": 0.6869862769047108, + "learning_rate": 0.003, + "loss": 4.1325, + "step": 6055 + }, + { + "epoch": 0.06056, + "grad_norm": 0.6404057857514671, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 6056 + }, + { + "epoch": 0.06057, + "grad_norm": 0.6290861083377466, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 6057 + }, + { + "epoch": 0.06058, + "grad_norm": 0.5444350086738762, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 6058 + }, + { + "epoch": 0.06059, + "grad_norm": 0.5143246073617321, + "learning_rate": 0.003, + "loss": 4.13, + "step": 6059 + }, + { + "epoch": 0.0606, + "grad_norm": 0.5226193521387048, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6060 + }, + { + "epoch": 0.06061, + "grad_norm": 0.5393493288965573, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 6061 + }, + { + "epoch": 0.06062, + "grad_norm": 0.514502958864031, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 6062 + }, + { + "epoch": 0.06063, + "grad_norm": 0.5094815444270113, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 6063 + }, + { + "epoch": 0.06064, + "grad_norm": 0.565356316744902, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 6064 + }, + { + "epoch": 0.06065, + "grad_norm": 0.6136518813857853, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 6065 + }, + { + "epoch": 0.06066, + "grad_norm": 0.7283737698010649, + "learning_rate": 0.003, + "loss": 4.1534, + "step": 6066 + }, + { + "epoch": 0.06067, + "grad_norm": 0.8404355722803185, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 6067 + }, + { + "epoch": 0.06068, + "grad_norm": 0.9467403298027881, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 6068 + }, + { + "epoch": 0.06069, + "grad_norm": 0.7197917961662677, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 6069 + }, + { + "epoch": 0.0607, + "grad_norm": 0.5774704901538709, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 6070 + }, + { + "epoch": 0.06071, + "grad_norm": 0.7311169673433349, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 6071 + }, + { + "epoch": 0.06072, + "grad_norm": 0.7458715679339405, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 6072 + }, + { + "epoch": 0.06073, + "grad_norm": 0.6582937125157123, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 6073 + }, + { + "epoch": 0.06074, + "grad_norm": 0.5828453881664645, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 6074 + }, + { + "epoch": 0.06075, + "grad_norm": 0.6029835041926357, + "learning_rate": 0.003, + "loss": 4.1402, + "step": 6075 + }, + { + "epoch": 0.06076, + "grad_norm": 0.6620595916061112, + "learning_rate": 0.003, + "loss": 4.1266, + "step": 6076 + }, + { + "epoch": 0.06077, + "grad_norm": 0.6564194015584814, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 6077 + }, + { + "epoch": 0.06078, + "grad_norm": 0.5186407499376411, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6078 + }, + { + "epoch": 0.06079, + "grad_norm": 0.46056676475765035, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6079 + }, + { + "epoch": 0.0608, + "grad_norm": 0.47846889118582303, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6080 + }, + { + "epoch": 0.06081, + "grad_norm": 0.5041919175905508, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 6081 + }, + { + "epoch": 0.06082, + "grad_norm": 0.46581276372898556, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 6082 + }, + { + "epoch": 0.06083, + "grad_norm": 0.44508131506221504, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 6083 + }, + { + "epoch": 0.06084, + "grad_norm": 0.4965899394145137, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 6084 + }, + { + "epoch": 0.06085, + "grad_norm": 0.54325827930552, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 6085 + }, + { + "epoch": 0.06086, + "grad_norm": 0.566296273115109, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 6086 + }, + { + "epoch": 0.06087, + "grad_norm": 0.6264732981988343, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6087 + }, + { + "epoch": 0.06088, + "grad_norm": 0.7267394373677858, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 6088 + }, + { + "epoch": 0.06089, + "grad_norm": 0.6910375877495948, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 6089 + }, + { + "epoch": 0.0609, + "grad_norm": 0.6230953516324953, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 6090 + }, + { + "epoch": 0.06091, + "grad_norm": 0.6214986267493964, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 6091 + }, + { + "epoch": 0.06092, + "grad_norm": 0.5881606444052708, + "learning_rate": 0.003, + "loss": 4.096, + "step": 6092 + }, + { + "epoch": 0.06093, + "grad_norm": 0.6823668073031264, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6093 + }, + { + "epoch": 0.06094, + "grad_norm": 0.8167174518561652, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6094 + }, + { + "epoch": 0.06095, + "grad_norm": 0.7935802530659607, + "learning_rate": 0.003, + "loss": 4.126, + "step": 6095 + }, + { + "epoch": 0.06096, + "grad_norm": 0.8625378261886631, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 6096 + }, + { + "epoch": 0.06097, + "grad_norm": 0.8936840972481503, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 6097 + }, + { + "epoch": 0.06098, + "grad_norm": 0.9387438614632022, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6098 + }, + { + "epoch": 0.06099, + "grad_norm": 0.929800973675657, + "learning_rate": 0.003, + "loss": 4.1427, + "step": 6099 + }, + { + "epoch": 0.061, + "grad_norm": 0.7917197466465429, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 6100 + }, + { + "epoch": 0.06101, + "grad_norm": 0.696545220226268, + "learning_rate": 0.003, + "loss": 4.1593, + "step": 6101 + }, + { + "epoch": 0.06102, + "grad_norm": 0.6949648238824377, + "learning_rate": 0.003, + "loss": 4.1406, + "step": 6102 + }, + { + "epoch": 0.06103, + "grad_norm": 0.5850787846680839, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 6103 + }, + { + "epoch": 0.06104, + "grad_norm": 0.6577354660593795, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 6104 + }, + { + "epoch": 0.06105, + "grad_norm": 0.5544659377310093, + "learning_rate": 0.003, + "loss": 4.1189, + "step": 6105 + }, + { + "epoch": 0.06106, + "grad_norm": 0.597694523433053, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 6106 + }, + { + "epoch": 0.06107, + "grad_norm": 0.6874240958175389, + "learning_rate": 0.003, + "loss": 4.1541, + "step": 6107 + }, + { + "epoch": 0.06108, + "grad_norm": 0.7673053125459535, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 6108 + }, + { + "epoch": 0.06109, + "grad_norm": 0.8657897732774222, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 6109 + }, + { + "epoch": 0.0611, + "grad_norm": 0.8160350383434505, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 6110 + }, + { + "epoch": 0.06111, + "grad_norm": 0.7059937113818745, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 6111 + }, + { + "epoch": 0.06112, + "grad_norm": 0.6729024659212177, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 6112 + }, + { + "epoch": 0.06113, + "grad_norm": 0.6927676866964618, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 6113 + }, + { + "epoch": 0.06114, + "grad_norm": 0.6588192987110121, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 6114 + }, + { + "epoch": 0.06115, + "grad_norm": 0.6984192814248, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 6115 + }, + { + "epoch": 0.06116, + "grad_norm": 0.6898828248533009, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6116 + }, + { + "epoch": 0.06117, + "grad_norm": 0.6133649995686336, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 6117 + }, + { + "epoch": 0.06118, + "grad_norm": 0.5717021418949526, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 6118 + }, + { + "epoch": 0.06119, + "grad_norm": 0.5743733423784956, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 6119 + }, + { + "epoch": 0.0612, + "grad_norm": 0.6414842059318022, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 6120 + }, + { + "epoch": 0.06121, + "grad_norm": 0.6294145948404021, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 6121 + }, + { + "epoch": 0.06122, + "grad_norm": 0.6183422089928163, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 6122 + }, + { + "epoch": 0.06123, + "grad_norm": 0.5371837518354552, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 6123 + }, + { + "epoch": 0.06124, + "grad_norm": 0.5226763311000785, + "learning_rate": 0.003, + "loss": 4.113, + "step": 6124 + }, + { + "epoch": 0.06125, + "grad_norm": 0.42639897195355503, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 6125 + }, + { + "epoch": 0.06126, + "grad_norm": 0.3629196367260766, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 6126 + }, + { + "epoch": 0.06127, + "grad_norm": 0.368265832283412, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 6127 + }, + { + "epoch": 0.06128, + "grad_norm": 0.38181801243432145, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 6128 + }, + { + "epoch": 0.06129, + "grad_norm": 0.40095129455194956, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 6129 + }, + { + "epoch": 0.0613, + "grad_norm": 0.4499458135634559, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 6130 + }, + { + "epoch": 0.06131, + "grad_norm": 0.5593938292242131, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 6131 + }, + { + "epoch": 0.06132, + "grad_norm": 0.68971097830543, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 6132 + }, + { + "epoch": 0.06133, + "grad_norm": 0.7970564003592729, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 6133 + }, + { + "epoch": 0.06134, + "grad_norm": 0.7946308123903568, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 6134 + }, + { + "epoch": 0.06135, + "grad_norm": 0.7243365229476787, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6135 + }, + { + "epoch": 0.06136, + "grad_norm": 0.7248377814506896, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 6136 + }, + { + "epoch": 0.06137, + "grad_norm": 0.689387348680236, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 6137 + }, + { + "epoch": 0.06138, + "grad_norm": 0.6434616078916127, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 6138 + }, + { + "epoch": 0.06139, + "grad_norm": 0.6388715462619938, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 6139 + }, + { + "epoch": 0.0614, + "grad_norm": 0.5543186971635615, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 6140 + }, + { + "epoch": 0.06141, + "grad_norm": 0.5449818716272995, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 6141 + }, + { + "epoch": 0.06142, + "grad_norm": 0.5906892436672208, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 6142 + }, + { + "epoch": 0.06143, + "grad_norm": 0.6241199656978966, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 6143 + }, + { + "epoch": 0.06144, + "grad_norm": 0.691568154043788, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 6144 + }, + { + "epoch": 0.06145, + "grad_norm": 0.800383551611604, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 6145 + }, + { + "epoch": 0.06146, + "grad_norm": 0.8240824904436047, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 6146 + }, + { + "epoch": 0.06147, + "grad_norm": 0.8394359495307258, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 6147 + }, + { + "epoch": 0.06148, + "grad_norm": 0.8704187123192046, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 6148 + }, + { + "epoch": 0.06149, + "grad_norm": 0.8286112000168357, + "learning_rate": 0.003, + "loss": 4.123, + "step": 6149 + }, + { + "epoch": 0.0615, + "grad_norm": 0.7291708358830001, + "learning_rate": 0.003, + "loss": 4.1369, + "step": 6150 + }, + { + "epoch": 0.06151, + "grad_norm": 0.6713497435304215, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6151 + }, + { + "epoch": 0.06152, + "grad_norm": 0.6781308025527774, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6152 + }, + { + "epoch": 0.06153, + "grad_norm": 0.652938479870769, + "learning_rate": 0.003, + "loss": 4.1395, + "step": 6153 + }, + { + "epoch": 0.06154, + "grad_norm": 0.6842620061629228, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 6154 + }, + { + "epoch": 0.06155, + "grad_norm": 0.6696388139082615, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 6155 + }, + { + "epoch": 0.06156, + "grad_norm": 0.661871708727772, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 6156 + }, + { + "epoch": 0.06157, + "grad_norm": 0.6706443949736604, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 6157 + }, + { + "epoch": 0.06158, + "grad_norm": 0.6730091441682898, + "learning_rate": 0.003, + "loss": 4.1581, + "step": 6158 + }, + { + "epoch": 0.06159, + "grad_norm": 0.6516318216884822, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6159 + }, + { + "epoch": 0.0616, + "grad_norm": 0.6776433090637509, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 6160 + }, + { + "epoch": 0.06161, + "grad_norm": 0.705245630542708, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 6161 + }, + { + "epoch": 0.06162, + "grad_norm": 0.667033646386603, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6162 + }, + { + "epoch": 0.06163, + "grad_norm": 0.6115409987155483, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 6163 + }, + { + "epoch": 0.06164, + "grad_norm": 0.6022989592102578, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6164 + }, + { + "epoch": 0.06165, + "grad_norm": 0.4995381931517796, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 6165 + }, + { + "epoch": 0.06166, + "grad_norm": 0.5481265388426909, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 6166 + }, + { + "epoch": 0.06167, + "grad_norm": 0.5609608519278377, + "learning_rate": 0.003, + "loss": 4.107, + "step": 6167 + }, + { + "epoch": 0.06168, + "grad_norm": 0.5825751863249862, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 6168 + }, + { + "epoch": 0.06169, + "grad_norm": 0.7711393078072618, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 6169 + }, + { + "epoch": 0.0617, + "grad_norm": 1.0686022680290213, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 6170 + }, + { + "epoch": 0.06171, + "grad_norm": 0.8988455270603705, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 6171 + }, + { + "epoch": 0.06172, + "grad_norm": 0.668550526380113, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 6172 + }, + { + "epoch": 0.06173, + "grad_norm": 0.6432161964405613, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 6173 + }, + { + "epoch": 0.06174, + "grad_norm": 0.6965904305758828, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 6174 + }, + { + "epoch": 0.06175, + "grad_norm": 0.6884542211695148, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 6175 + }, + { + "epoch": 0.06176, + "grad_norm": 0.722156592407858, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6176 + }, + { + "epoch": 0.06177, + "grad_norm": 0.7406001109119403, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 6177 + }, + { + "epoch": 0.06178, + "grad_norm": 0.7298011712863863, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 6178 + }, + { + "epoch": 0.06179, + "grad_norm": 0.7122823712875358, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6179 + }, + { + "epoch": 0.0618, + "grad_norm": 0.5964322698799788, + "learning_rate": 0.003, + "loss": 4.123, + "step": 6180 + }, + { + "epoch": 0.06181, + "grad_norm": 0.581223696651277, + "learning_rate": 0.003, + "loss": 4.1402, + "step": 6181 + }, + { + "epoch": 0.06182, + "grad_norm": 0.6517698358193322, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 6182 + }, + { + "epoch": 0.06183, + "grad_norm": 0.82973155998321, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 6183 + }, + { + "epoch": 0.06184, + "grad_norm": 1.0291494249526294, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 6184 + }, + { + "epoch": 0.06185, + "grad_norm": 0.838098174913646, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 6185 + }, + { + "epoch": 0.06186, + "grad_norm": 0.7438011498693125, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 6186 + }, + { + "epoch": 0.06187, + "grad_norm": 0.6718856898807979, + "learning_rate": 0.003, + "loss": 4.149, + "step": 6187 + }, + { + "epoch": 0.06188, + "grad_norm": 0.5340636370837376, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 6188 + }, + { + "epoch": 0.06189, + "grad_norm": 0.5159651984761162, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 6189 + }, + { + "epoch": 0.0619, + "grad_norm": 0.4916863941661442, + "learning_rate": 0.003, + "loss": 4.1512, + "step": 6190 + }, + { + "epoch": 0.06191, + "grad_norm": 0.4762063974718235, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 6191 + }, + { + "epoch": 0.06192, + "grad_norm": 0.46078596463013155, + "learning_rate": 0.003, + "loss": 4.103, + "step": 6192 + }, + { + "epoch": 0.06193, + "grad_norm": 0.45789803674311924, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 6193 + }, + { + "epoch": 0.06194, + "grad_norm": 0.3890805745658892, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 6194 + }, + { + "epoch": 0.06195, + "grad_norm": 0.39301414143303115, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 6195 + }, + { + "epoch": 0.06196, + "grad_norm": 0.4015529007182125, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 6196 + }, + { + "epoch": 0.06197, + "grad_norm": 0.4155210473439802, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 6197 + }, + { + "epoch": 0.06198, + "grad_norm": 0.4460266614970883, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 6198 + }, + { + "epoch": 0.06199, + "grad_norm": 0.4771646815553033, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 6199 + }, + { + "epoch": 0.062, + "grad_norm": 0.5196589692573884, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 6200 + }, + { + "epoch": 0.06201, + "grad_norm": 0.654622055901127, + "learning_rate": 0.003, + "loss": 4.097, + "step": 6201 + }, + { + "epoch": 0.06202, + "grad_norm": 0.7996732523950479, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 6202 + }, + { + "epoch": 0.06203, + "grad_norm": 0.9238177375399548, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 6203 + }, + { + "epoch": 0.06204, + "grad_norm": 0.8129041862310228, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6204 + }, + { + "epoch": 0.06205, + "grad_norm": 0.6014391143548676, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 6205 + }, + { + "epoch": 0.06206, + "grad_norm": 0.6512606966939022, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 6206 + }, + { + "epoch": 0.06207, + "grad_norm": 0.6313513041064389, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 6207 + }, + { + "epoch": 0.06208, + "grad_norm": 0.5802624432054514, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 6208 + }, + { + "epoch": 0.06209, + "grad_norm": 0.5117728936425465, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 6209 + }, + { + "epoch": 0.0621, + "grad_norm": 0.5061612945373342, + "learning_rate": 0.003, + "loss": 4.15, + "step": 6210 + }, + { + "epoch": 0.06211, + "grad_norm": 0.5713900960332042, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 6211 + }, + { + "epoch": 0.06212, + "grad_norm": 0.6069757619115741, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6212 + }, + { + "epoch": 0.06213, + "grad_norm": 0.7325742487754486, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 6213 + }, + { + "epoch": 0.06214, + "grad_norm": 0.9327829534552855, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 6214 + }, + { + "epoch": 0.06215, + "grad_norm": 1.035143571261494, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 6215 + }, + { + "epoch": 0.06216, + "grad_norm": 0.9038892809710738, + "learning_rate": 0.003, + "loss": 4.1712, + "step": 6216 + }, + { + "epoch": 0.06217, + "grad_norm": 0.8175109198583256, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 6217 + }, + { + "epoch": 0.06218, + "grad_norm": 0.8799511997661614, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 6218 + }, + { + "epoch": 0.06219, + "grad_norm": 0.9186492040029288, + "learning_rate": 0.003, + "loss": 4.1569, + "step": 6219 + }, + { + "epoch": 0.0622, + "grad_norm": 0.904845342853041, + "learning_rate": 0.003, + "loss": 4.1554, + "step": 6220 + }, + { + "epoch": 0.06221, + "grad_norm": 0.8348080339462427, + "learning_rate": 0.003, + "loss": 4.1528, + "step": 6221 + }, + { + "epoch": 0.06222, + "grad_norm": 0.7823721322877822, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 6222 + }, + { + "epoch": 0.06223, + "grad_norm": 0.8355859013287794, + "learning_rate": 0.003, + "loss": 4.1484, + "step": 6223 + }, + { + "epoch": 0.06224, + "grad_norm": 0.9715063855754449, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6224 + }, + { + "epoch": 0.06225, + "grad_norm": 0.7808199034581652, + "learning_rate": 0.003, + "loss": 4.1509, + "step": 6225 + }, + { + "epoch": 0.06226, + "grad_norm": 0.5329204522038972, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 6226 + }, + { + "epoch": 0.06227, + "grad_norm": 0.5450265158322551, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 6227 + }, + { + "epoch": 0.06228, + "grad_norm": 0.5351380607743084, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 6228 + }, + { + "epoch": 0.06229, + "grad_norm": 0.4599058234618347, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 6229 + }, + { + "epoch": 0.0623, + "grad_norm": 0.4873390679451465, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 6230 + }, + { + "epoch": 0.06231, + "grad_norm": 0.5391956252106268, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 6231 + }, + { + "epoch": 0.06232, + "grad_norm": 0.5342591833135322, + "learning_rate": 0.003, + "loss": 4.11, + "step": 6232 + }, + { + "epoch": 0.06233, + "grad_norm": 0.5191689316659253, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6233 + }, + { + "epoch": 0.06234, + "grad_norm": 0.5396807325147936, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 6234 + }, + { + "epoch": 0.06235, + "grad_norm": 0.5913027145203329, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 6235 + }, + { + "epoch": 0.06236, + "grad_norm": 0.5979504829726415, + "learning_rate": 0.003, + "loss": 4.126, + "step": 6236 + }, + { + "epoch": 0.06237, + "grad_norm": 0.5052384980096988, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 6237 + }, + { + "epoch": 0.06238, + "grad_norm": 0.6128825413246882, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 6238 + }, + { + "epoch": 0.06239, + "grad_norm": 0.7235535837216264, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 6239 + }, + { + "epoch": 0.0624, + "grad_norm": 0.7240803076746741, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 6240 + }, + { + "epoch": 0.06241, + "grad_norm": 0.6862973733458866, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 6241 + }, + { + "epoch": 0.06242, + "grad_norm": 0.7830627984578179, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6242 + }, + { + "epoch": 0.06243, + "grad_norm": 0.7591018516451389, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6243 + }, + { + "epoch": 0.06244, + "grad_norm": 0.7952190700820353, + "learning_rate": 0.003, + "loss": 4.1517, + "step": 6244 + }, + { + "epoch": 0.06245, + "grad_norm": 0.6835256460654824, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 6245 + }, + { + "epoch": 0.06246, + "grad_norm": 0.6543359618448353, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 6246 + }, + { + "epoch": 0.06247, + "grad_norm": 0.5419452686920828, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 6247 + }, + { + "epoch": 0.06248, + "grad_norm": 0.4806235291826554, + "learning_rate": 0.003, + "loss": 4.114, + "step": 6248 + }, + { + "epoch": 0.06249, + "grad_norm": 0.453910776396139, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 6249 + }, + { + "epoch": 0.0625, + "grad_norm": 0.491783945365717, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 6250 + }, + { + "epoch": 0.06251, + "grad_norm": 0.5018941564042159, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 6251 + }, + { + "epoch": 0.06252, + "grad_norm": 0.4921472089644119, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 6252 + }, + { + "epoch": 0.06253, + "grad_norm": 0.5756543753609307, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 6253 + }, + { + "epoch": 0.06254, + "grad_norm": 0.754887950867572, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 6254 + }, + { + "epoch": 0.06255, + "grad_norm": 0.8752296029209932, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 6255 + }, + { + "epoch": 0.06256, + "grad_norm": 0.9844123215090611, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 6256 + }, + { + "epoch": 0.06257, + "grad_norm": 0.8214120124061037, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 6257 + }, + { + "epoch": 0.06258, + "grad_norm": 0.7602595490679227, + "learning_rate": 0.003, + "loss": 4.141, + "step": 6258 + }, + { + "epoch": 0.06259, + "grad_norm": 0.9107431312102489, + "learning_rate": 0.003, + "loss": 4.1605, + "step": 6259 + }, + { + "epoch": 0.0626, + "grad_norm": 0.8325776100808221, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 6260 + }, + { + "epoch": 0.06261, + "grad_norm": 0.7571831266663098, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6261 + }, + { + "epoch": 0.06262, + "grad_norm": 0.8785267861570065, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 6262 + }, + { + "epoch": 0.06263, + "grad_norm": 0.8761411863659299, + "learning_rate": 0.003, + "loss": 4.1576, + "step": 6263 + }, + { + "epoch": 0.06264, + "grad_norm": 0.7407405897110528, + "learning_rate": 0.003, + "loss": 4.1281, + "step": 6264 + }, + { + "epoch": 0.06265, + "grad_norm": 0.5999131309688215, + "learning_rate": 0.003, + "loss": 4.1401, + "step": 6265 + }, + { + "epoch": 0.06266, + "grad_norm": 0.5788028949497352, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 6266 + }, + { + "epoch": 0.06267, + "grad_norm": 0.5413831011099959, + "learning_rate": 0.003, + "loss": 4.127, + "step": 6267 + }, + { + "epoch": 0.06268, + "grad_norm": 0.5311326877936203, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 6268 + }, + { + "epoch": 0.06269, + "grad_norm": 0.46005639814155846, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 6269 + }, + { + "epoch": 0.0627, + "grad_norm": 0.4377136995334488, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 6270 + }, + { + "epoch": 0.06271, + "grad_norm": 0.45104310998898184, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 6271 + }, + { + "epoch": 0.06272, + "grad_norm": 0.5739468267833442, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 6272 + }, + { + "epoch": 0.06273, + "grad_norm": 0.6519100999767281, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 6273 + }, + { + "epoch": 0.06274, + "grad_norm": 0.7586632185001425, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 6274 + }, + { + "epoch": 0.06275, + "grad_norm": 0.8032215370564139, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 6275 + }, + { + "epoch": 0.06276, + "grad_norm": 0.808677786213581, + "learning_rate": 0.003, + "loss": 4.1316, + "step": 6276 + }, + { + "epoch": 0.06277, + "grad_norm": 0.7705443100288017, + "learning_rate": 0.003, + "loss": 4.1371, + "step": 6277 + }, + { + "epoch": 0.06278, + "grad_norm": 0.6723307868789453, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 6278 + }, + { + "epoch": 0.06279, + "grad_norm": 0.650139228463092, + "learning_rate": 0.003, + "loss": 4.1425, + "step": 6279 + }, + { + "epoch": 0.0628, + "grad_norm": 0.6941961358979846, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 6280 + }, + { + "epoch": 0.06281, + "grad_norm": 0.6921836377836419, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 6281 + }, + { + "epoch": 0.06282, + "grad_norm": 0.7247534756382068, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 6282 + }, + { + "epoch": 0.06283, + "grad_norm": 0.698538662441215, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 6283 + }, + { + "epoch": 0.06284, + "grad_norm": 0.6478092027016852, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 6284 + }, + { + "epoch": 0.06285, + "grad_norm": 0.588832262527032, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 6285 + }, + { + "epoch": 0.06286, + "grad_norm": 0.471220817590307, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 6286 + }, + { + "epoch": 0.06287, + "grad_norm": 0.5253206409382251, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 6287 + }, + { + "epoch": 0.06288, + "grad_norm": 0.5700598976368867, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 6288 + }, + { + "epoch": 0.06289, + "grad_norm": 0.5480401509047442, + "learning_rate": 0.003, + "loss": 4.078, + "step": 6289 + }, + { + "epoch": 0.0629, + "grad_norm": 0.45176823851380066, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 6290 + }, + { + "epoch": 0.06291, + "grad_norm": 0.42244036366221777, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 6291 + }, + { + "epoch": 0.06292, + "grad_norm": 0.4757895925155192, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 6292 + }, + { + "epoch": 0.06293, + "grad_norm": 0.46687329270252953, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 6293 + }, + { + "epoch": 0.06294, + "grad_norm": 0.4692166334566838, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 6294 + }, + { + "epoch": 0.06295, + "grad_norm": 0.4964943486782987, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 6295 + }, + { + "epoch": 0.06296, + "grad_norm": 0.6023566234041191, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 6296 + }, + { + "epoch": 0.06297, + "grad_norm": 0.6562138349481754, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 6297 + }, + { + "epoch": 0.06298, + "grad_norm": 0.7252858308308101, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 6298 + }, + { + "epoch": 0.06299, + "grad_norm": 0.7441518369961619, + "learning_rate": 0.003, + "loss": 4.109, + "step": 6299 + }, + { + "epoch": 0.063, + "grad_norm": 0.643909205073487, + "learning_rate": 0.003, + "loss": 4.1558, + "step": 6300 + }, + { + "epoch": 0.06301, + "grad_norm": 0.7076717275211952, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 6301 + }, + { + "epoch": 0.06302, + "grad_norm": 0.8610584882505145, + "learning_rate": 0.003, + "loss": 4.1298, + "step": 6302 + }, + { + "epoch": 0.06303, + "grad_norm": 0.9134770715261514, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 6303 + }, + { + "epoch": 0.06304, + "grad_norm": 0.8308795580072063, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 6304 + }, + { + "epoch": 0.06305, + "grad_norm": 0.7703883250041841, + "learning_rate": 0.003, + "loss": 4.1556, + "step": 6305 + }, + { + "epoch": 0.06306, + "grad_norm": 0.8074367440959146, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 6306 + }, + { + "epoch": 0.06307, + "grad_norm": 0.7821961103686115, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 6307 + }, + { + "epoch": 0.06308, + "grad_norm": 0.793746924122557, + "learning_rate": 0.003, + "loss": 4.1339, + "step": 6308 + }, + { + "epoch": 0.06309, + "grad_norm": 0.8187320878830382, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6309 + }, + { + "epoch": 0.0631, + "grad_norm": 0.8140608370958674, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6310 + }, + { + "epoch": 0.06311, + "grad_norm": 0.7925215372022332, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 6311 + }, + { + "epoch": 0.06312, + "grad_norm": 0.6948175841091384, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 6312 + }, + { + "epoch": 0.06313, + "grad_norm": 0.6098450127577909, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 6313 + }, + { + "epoch": 0.06314, + "grad_norm": 0.601788808023616, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 6314 + }, + { + "epoch": 0.06315, + "grad_norm": 0.7002256479736269, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 6315 + }, + { + "epoch": 0.06316, + "grad_norm": 0.6702801162745691, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 6316 + }, + { + "epoch": 0.06317, + "grad_norm": 0.6767392466613616, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 6317 + }, + { + "epoch": 0.06318, + "grad_norm": 0.7396695999029177, + "learning_rate": 0.003, + "loss": 4.1577, + "step": 6318 + }, + { + "epoch": 0.06319, + "grad_norm": 0.7873666098443218, + "learning_rate": 0.003, + "loss": 4.1632, + "step": 6319 + }, + { + "epoch": 0.0632, + "grad_norm": 0.7968696734464397, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 6320 + }, + { + "epoch": 0.06321, + "grad_norm": 0.748485567878394, + "learning_rate": 0.003, + "loss": 4.1373, + "step": 6321 + }, + { + "epoch": 0.06322, + "grad_norm": 0.700562547460408, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 6322 + }, + { + "epoch": 0.06323, + "grad_norm": 0.5674479127547497, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 6323 + }, + { + "epoch": 0.06324, + "grad_norm": 0.5979201027650989, + "learning_rate": 0.003, + "loss": 4.129, + "step": 6324 + }, + { + "epoch": 0.06325, + "grad_norm": 0.7374786634922282, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6325 + }, + { + "epoch": 0.06326, + "grad_norm": 0.700878890093403, + "learning_rate": 0.003, + "loss": 4.1281, + "step": 6326 + }, + { + "epoch": 0.06327, + "grad_norm": 0.6414815261782608, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 6327 + }, + { + "epoch": 0.06328, + "grad_norm": 0.5746442825430522, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 6328 + }, + { + "epoch": 0.06329, + "grad_norm": 0.4936256424209013, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 6329 + }, + { + "epoch": 0.0633, + "grad_norm": 0.5247583876058833, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 6330 + }, + { + "epoch": 0.06331, + "grad_norm": 0.5330239575249108, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 6331 + }, + { + "epoch": 0.06332, + "grad_norm": 0.5813651225344605, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 6332 + }, + { + "epoch": 0.06333, + "grad_norm": 0.597763757106397, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 6333 + }, + { + "epoch": 0.06334, + "grad_norm": 0.6306087415380968, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 6334 + }, + { + "epoch": 0.06335, + "grad_norm": 0.7275752434891477, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 6335 + }, + { + "epoch": 0.06336, + "grad_norm": 0.7536067920651794, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 6336 + }, + { + "epoch": 0.06337, + "grad_norm": 0.7434090606482405, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 6337 + }, + { + "epoch": 0.06338, + "grad_norm": 0.7917061963285527, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 6338 + }, + { + "epoch": 0.06339, + "grad_norm": 0.7963119547355659, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 6339 + }, + { + "epoch": 0.0634, + "grad_norm": 0.7009711549087663, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 6340 + }, + { + "epoch": 0.06341, + "grad_norm": 0.560288845949728, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 6341 + }, + { + "epoch": 0.06342, + "grad_norm": 0.6129201020379068, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 6342 + }, + { + "epoch": 0.06343, + "grad_norm": 0.6289890308043102, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 6343 + }, + { + "epoch": 0.06344, + "grad_norm": 0.5883345878311979, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 6344 + }, + { + "epoch": 0.06345, + "grad_norm": 0.6114359553520017, + "learning_rate": 0.003, + "loss": 4.105, + "step": 6345 + }, + { + "epoch": 0.06346, + "grad_norm": 0.6130452330266231, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 6346 + }, + { + "epoch": 0.06347, + "grad_norm": 0.5609868656725044, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 6347 + }, + { + "epoch": 0.06348, + "grad_norm": 0.5111538310088432, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 6348 + }, + { + "epoch": 0.06349, + "grad_norm": 0.47960134551277134, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 6349 + }, + { + "epoch": 0.0635, + "grad_norm": 0.47528221979889074, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 6350 + }, + { + "epoch": 0.06351, + "grad_norm": 0.44208488045258604, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 6351 + }, + { + "epoch": 0.06352, + "grad_norm": 0.5710585928560157, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 6352 + }, + { + "epoch": 0.06353, + "grad_norm": 0.8023230715536823, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 6353 + }, + { + "epoch": 0.06354, + "grad_norm": 1.0817933740124142, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 6354 + }, + { + "epoch": 0.06355, + "grad_norm": 0.839081793625545, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 6355 + }, + { + "epoch": 0.06356, + "grad_norm": 0.6498473408279323, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6356 + }, + { + "epoch": 0.06357, + "grad_norm": 0.8657856383717981, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 6357 + }, + { + "epoch": 0.06358, + "grad_norm": 0.7367851380124152, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 6358 + }, + { + "epoch": 0.06359, + "grad_norm": 0.5580748213290369, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 6359 + }, + { + "epoch": 0.0636, + "grad_norm": 0.5715351960251037, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 6360 + }, + { + "epoch": 0.06361, + "grad_norm": 0.565402082000186, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 6361 + }, + { + "epoch": 0.06362, + "grad_norm": 0.6032607082774909, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 6362 + }, + { + "epoch": 0.06363, + "grad_norm": 0.5994033595373762, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6363 + }, + { + "epoch": 0.06364, + "grad_norm": 0.5608720225119531, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 6364 + }, + { + "epoch": 0.06365, + "grad_norm": 0.5416673020756672, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 6365 + }, + { + "epoch": 0.06366, + "grad_norm": 0.5508838364908115, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 6366 + }, + { + "epoch": 0.06367, + "grad_norm": 0.6095798041890194, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6367 + }, + { + "epoch": 0.06368, + "grad_norm": 0.582397991543009, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 6368 + }, + { + "epoch": 0.06369, + "grad_norm": 0.6111604358018735, + "learning_rate": 0.003, + "loss": 4.145, + "step": 6369 + }, + { + "epoch": 0.0637, + "grad_norm": 0.6887636290273739, + "learning_rate": 0.003, + "loss": 4.108, + "step": 6370 + }, + { + "epoch": 0.06371, + "grad_norm": 0.7312125926181495, + "learning_rate": 0.003, + "loss": 4.1314, + "step": 6371 + }, + { + "epoch": 0.06372, + "grad_norm": 0.7404666495620588, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 6372 + }, + { + "epoch": 0.06373, + "grad_norm": 0.7812823153466856, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6373 + }, + { + "epoch": 0.06374, + "grad_norm": 0.7678179937935664, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 6374 + }, + { + "epoch": 0.06375, + "grad_norm": 0.7856368742093315, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 6375 + }, + { + "epoch": 0.06376, + "grad_norm": 0.7933141620096883, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 6376 + }, + { + "epoch": 0.06377, + "grad_norm": 0.7911553024083666, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 6377 + }, + { + "epoch": 0.06378, + "grad_norm": 0.7097351170207112, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 6378 + }, + { + "epoch": 0.06379, + "grad_norm": 0.6022623562253553, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 6379 + }, + { + "epoch": 0.0638, + "grad_norm": 0.6161727993530954, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 6380 + }, + { + "epoch": 0.06381, + "grad_norm": 0.5896296208532763, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 6381 + }, + { + "epoch": 0.06382, + "grad_norm": 0.5666427713210422, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6382 + }, + { + "epoch": 0.06383, + "grad_norm": 0.5280695299014706, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 6383 + }, + { + "epoch": 0.06384, + "grad_norm": 0.5315287442798651, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 6384 + }, + { + "epoch": 0.06385, + "grad_norm": 0.6180327194782872, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 6385 + }, + { + "epoch": 0.06386, + "grad_norm": 0.6877124042376932, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 6386 + }, + { + "epoch": 0.06387, + "grad_norm": 0.7049546795784516, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 6387 + }, + { + "epoch": 0.06388, + "grad_norm": 0.7560102533270469, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 6388 + }, + { + "epoch": 0.06389, + "grad_norm": 0.7727138622823404, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 6389 + }, + { + "epoch": 0.0639, + "grad_norm": 0.6854773910756959, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6390 + }, + { + "epoch": 0.06391, + "grad_norm": 0.6116296991895176, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 6391 + }, + { + "epoch": 0.06392, + "grad_norm": 0.7278138368053769, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 6392 + }, + { + "epoch": 0.06393, + "grad_norm": 0.8102481077528341, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 6393 + }, + { + "epoch": 0.06394, + "grad_norm": 0.9128273035282637, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 6394 + }, + { + "epoch": 0.06395, + "grad_norm": 0.8577312836930895, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 6395 + }, + { + "epoch": 0.06396, + "grad_norm": 0.7745179973456292, + "learning_rate": 0.003, + "loss": 4.1578, + "step": 6396 + }, + { + "epoch": 0.06397, + "grad_norm": 0.7881218719479312, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 6397 + }, + { + "epoch": 0.06398, + "grad_norm": 0.7024891362838419, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 6398 + }, + { + "epoch": 0.06399, + "grad_norm": 0.7110448250319877, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 6399 + }, + { + "epoch": 0.064, + "grad_norm": 0.7111684482738276, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 6400 + }, + { + "epoch": 0.06401, + "grad_norm": 0.693597181970454, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 6401 + }, + { + "epoch": 0.06402, + "grad_norm": 0.7922387757761263, + "learning_rate": 0.003, + "loss": 4.115, + "step": 6402 + }, + { + "epoch": 0.06403, + "grad_norm": 0.8260320747984292, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 6403 + }, + { + "epoch": 0.06404, + "grad_norm": 0.729663274761036, + "learning_rate": 0.003, + "loss": 4.1675, + "step": 6404 + }, + { + "epoch": 0.06405, + "grad_norm": 0.7378732939090409, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6405 + }, + { + "epoch": 0.06406, + "grad_norm": 0.6383629571059021, + "learning_rate": 0.003, + "loss": 4.112, + "step": 6406 + }, + { + "epoch": 0.06407, + "grad_norm": 0.6050162196732369, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 6407 + }, + { + "epoch": 0.06408, + "grad_norm": 0.5983568485448366, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 6408 + }, + { + "epoch": 0.06409, + "grad_norm": 0.5642818531961227, + "learning_rate": 0.003, + "loss": 4.1332, + "step": 6409 + }, + { + "epoch": 0.0641, + "grad_norm": 0.5874430801692443, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 6410 + }, + { + "epoch": 0.06411, + "grad_norm": 0.6330649262504477, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 6411 + }, + { + "epoch": 0.06412, + "grad_norm": 0.6618713014028005, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 6412 + }, + { + "epoch": 0.06413, + "grad_norm": 0.700072236260971, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 6413 + }, + { + "epoch": 0.06414, + "grad_norm": 0.6353084694205889, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 6414 + }, + { + "epoch": 0.06415, + "grad_norm": 0.5831173982008553, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 6415 + }, + { + "epoch": 0.06416, + "grad_norm": 0.5437779001226543, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 6416 + }, + { + "epoch": 0.06417, + "grad_norm": 0.6107145316832859, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 6417 + }, + { + "epoch": 0.06418, + "grad_norm": 0.6005928402730104, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 6418 + }, + { + "epoch": 0.06419, + "grad_norm": 0.583926884989267, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 6419 + }, + { + "epoch": 0.0642, + "grad_norm": 0.6742080868290792, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6420 + }, + { + "epoch": 0.06421, + "grad_norm": 0.7855355256548284, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 6421 + }, + { + "epoch": 0.06422, + "grad_norm": 0.9319136699441484, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 6422 + }, + { + "epoch": 0.06423, + "grad_norm": 0.999583894911011, + "learning_rate": 0.003, + "loss": 4.1451, + "step": 6423 + }, + { + "epoch": 0.06424, + "grad_norm": 0.9331342374240993, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 6424 + }, + { + "epoch": 0.06425, + "grad_norm": 0.7793159824304509, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 6425 + }, + { + "epoch": 0.06426, + "grad_norm": 0.6962948866885783, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 6426 + }, + { + "epoch": 0.06427, + "grad_norm": 0.683410446043668, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 6427 + }, + { + "epoch": 0.06428, + "grad_norm": 0.6512307290531658, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 6428 + }, + { + "epoch": 0.06429, + "grad_norm": 0.6377825140551957, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 6429 + }, + { + "epoch": 0.0643, + "grad_norm": 0.5759747531004582, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 6430 + }, + { + "epoch": 0.06431, + "grad_norm": 0.5676646229844707, + "learning_rate": 0.003, + "loss": 4.114, + "step": 6431 + }, + { + "epoch": 0.06432, + "grad_norm": 0.5461669723653775, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 6432 + }, + { + "epoch": 0.06433, + "grad_norm": 0.45797081849907423, + "learning_rate": 0.003, + "loss": 4.13, + "step": 6433 + }, + { + "epoch": 0.06434, + "grad_norm": 0.40792521706337564, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 6434 + }, + { + "epoch": 0.06435, + "grad_norm": 0.4730517544060023, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6435 + }, + { + "epoch": 0.06436, + "grad_norm": 0.4766134688447076, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 6436 + }, + { + "epoch": 0.06437, + "grad_norm": 0.45378874087374643, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 6437 + }, + { + "epoch": 0.06438, + "grad_norm": 0.44043582431299527, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 6438 + }, + { + "epoch": 0.06439, + "grad_norm": 0.46296521989188577, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 6439 + }, + { + "epoch": 0.0644, + "grad_norm": 0.43339108687839656, + "learning_rate": 0.003, + "loss": 4.095, + "step": 6440 + }, + { + "epoch": 0.06441, + "grad_norm": 0.5476027827440176, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 6441 + }, + { + "epoch": 0.06442, + "grad_norm": 0.7595327046230422, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 6442 + }, + { + "epoch": 0.06443, + "grad_norm": 0.8816633245265507, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 6443 + }, + { + "epoch": 0.06444, + "grad_norm": 0.7973460889122836, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 6444 + }, + { + "epoch": 0.06445, + "grad_norm": 0.6695114396139051, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6445 + }, + { + "epoch": 0.06446, + "grad_norm": 0.7112577504861094, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 6446 + }, + { + "epoch": 0.06447, + "grad_norm": 0.6669997171392789, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 6447 + }, + { + "epoch": 0.06448, + "grad_norm": 0.7000529377796273, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 6448 + }, + { + "epoch": 0.06449, + "grad_norm": 0.7629296906277161, + "learning_rate": 0.003, + "loss": 4.1573, + "step": 6449 + }, + { + "epoch": 0.0645, + "grad_norm": 0.8151936619117285, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 6450 + }, + { + "epoch": 0.06451, + "grad_norm": 0.7790440723976773, + "learning_rate": 0.003, + "loss": 4.1546, + "step": 6451 + }, + { + "epoch": 0.06452, + "grad_norm": 0.7540024646037367, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 6452 + }, + { + "epoch": 0.06453, + "grad_norm": 0.7911961892915145, + "learning_rate": 0.003, + "loss": 4.1565, + "step": 6453 + }, + { + "epoch": 0.06454, + "grad_norm": 0.8237841593306827, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 6454 + }, + { + "epoch": 0.06455, + "grad_norm": 0.8231550575684212, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6455 + }, + { + "epoch": 0.06456, + "grad_norm": 0.7811541896303539, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 6456 + }, + { + "epoch": 0.06457, + "grad_norm": 0.8750658750124761, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6457 + }, + { + "epoch": 0.06458, + "grad_norm": 0.9462957109484857, + "learning_rate": 0.003, + "loss": 4.1368, + "step": 6458 + }, + { + "epoch": 0.06459, + "grad_norm": 0.951508448401663, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 6459 + }, + { + "epoch": 0.0646, + "grad_norm": 0.7887348617904948, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 6460 + }, + { + "epoch": 0.06461, + "grad_norm": 0.6561119853474549, + "learning_rate": 0.003, + "loss": 4.1643, + "step": 6461 + }, + { + "epoch": 0.06462, + "grad_norm": 0.6564843031193351, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 6462 + }, + { + "epoch": 0.06463, + "grad_norm": 0.6866574328355095, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 6463 + }, + { + "epoch": 0.06464, + "grad_norm": 0.7412943846552807, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 6464 + }, + { + "epoch": 0.06465, + "grad_norm": 0.8607666448452919, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 6465 + }, + { + "epoch": 0.06466, + "grad_norm": 0.9805980509960028, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 6466 + }, + { + "epoch": 0.06467, + "grad_norm": 0.8240025115330769, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 6467 + }, + { + "epoch": 0.06468, + "grad_norm": 0.6915322018007451, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 6468 + }, + { + "epoch": 0.06469, + "grad_norm": 0.758205250001039, + "learning_rate": 0.003, + "loss": 4.1393, + "step": 6469 + }, + { + "epoch": 0.0647, + "grad_norm": 0.6894610388291892, + "learning_rate": 0.003, + "loss": 4.1469, + "step": 6470 + }, + { + "epoch": 0.06471, + "grad_norm": 0.5734326985005797, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 6471 + }, + { + "epoch": 0.06472, + "grad_norm": 0.6006963939375836, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 6472 + }, + { + "epoch": 0.06473, + "grad_norm": 0.5903102918749725, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 6473 + }, + { + "epoch": 0.06474, + "grad_norm": 0.6245812199248715, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 6474 + }, + { + "epoch": 0.06475, + "grad_norm": 0.6385947320236768, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 6475 + }, + { + "epoch": 0.06476, + "grad_norm": 0.5964051582023622, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6476 + }, + { + "epoch": 0.06477, + "grad_norm": 0.4545337714425668, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6477 + }, + { + "epoch": 0.06478, + "grad_norm": 0.4067813603673349, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 6478 + }, + { + "epoch": 0.06479, + "grad_norm": 0.35829021719441345, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 6479 + }, + { + "epoch": 0.0648, + "grad_norm": 0.36043755979794306, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 6480 + }, + { + "epoch": 0.06481, + "grad_norm": 0.379368972959528, + "learning_rate": 0.003, + "loss": 4.102, + "step": 6481 + }, + { + "epoch": 0.06482, + "grad_norm": 0.3991318139475991, + "learning_rate": 0.003, + "loss": 4.1, + "step": 6482 + }, + { + "epoch": 0.06483, + "grad_norm": 0.5102756201687999, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 6483 + }, + { + "epoch": 0.06484, + "grad_norm": 0.6747208649376389, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 6484 + }, + { + "epoch": 0.06485, + "grad_norm": 0.818751842126756, + "learning_rate": 0.003, + "loss": 4.1385, + "step": 6485 + }, + { + "epoch": 0.06486, + "grad_norm": 0.8421941035958429, + "learning_rate": 0.003, + "loss": 4.139, + "step": 6486 + }, + { + "epoch": 0.06487, + "grad_norm": 0.8130967059681043, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 6487 + }, + { + "epoch": 0.06488, + "grad_norm": 0.8190321300487033, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 6488 + }, + { + "epoch": 0.06489, + "grad_norm": 0.8618794228209637, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 6489 + }, + { + "epoch": 0.0649, + "grad_norm": 0.8167067584812114, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 6490 + }, + { + "epoch": 0.06491, + "grad_norm": 0.7120159141814162, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6491 + }, + { + "epoch": 0.06492, + "grad_norm": 0.6966415188190852, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 6492 + }, + { + "epoch": 0.06493, + "grad_norm": 0.6279768739419431, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 6493 + }, + { + "epoch": 0.06494, + "grad_norm": 0.6729099375336287, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 6494 + }, + { + "epoch": 0.06495, + "grad_norm": 0.7138117731723219, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 6495 + }, + { + "epoch": 0.06496, + "grad_norm": 0.7481765813018704, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 6496 + }, + { + "epoch": 0.06497, + "grad_norm": 0.6676385968633429, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 6497 + }, + { + "epoch": 0.06498, + "grad_norm": 0.5497380735072145, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 6498 + }, + { + "epoch": 0.06499, + "grad_norm": 0.5123283023400911, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6499 + }, + { + "epoch": 0.065, + "grad_norm": 0.4909211309507681, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 6500 + }, + { + "epoch": 0.06501, + "grad_norm": 0.47186804780732317, + "learning_rate": 0.003, + "loss": 4.1, + "step": 6501 + }, + { + "epoch": 0.06502, + "grad_norm": 0.4586207578169618, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 6502 + }, + { + "epoch": 0.06503, + "grad_norm": 0.4734551401435721, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 6503 + }, + { + "epoch": 0.06504, + "grad_norm": 0.4518516829518515, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 6504 + }, + { + "epoch": 0.06505, + "grad_norm": 0.3926267599626706, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 6505 + }, + { + "epoch": 0.06506, + "grad_norm": 0.4116861037565272, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 6506 + }, + { + "epoch": 0.06507, + "grad_norm": 0.4869551083875778, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 6507 + }, + { + "epoch": 0.06508, + "grad_norm": 0.595601444957384, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 6508 + }, + { + "epoch": 0.06509, + "grad_norm": 0.8176861477648554, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 6509 + }, + { + "epoch": 0.0651, + "grad_norm": 0.9793486633870401, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 6510 + }, + { + "epoch": 0.06511, + "grad_norm": 1.101260179163563, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6511 + }, + { + "epoch": 0.06512, + "grad_norm": 0.7048089149615372, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 6512 + }, + { + "epoch": 0.06513, + "grad_norm": 0.6606285760107887, + "learning_rate": 0.003, + "loss": 4.118, + "step": 6513 + }, + { + "epoch": 0.06514, + "grad_norm": 0.6936176900884794, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 6514 + }, + { + "epoch": 0.06515, + "grad_norm": 0.5802739054552889, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 6515 + }, + { + "epoch": 0.06516, + "grad_norm": 0.5957435080730257, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 6516 + }, + { + "epoch": 0.06517, + "grad_norm": 0.6026894638436581, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 6517 + }, + { + "epoch": 0.06518, + "grad_norm": 0.6442432050579253, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6518 + }, + { + "epoch": 0.06519, + "grad_norm": 0.6958415784014198, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 6519 + }, + { + "epoch": 0.0652, + "grad_norm": 0.687200073329854, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 6520 + }, + { + "epoch": 0.06521, + "grad_norm": 0.6695110160765739, + "learning_rate": 0.003, + "loss": 4.126, + "step": 6521 + }, + { + "epoch": 0.06522, + "grad_norm": 0.6730631239357413, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 6522 + }, + { + "epoch": 0.06523, + "grad_norm": 0.6680040550931838, + "learning_rate": 0.003, + "loss": 4.116, + "step": 6523 + }, + { + "epoch": 0.06524, + "grad_norm": 0.7000701390808372, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 6524 + }, + { + "epoch": 0.06525, + "grad_norm": 0.6591874414899291, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6525 + }, + { + "epoch": 0.06526, + "grad_norm": 0.5974142684472702, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 6526 + }, + { + "epoch": 0.06527, + "grad_norm": 0.629624470137727, + "learning_rate": 0.003, + "loss": 4.123, + "step": 6527 + }, + { + "epoch": 0.06528, + "grad_norm": 0.7082086868906479, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 6528 + }, + { + "epoch": 0.06529, + "grad_norm": 0.7547993486095116, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 6529 + }, + { + "epoch": 0.0653, + "grad_norm": 0.7160694526225951, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 6530 + }, + { + "epoch": 0.06531, + "grad_norm": 0.7091031625495451, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 6531 + }, + { + "epoch": 0.06532, + "grad_norm": 0.5960779659612676, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6532 + }, + { + "epoch": 0.06533, + "grad_norm": 0.569584737632218, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 6533 + }, + { + "epoch": 0.06534, + "grad_norm": 0.47053846099049507, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 6534 + }, + { + "epoch": 0.06535, + "grad_norm": 0.4876438724766613, + "learning_rate": 0.003, + "loss": 4.074, + "step": 6535 + }, + { + "epoch": 0.06536, + "grad_norm": 0.5421075581742479, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 6536 + }, + { + "epoch": 0.06537, + "grad_norm": 0.6138808767175822, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 6537 + }, + { + "epoch": 0.06538, + "grad_norm": 0.681850513199098, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6538 + }, + { + "epoch": 0.06539, + "grad_norm": 0.7093467740008924, + "learning_rate": 0.003, + "loss": 4.089, + "step": 6539 + }, + { + "epoch": 0.0654, + "grad_norm": 0.6796947934262606, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 6540 + }, + { + "epoch": 0.06541, + "grad_norm": 0.5895536662879667, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 6541 + }, + { + "epoch": 0.06542, + "grad_norm": 0.5887252594422324, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 6542 + }, + { + "epoch": 0.06543, + "grad_norm": 0.6070666236587158, + "learning_rate": 0.003, + "loss": 4.119, + "step": 6543 + }, + { + "epoch": 0.06544, + "grad_norm": 0.6336458361165094, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 6544 + }, + { + "epoch": 0.06545, + "grad_norm": 0.5994933552644387, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 6545 + }, + { + "epoch": 0.06546, + "grad_norm": 0.5861867238542918, + "learning_rate": 0.003, + "loss": 4.099, + "step": 6546 + }, + { + "epoch": 0.06547, + "grad_norm": 0.7675266494691649, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 6547 + }, + { + "epoch": 0.06548, + "grad_norm": 0.7584810341137693, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 6548 + }, + { + "epoch": 0.06549, + "grad_norm": 0.7244422065216164, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 6549 + }, + { + "epoch": 0.0655, + "grad_norm": 0.7864205468003803, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 6550 + }, + { + "epoch": 0.06551, + "grad_norm": 0.7433546549614126, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 6551 + }, + { + "epoch": 0.06552, + "grad_norm": 0.8758299277222058, + "learning_rate": 0.003, + "loss": 4.142, + "step": 6552 + }, + { + "epoch": 0.06553, + "grad_norm": 0.981060258172319, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 6553 + }, + { + "epoch": 0.06554, + "grad_norm": 1.0836670305325147, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 6554 + }, + { + "epoch": 0.06555, + "grad_norm": 0.893552519172031, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 6555 + }, + { + "epoch": 0.06556, + "grad_norm": 0.8133887797335478, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 6556 + }, + { + "epoch": 0.06557, + "grad_norm": 0.8038068218784429, + "learning_rate": 0.003, + "loss": 4.125, + "step": 6557 + }, + { + "epoch": 0.06558, + "grad_norm": 0.8563243567849327, + "learning_rate": 0.003, + "loss": 4.1577, + "step": 6558 + }, + { + "epoch": 0.06559, + "grad_norm": 0.8357600944003225, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 6559 + }, + { + "epoch": 0.0656, + "grad_norm": 0.7471745091465914, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 6560 + }, + { + "epoch": 0.06561, + "grad_norm": 0.7197252510341869, + "learning_rate": 0.003, + "loss": 4.1523, + "step": 6561 + }, + { + "epoch": 0.06562, + "grad_norm": 0.6961222717520075, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 6562 + }, + { + "epoch": 0.06563, + "grad_norm": 0.7969065869116986, + "learning_rate": 0.003, + "loss": 4.129, + "step": 6563 + }, + { + "epoch": 0.06564, + "grad_norm": 0.8609617020319454, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 6564 + }, + { + "epoch": 0.06565, + "grad_norm": 0.7706054033095626, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 6565 + }, + { + "epoch": 0.06566, + "grad_norm": 0.736160028183247, + "learning_rate": 0.003, + "loss": 4.1315, + "step": 6566 + }, + { + "epoch": 0.06567, + "grad_norm": 0.8827507523317322, + "learning_rate": 0.003, + "loss": 4.1502, + "step": 6567 + }, + { + "epoch": 0.06568, + "grad_norm": 0.9730014527594969, + "learning_rate": 0.003, + "loss": 4.1614, + "step": 6568 + }, + { + "epoch": 0.06569, + "grad_norm": 0.8987588032009837, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 6569 + }, + { + "epoch": 0.0657, + "grad_norm": 0.7480401403974306, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6570 + }, + { + "epoch": 0.06571, + "grad_norm": 0.5934268593799419, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 6571 + }, + { + "epoch": 0.06572, + "grad_norm": 0.6585081494040603, + "learning_rate": 0.003, + "loss": 4.099, + "step": 6572 + }, + { + "epoch": 0.06573, + "grad_norm": 0.7295365583733733, + "learning_rate": 0.003, + "loss": 4.13, + "step": 6573 + }, + { + "epoch": 0.06574, + "grad_norm": 0.756954183791856, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 6574 + }, + { + "epoch": 0.06575, + "grad_norm": 0.7099875197144789, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 6575 + }, + { + "epoch": 0.06576, + "grad_norm": 0.6986413932325001, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 6576 + }, + { + "epoch": 0.06577, + "grad_norm": 0.6408480221530382, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 6577 + }, + { + "epoch": 0.06578, + "grad_norm": 0.5861819343962567, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 6578 + }, + { + "epoch": 0.06579, + "grad_norm": 0.5772678504283916, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 6579 + }, + { + "epoch": 0.0658, + "grad_norm": 0.4907489111870778, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 6580 + }, + { + "epoch": 0.06581, + "grad_norm": 0.4803454909907143, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6581 + }, + { + "epoch": 0.06582, + "grad_norm": 0.5180246165607337, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6582 + }, + { + "epoch": 0.06583, + "grad_norm": 0.63192504153945, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 6583 + }, + { + "epoch": 0.06584, + "grad_norm": 0.7212896215769731, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 6584 + }, + { + "epoch": 0.06585, + "grad_norm": 0.7725745929887623, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6585 + }, + { + "epoch": 0.06586, + "grad_norm": 0.7150680524436269, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 6586 + }, + { + "epoch": 0.06587, + "grad_norm": 0.6454511286444826, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 6587 + }, + { + "epoch": 0.06588, + "grad_norm": 0.6209496060701972, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 6588 + }, + { + "epoch": 0.06589, + "grad_norm": 0.643641834535616, + "learning_rate": 0.003, + "loss": 4.134, + "step": 6589 + }, + { + "epoch": 0.0659, + "grad_norm": 0.6545001554853087, + "learning_rate": 0.003, + "loss": 4.1411, + "step": 6590 + }, + { + "epoch": 0.06591, + "grad_norm": 0.6756223183950393, + "learning_rate": 0.003, + "loss": 4.162, + "step": 6591 + }, + { + "epoch": 0.06592, + "grad_norm": 0.6681680196479507, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 6592 + }, + { + "epoch": 0.06593, + "grad_norm": 0.6370540979582453, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 6593 + }, + { + "epoch": 0.06594, + "grad_norm": 0.6592450103168598, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 6594 + }, + { + "epoch": 0.06595, + "grad_norm": 0.6488902280727541, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 6595 + }, + { + "epoch": 0.06596, + "grad_norm": 0.6078864404662637, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 6596 + }, + { + "epoch": 0.06597, + "grad_norm": 0.5826732446122348, + "learning_rate": 0.003, + "loss": 4.105, + "step": 6597 + }, + { + "epoch": 0.06598, + "grad_norm": 0.5786447854885438, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 6598 + }, + { + "epoch": 0.06599, + "grad_norm": 0.5203150176294418, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 6599 + }, + { + "epoch": 0.066, + "grad_norm": 0.5194458831005944, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 6600 + }, + { + "epoch": 0.06601, + "grad_norm": 0.5125280130771507, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 6601 + }, + { + "epoch": 0.06602, + "grad_norm": 0.5255374937732523, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 6602 + }, + { + "epoch": 0.06603, + "grad_norm": 0.5756998423978795, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 6603 + }, + { + "epoch": 0.06604, + "grad_norm": 0.6322066603487145, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 6604 + }, + { + "epoch": 0.06605, + "grad_norm": 0.6466004843165204, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 6605 + }, + { + "epoch": 0.06606, + "grad_norm": 0.7167353080028168, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 6606 + }, + { + "epoch": 0.06607, + "grad_norm": 0.8469413810873132, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 6607 + }, + { + "epoch": 0.06608, + "grad_norm": 0.7995923933103859, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 6608 + }, + { + "epoch": 0.06609, + "grad_norm": 0.681750060363617, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6609 + }, + { + "epoch": 0.0661, + "grad_norm": 0.6480858792190892, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 6610 + }, + { + "epoch": 0.06611, + "grad_norm": 0.658286589419674, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 6611 + }, + { + "epoch": 0.06612, + "grad_norm": 0.6793208367131326, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 6612 + }, + { + "epoch": 0.06613, + "grad_norm": 0.6454677330588496, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 6613 + }, + { + "epoch": 0.06614, + "grad_norm": 0.6649584285769843, + "learning_rate": 0.003, + "loss": 4.1145, + "step": 6614 + }, + { + "epoch": 0.06615, + "grad_norm": 0.62805829678068, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 6615 + }, + { + "epoch": 0.06616, + "grad_norm": 0.5438812377408372, + "learning_rate": 0.003, + "loss": 4.1439, + "step": 6616 + }, + { + "epoch": 0.06617, + "grad_norm": 0.5527092065688209, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 6617 + }, + { + "epoch": 0.06618, + "grad_norm": 0.6122283269442648, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 6618 + }, + { + "epoch": 0.06619, + "grad_norm": 0.742890868164231, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6619 + }, + { + "epoch": 0.0662, + "grad_norm": 0.7283542861441105, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 6620 + }, + { + "epoch": 0.06621, + "grad_norm": 0.5718678667246628, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 6621 + }, + { + "epoch": 0.06622, + "grad_norm": 0.5903274463487566, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 6622 + }, + { + "epoch": 0.06623, + "grad_norm": 0.6512429289105603, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 6623 + }, + { + "epoch": 0.06624, + "grad_norm": 0.6598637861946588, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 6624 + }, + { + "epoch": 0.06625, + "grad_norm": 0.6856917785450871, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6625 + }, + { + "epoch": 0.06626, + "grad_norm": 0.6780772216393701, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 6626 + }, + { + "epoch": 0.06627, + "grad_norm": 0.6760700441536392, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 6627 + }, + { + "epoch": 0.06628, + "grad_norm": 0.6093236296424669, + "learning_rate": 0.003, + "loss": 4.122, + "step": 6628 + }, + { + "epoch": 0.06629, + "grad_norm": 0.54950173826257, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 6629 + }, + { + "epoch": 0.0663, + "grad_norm": 0.5194281298880598, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 6630 + }, + { + "epoch": 0.06631, + "grad_norm": 0.4311499243556024, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 6631 + }, + { + "epoch": 0.06632, + "grad_norm": 0.4475581432730069, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 6632 + }, + { + "epoch": 0.06633, + "grad_norm": 0.5189111671509972, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6633 + }, + { + "epoch": 0.06634, + "grad_norm": 0.6506213384335201, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 6634 + }, + { + "epoch": 0.06635, + "grad_norm": 0.951821788755601, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 6635 + }, + { + "epoch": 0.06636, + "grad_norm": 1.1753961378034505, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 6636 + }, + { + "epoch": 0.06637, + "grad_norm": 0.8274131958669491, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 6637 + }, + { + "epoch": 0.06638, + "grad_norm": 0.9087736611088357, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6638 + }, + { + "epoch": 0.06639, + "grad_norm": 0.8594515515830997, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 6639 + }, + { + "epoch": 0.0664, + "grad_norm": 0.7981389214332112, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 6640 + }, + { + "epoch": 0.06641, + "grad_norm": 0.7337294828850577, + "learning_rate": 0.003, + "loss": 4.1325, + "step": 6641 + }, + { + "epoch": 0.06642, + "grad_norm": 0.6533577770906497, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 6642 + }, + { + "epoch": 0.06643, + "grad_norm": 0.7837625761612944, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 6643 + }, + { + "epoch": 0.06644, + "grad_norm": 0.7304936419918611, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 6644 + }, + { + "epoch": 0.06645, + "grad_norm": 0.7132250314025796, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 6645 + }, + { + "epoch": 0.06646, + "grad_norm": 0.7488926225230269, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 6646 + }, + { + "epoch": 0.06647, + "grad_norm": 0.793819493883866, + "learning_rate": 0.003, + "loss": 4.1447, + "step": 6647 + }, + { + "epoch": 0.06648, + "grad_norm": 0.8562701883893814, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 6648 + }, + { + "epoch": 0.06649, + "grad_norm": 0.8550813838279994, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 6649 + }, + { + "epoch": 0.0665, + "grad_norm": 0.921797160027487, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 6650 + }, + { + "epoch": 0.06651, + "grad_norm": 0.8818469027010972, + "learning_rate": 0.003, + "loss": 4.099, + "step": 6651 + }, + { + "epoch": 0.06652, + "grad_norm": 0.8146093740190103, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 6652 + }, + { + "epoch": 0.06653, + "grad_norm": 0.7944236365667351, + "learning_rate": 0.003, + "loss": 4.1526, + "step": 6653 + }, + { + "epoch": 0.06654, + "grad_norm": 0.8268460125317942, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 6654 + }, + { + "epoch": 0.06655, + "grad_norm": 0.8093935411338872, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 6655 + }, + { + "epoch": 0.06656, + "grad_norm": 0.7794404155644764, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 6656 + }, + { + "epoch": 0.06657, + "grad_norm": 0.867714800249613, + "learning_rate": 0.003, + "loss": 4.1413, + "step": 6657 + }, + { + "epoch": 0.06658, + "grad_norm": 0.9142624375781937, + "learning_rate": 0.003, + "loss": 4.1351, + "step": 6658 + }, + { + "epoch": 0.06659, + "grad_norm": 0.9521474355283531, + "learning_rate": 0.003, + "loss": 4.1507, + "step": 6659 + }, + { + "epoch": 0.0666, + "grad_norm": 0.9916432590288042, + "learning_rate": 0.003, + "loss": 4.1409, + "step": 6660 + }, + { + "epoch": 0.06661, + "grad_norm": 0.7955314202862873, + "learning_rate": 0.003, + "loss": 4.1292, + "step": 6661 + }, + { + "epoch": 0.06662, + "grad_norm": 0.610721250942562, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 6662 + }, + { + "epoch": 0.06663, + "grad_norm": 0.6122676305957936, + "learning_rate": 0.003, + "loss": 4.182, + "step": 6663 + }, + { + "epoch": 0.06664, + "grad_norm": 0.5620102202705014, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 6664 + }, + { + "epoch": 0.06665, + "grad_norm": 0.5333334036557247, + "learning_rate": 0.003, + "loss": 4.1608, + "step": 6665 + }, + { + "epoch": 0.06666, + "grad_norm": 0.4779105794437428, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 6666 + }, + { + "epoch": 0.06667, + "grad_norm": 0.4623720974757374, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 6667 + }, + { + "epoch": 0.06668, + "grad_norm": 0.43970305974094764, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 6668 + }, + { + "epoch": 0.06669, + "grad_norm": 0.4262049724870593, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 6669 + }, + { + "epoch": 0.0667, + "grad_norm": 0.43473058147660226, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 6670 + }, + { + "epoch": 0.06671, + "grad_norm": 0.4811373905144156, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 6671 + }, + { + "epoch": 0.06672, + "grad_norm": 0.48294732369579685, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 6672 + }, + { + "epoch": 0.06673, + "grad_norm": 0.43487438428595204, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6673 + }, + { + "epoch": 0.06674, + "grad_norm": 0.42810600796813436, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 6674 + }, + { + "epoch": 0.06675, + "grad_norm": 0.4187127469658615, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 6675 + }, + { + "epoch": 0.06676, + "grad_norm": 0.40648645104736636, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 6676 + }, + { + "epoch": 0.06677, + "grad_norm": 0.45239784588879955, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6677 + }, + { + "epoch": 0.06678, + "grad_norm": 0.507095938002501, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 6678 + }, + { + "epoch": 0.06679, + "grad_norm": 0.5982462726379839, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 6679 + }, + { + "epoch": 0.0668, + "grad_norm": 0.7085661257950622, + "learning_rate": 0.003, + "loss": 4.1394, + "step": 6680 + }, + { + "epoch": 0.06681, + "grad_norm": 0.7670644647417465, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 6681 + }, + { + "epoch": 0.06682, + "grad_norm": 0.666267524204535, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 6682 + }, + { + "epoch": 0.06683, + "grad_norm": 0.5601369401979518, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 6683 + }, + { + "epoch": 0.06684, + "grad_norm": 0.6101758101844771, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 6684 + }, + { + "epoch": 0.06685, + "grad_norm": 0.7821140255267783, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 6685 + }, + { + "epoch": 0.06686, + "grad_norm": 0.870444885006984, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 6686 + }, + { + "epoch": 0.06687, + "grad_norm": 0.885507452709397, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 6687 + }, + { + "epoch": 0.06688, + "grad_norm": 0.7868701754781342, + "learning_rate": 0.003, + "loss": 4.104, + "step": 6688 + }, + { + "epoch": 0.06689, + "grad_norm": 0.8462898511590071, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 6689 + }, + { + "epoch": 0.0669, + "grad_norm": 0.7964646938445537, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 6690 + }, + { + "epoch": 0.06691, + "grad_norm": 0.7632100671298492, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 6691 + }, + { + "epoch": 0.06692, + "grad_norm": 0.7875561157456419, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 6692 + }, + { + "epoch": 0.06693, + "grad_norm": 0.8456951569948422, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 6693 + }, + { + "epoch": 0.06694, + "grad_norm": 0.8930574483409586, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 6694 + }, + { + "epoch": 0.06695, + "grad_norm": 0.7514288949015006, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 6695 + }, + { + "epoch": 0.06696, + "grad_norm": 0.7702907982422175, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 6696 + }, + { + "epoch": 0.06697, + "grad_norm": 0.736140406934018, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 6697 + }, + { + "epoch": 0.06698, + "grad_norm": 0.7509814571017203, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 6698 + }, + { + "epoch": 0.06699, + "grad_norm": 0.6631263407471134, + "learning_rate": 0.003, + "loss": 4.115, + "step": 6699 + }, + { + "epoch": 0.067, + "grad_norm": 0.5871001542490745, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 6700 + }, + { + "epoch": 0.06701, + "grad_norm": 0.5435734122195582, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6701 + }, + { + "epoch": 0.06702, + "grad_norm": 0.5252277352718284, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 6702 + }, + { + "epoch": 0.06703, + "grad_norm": 0.6019201583785915, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 6703 + }, + { + "epoch": 0.06704, + "grad_norm": 0.6629860196845838, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 6704 + }, + { + "epoch": 0.06705, + "grad_norm": 0.7076318403084291, + "learning_rate": 0.003, + "loss": 4.108, + "step": 6705 + }, + { + "epoch": 0.06706, + "grad_norm": 0.7718630424234669, + "learning_rate": 0.003, + "loss": 4.1562, + "step": 6706 + }, + { + "epoch": 0.06707, + "grad_norm": 0.735500368867454, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 6707 + }, + { + "epoch": 0.06708, + "grad_norm": 0.6126034855042024, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 6708 + }, + { + "epoch": 0.06709, + "grad_norm": 0.5356173594265632, + "learning_rate": 0.003, + "loss": 4.1, + "step": 6709 + }, + { + "epoch": 0.0671, + "grad_norm": 0.4923039482946903, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 6710 + }, + { + "epoch": 0.06711, + "grad_norm": 0.496437675576904, + "learning_rate": 0.003, + "loss": 4.1316, + "step": 6711 + }, + { + "epoch": 0.06712, + "grad_norm": 0.6267076354025424, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 6712 + }, + { + "epoch": 0.06713, + "grad_norm": 0.810419464520027, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 6713 + }, + { + "epoch": 0.06714, + "grad_norm": 1.0033614640972555, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 6714 + }, + { + "epoch": 0.06715, + "grad_norm": 1.034862211372409, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6715 + }, + { + "epoch": 0.06716, + "grad_norm": 0.7891873870335114, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 6716 + }, + { + "epoch": 0.06717, + "grad_norm": 0.7583795710672536, + "learning_rate": 0.003, + "loss": 4.1501, + "step": 6717 + }, + { + "epoch": 0.06718, + "grad_norm": 0.7115916620356093, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 6718 + }, + { + "epoch": 0.06719, + "grad_norm": 0.6809781091009515, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 6719 + }, + { + "epoch": 0.0672, + "grad_norm": 0.7352768877036369, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 6720 + }, + { + "epoch": 0.06721, + "grad_norm": 0.6315118398036524, + "learning_rate": 0.003, + "loss": 4.135, + "step": 6721 + }, + { + "epoch": 0.06722, + "grad_norm": 0.6185480236265406, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 6722 + }, + { + "epoch": 0.06723, + "grad_norm": 0.5845982684430056, + "learning_rate": 0.003, + "loss": 4.1285, + "step": 6723 + }, + { + "epoch": 0.06724, + "grad_norm": 0.49159392452345974, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 6724 + }, + { + "epoch": 0.06725, + "grad_norm": 0.4513358433552748, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 6725 + }, + { + "epoch": 0.06726, + "grad_norm": 0.4366793957137453, + "learning_rate": 0.003, + "loss": 4.1429, + "step": 6726 + }, + { + "epoch": 0.06727, + "grad_norm": 0.45039616788114, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 6727 + }, + { + "epoch": 0.06728, + "grad_norm": 0.42960667974723327, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 6728 + }, + { + "epoch": 0.06729, + "grad_norm": 0.3936495109654878, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 6729 + }, + { + "epoch": 0.0673, + "grad_norm": 0.4004584193437624, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 6730 + }, + { + "epoch": 0.06731, + "grad_norm": 0.43331873995398884, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 6731 + }, + { + "epoch": 0.06732, + "grad_norm": 0.5033918683059013, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 6732 + }, + { + "epoch": 0.06733, + "grad_norm": 0.5679439839709435, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 6733 + }, + { + "epoch": 0.06734, + "grad_norm": 0.6741835879416314, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 6734 + }, + { + "epoch": 0.06735, + "grad_norm": 0.7951013050676844, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 6735 + }, + { + "epoch": 0.06736, + "grad_norm": 0.8142625336131883, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 6736 + }, + { + "epoch": 0.06737, + "grad_norm": 0.6923418050164777, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 6737 + }, + { + "epoch": 0.06738, + "grad_norm": 0.6417437188150639, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 6738 + }, + { + "epoch": 0.06739, + "grad_norm": 0.6666846912447963, + "learning_rate": 0.003, + "loss": 4.102, + "step": 6739 + }, + { + "epoch": 0.0674, + "grad_norm": 0.7113245183023263, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 6740 + }, + { + "epoch": 0.06741, + "grad_norm": 0.696453096464101, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 6741 + }, + { + "epoch": 0.06742, + "grad_norm": 0.6475796676339262, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 6742 + }, + { + "epoch": 0.06743, + "grad_norm": 0.6335412186431436, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 6743 + }, + { + "epoch": 0.06744, + "grad_norm": 0.5563393921556343, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6744 + }, + { + "epoch": 0.06745, + "grad_norm": 0.5419204393939647, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 6745 + }, + { + "epoch": 0.06746, + "grad_norm": 0.6275067206076368, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 6746 + }, + { + "epoch": 0.06747, + "grad_norm": 0.6946085858510286, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 6747 + }, + { + "epoch": 0.06748, + "grad_norm": 0.7589007060444024, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 6748 + }, + { + "epoch": 0.06749, + "grad_norm": 0.8157554526066001, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 6749 + }, + { + "epoch": 0.0675, + "grad_norm": 0.7942374978397424, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 6750 + }, + { + "epoch": 0.06751, + "grad_norm": 0.8980523962537803, + "learning_rate": 0.003, + "loss": 4.1399, + "step": 6751 + }, + { + "epoch": 0.06752, + "grad_norm": 1.0269645356154302, + "learning_rate": 0.003, + "loss": 4.1455, + "step": 6752 + }, + { + "epoch": 0.06753, + "grad_norm": 0.9257798372553527, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 6753 + }, + { + "epoch": 0.06754, + "grad_norm": 0.7843345022720416, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 6754 + }, + { + "epoch": 0.06755, + "grad_norm": 0.8626796100641331, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 6755 + }, + { + "epoch": 0.06756, + "grad_norm": 0.813278917534869, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 6756 + }, + { + "epoch": 0.06757, + "grad_norm": 0.7512724737472484, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6757 + }, + { + "epoch": 0.06758, + "grad_norm": 0.6469087254992117, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 6758 + }, + { + "epoch": 0.06759, + "grad_norm": 0.6436740764232052, + "learning_rate": 0.003, + "loss": 4.148, + "step": 6759 + }, + { + "epoch": 0.0676, + "grad_norm": 0.6943244361400559, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 6760 + }, + { + "epoch": 0.06761, + "grad_norm": 0.6830986749187081, + "learning_rate": 0.003, + "loss": 4.112, + "step": 6761 + }, + { + "epoch": 0.06762, + "grad_norm": 0.6781853474637783, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 6762 + }, + { + "epoch": 0.06763, + "grad_norm": 0.6506611393135219, + "learning_rate": 0.003, + "loss": 4.115, + "step": 6763 + }, + { + "epoch": 0.06764, + "grad_norm": 0.7027611096936427, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 6764 + }, + { + "epoch": 0.06765, + "grad_norm": 0.6155749614693148, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 6765 + }, + { + "epoch": 0.06766, + "grad_norm": 0.6035208063328635, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 6766 + }, + { + "epoch": 0.06767, + "grad_norm": 0.5420944911927922, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 6767 + }, + { + "epoch": 0.06768, + "grad_norm": 0.6137703731544314, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 6768 + }, + { + "epoch": 0.06769, + "grad_norm": 0.7672588192354372, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 6769 + }, + { + "epoch": 0.0677, + "grad_norm": 0.8517644016624661, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 6770 + }, + { + "epoch": 0.06771, + "grad_norm": 0.8225325274897667, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 6771 + }, + { + "epoch": 0.06772, + "grad_norm": 0.8285914075732632, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 6772 + }, + { + "epoch": 0.06773, + "grad_norm": 0.7479732791491223, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 6773 + }, + { + "epoch": 0.06774, + "grad_norm": 0.7939299101707836, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 6774 + }, + { + "epoch": 0.06775, + "grad_norm": 0.8020629017874903, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6775 + }, + { + "epoch": 0.06776, + "grad_norm": 0.7932490440340019, + "learning_rate": 0.003, + "loss": 4.1527, + "step": 6776 + }, + { + "epoch": 0.06777, + "grad_norm": 0.794180803890291, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 6777 + }, + { + "epoch": 0.06778, + "grad_norm": 0.7991189828810096, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 6778 + }, + { + "epoch": 0.06779, + "grad_norm": 0.818786154364808, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 6779 + }, + { + "epoch": 0.0678, + "grad_norm": 0.712453918318275, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 6780 + }, + { + "epoch": 0.06781, + "grad_norm": 0.6542310526187651, + "learning_rate": 0.003, + "loss": 4.1549, + "step": 6781 + }, + { + "epoch": 0.06782, + "grad_norm": 0.6334297545638243, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 6782 + }, + { + "epoch": 0.06783, + "grad_norm": 0.6222166600018698, + "learning_rate": 0.003, + "loss": 4.136, + "step": 6783 + }, + { + "epoch": 0.06784, + "grad_norm": 0.6173988646215278, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 6784 + }, + { + "epoch": 0.06785, + "grad_norm": 0.6089756191279135, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 6785 + }, + { + "epoch": 0.06786, + "grad_norm": 0.5533280430635569, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 6786 + }, + { + "epoch": 0.06787, + "grad_norm": 0.5466977622748588, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 6787 + }, + { + "epoch": 0.06788, + "grad_norm": 0.5709118469585935, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 6788 + }, + { + "epoch": 0.06789, + "grad_norm": 0.43615212913657647, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 6789 + }, + { + "epoch": 0.0679, + "grad_norm": 0.4445553735594529, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 6790 + }, + { + "epoch": 0.06791, + "grad_norm": 0.4404711279990138, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 6791 + }, + { + "epoch": 0.06792, + "grad_norm": 0.46477638175762537, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6792 + }, + { + "epoch": 0.06793, + "grad_norm": 0.48722297592983815, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 6793 + }, + { + "epoch": 0.06794, + "grad_norm": 0.4979919688575191, + "learning_rate": 0.003, + "loss": 4.1275, + "step": 6794 + }, + { + "epoch": 0.06795, + "grad_norm": 0.49093393128752905, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 6795 + }, + { + "epoch": 0.06796, + "grad_norm": 0.5892149599358569, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 6796 + }, + { + "epoch": 0.06797, + "grad_norm": 0.7982288521309417, + "learning_rate": 0.003, + "loss": 4.134, + "step": 6797 + }, + { + "epoch": 0.06798, + "grad_norm": 0.9112336477160669, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 6798 + }, + { + "epoch": 0.06799, + "grad_norm": 0.8816688090009441, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 6799 + }, + { + "epoch": 0.068, + "grad_norm": 0.8279620352251622, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6800 + }, + { + "epoch": 0.06801, + "grad_norm": 0.8613030262720641, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 6801 + }, + { + "epoch": 0.06802, + "grad_norm": 0.9654181970654568, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6802 + }, + { + "epoch": 0.06803, + "grad_norm": 0.7831222168974614, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6803 + }, + { + "epoch": 0.06804, + "grad_norm": 0.7859386559785015, + "learning_rate": 0.003, + "loss": 4.137, + "step": 6804 + }, + { + "epoch": 0.06805, + "grad_norm": 0.7680250323768286, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 6805 + }, + { + "epoch": 0.06806, + "grad_norm": 0.7789471170116108, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 6806 + }, + { + "epoch": 0.06807, + "grad_norm": 0.6389939368418367, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 6807 + }, + { + "epoch": 0.06808, + "grad_norm": 0.708633773967604, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 6808 + }, + { + "epoch": 0.06809, + "grad_norm": 0.7180318843790542, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 6809 + }, + { + "epoch": 0.0681, + "grad_norm": 0.7301766381754865, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 6810 + }, + { + "epoch": 0.06811, + "grad_norm": 0.7471263469142446, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 6811 + }, + { + "epoch": 0.06812, + "grad_norm": 0.7142209108867097, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 6812 + }, + { + "epoch": 0.06813, + "grad_norm": 0.6372769592641551, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 6813 + }, + { + "epoch": 0.06814, + "grad_norm": 0.6379955586156076, + "learning_rate": 0.003, + "loss": 4.1221, + "step": 6814 + }, + { + "epoch": 0.06815, + "grad_norm": 0.6038210649447708, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 6815 + }, + { + "epoch": 0.06816, + "grad_norm": 0.5751849767984581, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 6816 + }, + { + "epoch": 0.06817, + "grad_norm": 0.5359776355709539, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 6817 + }, + { + "epoch": 0.06818, + "grad_norm": 0.5146004463623628, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 6818 + }, + { + "epoch": 0.06819, + "grad_norm": 0.5178577295105196, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 6819 + }, + { + "epoch": 0.0682, + "grad_norm": 0.5036407285943472, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 6820 + }, + { + "epoch": 0.06821, + "grad_norm": 0.5531066591245212, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 6821 + }, + { + "epoch": 0.06822, + "grad_norm": 0.6470177042281421, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 6822 + }, + { + "epoch": 0.06823, + "grad_norm": 0.6539637525535354, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 6823 + }, + { + "epoch": 0.06824, + "grad_norm": 0.7493448106054319, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 6824 + }, + { + "epoch": 0.06825, + "grad_norm": 0.7477095595533654, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 6825 + }, + { + "epoch": 0.06826, + "grad_norm": 0.616205641208411, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 6826 + }, + { + "epoch": 0.06827, + "grad_norm": 0.5343803407507347, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 6827 + }, + { + "epoch": 0.06828, + "grad_norm": 0.5516727566441831, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 6828 + }, + { + "epoch": 0.06829, + "grad_norm": 0.5678645003238811, + "learning_rate": 0.003, + "loss": 4.114, + "step": 6829 + }, + { + "epoch": 0.0683, + "grad_norm": 0.5736945535959624, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 6830 + }, + { + "epoch": 0.06831, + "grad_norm": 0.5766186753999193, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 6831 + }, + { + "epoch": 0.06832, + "grad_norm": 0.5412637917693169, + "learning_rate": 0.003, + "loss": 4.094, + "step": 6832 + }, + { + "epoch": 0.06833, + "grad_norm": 0.5354756199374949, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 6833 + }, + { + "epoch": 0.06834, + "grad_norm": 0.55758586869952, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 6834 + }, + { + "epoch": 0.06835, + "grad_norm": 0.5851752358292377, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 6835 + }, + { + "epoch": 0.06836, + "grad_norm": 0.5357594576872605, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 6836 + }, + { + "epoch": 0.06837, + "grad_norm": 0.5493992586473802, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 6837 + }, + { + "epoch": 0.06838, + "grad_norm": 0.6237990486728586, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 6838 + }, + { + "epoch": 0.06839, + "grad_norm": 0.9418401714608854, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 6839 + }, + { + "epoch": 0.0684, + "grad_norm": 1.3064642474920947, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 6840 + }, + { + "epoch": 0.06841, + "grad_norm": 0.6597660684920605, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 6841 + }, + { + "epoch": 0.06842, + "grad_norm": 0.6489661413957194, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 6842 + }, + { + "epoch": 0.06843, + "grad_norm": 0.6773408231334266, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 6843 + }, + { + "epoch": 0.06844, + "grad_norm": 0.6589160490897324, + "learning_rate": 0.003, + "loss": 4.106, + "step": 6844 + }, + { + "epoch": 0.06845, + "grad_norm": 0.5891507948339562, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 6845 + }, + { + "epoch": 0.06846, + "grad_norm": 0.5999658896363759, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 6846 + }, + { + "epoch": 0.06847, + "grad_norm": 0.5887881579441098, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 6847 + }, + { + "epoch": 0.06848, + "grad_norm": 0.5649925957595652, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 6848 + }, + { + "epoch": 0.06849, + "grad_norm": 0.5705197551609986, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 6849 + }, + { + "epoch": 0.0685, + "grad_norm": 0.5952753055541125, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 6850 + }, + { + "epoch": 0.06851, + "grad_norm": 0.6213756923220035, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 6851 + }, + { + "epoch": 0.06852, + "grad_norm": 0.6907364246047624, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 6852 + }, + { + "epoch": 0.06853, + "grad_norm": 0.6969902001664416, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 6853 + }, + { + "epoch": 0.06854, + "grad_norm": 0.662173631889165, + "learning_rate": 0.003, + "loss": 4.1239, + "step": 6854 + }, + { + "epoch": 0.06855, + "grad_norm": 0.7232071834064421, + "learning_rate": 0.003, + "loss": 4.1377, + "step": 6855 + }, + { + "epoch": 0.06856, + "grad_norm": 0.7279714519404217, + "learning_rate": 0.003, + "loss": 4.1325, + "step": 6856 + }, + { + "epoch": 0.06857, + "grad_norm": 0.7721813680083663, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 6857 + }, + { + "epoch": 0.06858, + "grad_norm": 0.7425927331139939, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 6858 + }, + { + "epoch": 0.06859, + "grad_norm": 0.6721982817507133, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 6859 + }, + { + "epoch": 0.0686, + "grad_norm": 0.5774934823673541, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 6860 + }, + { + "epoch": 0.06861, + "grad_norm": 0.6138735149136986, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 6861 + }, + { + "epoch": 0.06862, + "grad_norm": 0.6955887372108659, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 6862 + }, + { + "epoch": 0.06863, + "grad_norm": 0.7889995228865956, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 6863 + }, + { + "epoch": 0.06864, + "grad_norm": 0.8319268473846568, + "learning_rate": 0.003, + "loss": 4.1189, + "step": 6864 + }, + { + "epoch": 0.06865, + "grad_norm": 0.7332088452187622, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 6865 + }, + { + "epoch": 0.06866, + "grad_norm": 0.7569264658828823, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 6866 + }, + { + "epoch": 0.06867, + "grad_norm": 0.7482157424829708, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 6867 + }, + { + "epoch": 0.06868, + "grad_norm": 0.8067432575353655, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 6868 + }, + { + "epoch": 0.06869, + "grad_norm": 0.795724395615385, + "learning_rate": 0.003, + "loss": 4.127, + "step": 6869 + }, + { + "epoch": 0.0687, + "grad_norm": 0.7990167361942389, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 6870 + }, + { + "epoch": 0.06871, + "grad_norm": 0.751687574887691, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 6871 + }, + { + "epoch": 0.06872, + "grad_norm": 0.7941670622571882, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 6872 + }, + { + "epoch": 0.06873, + "grad_norm": 0.818073654767055, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 6873 + }, + { + "epoch": 0.06874, + "grad_norm": 0.8810084565313016, + "learning_rate": 0.003, + "loss": 4.13, + "step": 6874 + }, + { + "epoch": 0.06875, + "grad_norm": 0.9712435694596839, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 6875 + }, + { + "epoch": 0.06876, + "grad_norm": 1.1759386697440901, + "learning_rate": 0.003, + "loss": 4.1556, + "step": 6876 + }, + { + "epoch": 0.06877, + "grad_norm": 0.8914085065829237, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6877 + }, + { + "epoch": 0.06878, + "grad_norm": 0.6941362711827244, + "learning_rate": 0.003, + "loss": 4.102, + "step": 6878 + }, + { + "epoch": 0.06879, + "grad_norm": 0.6928015779389758, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 6879 + }, + { + "epoch": 0.0688, + "grad_norm": 0.738782255552787, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 6880 + }, + { + "epoch": 0.06881, + "grad_norm": 0.6910853720919746, + "learning_rate": 0.003, + "loss": 4.112, + "step": 6881 + }, + { + "epoch": 0.06882, + "grad_norm": 0.7254038162214872, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 6882 + }, + { + "epoch": 0.06883, + "grad_norm": 0.8207902946108598, + "learning_rate": 0.003, + "loss": 4.1434, + "step": 6883 + }, + { + "epoch": 0.06884, + "grad_norm": 0.7716553726102112, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 6884 + }, + { + "epoch": 0.06885, + "grad_norm": 0.6958687421650827, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 6885 + }, + { + "epoch": 0.06886, + "grad_norm": 0.5821526100113974, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 6886 + }, + { + "epoch": 0.06887, + "grad_norm": 0.5981430384947571, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 6887 + }, + { + "epoch": 0.06888, + "grad_norm": 0.643772790056028, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 6888 + }, + { + "epoch": 0.06889, + "grad_norm": 0.6944406405800507, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 6889 + }, + { + "epoch": 0.0689, + "grad_norm": 0.6385881084418807, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 6890 + }, + { + "epoch": 0.06891, + "grad_norm": 0.6086144164229688, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 6891 + }, + { + "epoch": 0.06892, + "grad_norm": 0.6012480896266729, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 6892 + }, + { + "epoch": 0.06893, + "grad_norm": 0.561071573044732, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 6893 + }, + { + "epoch": 0.06894, + "grad_norm": 0.48860461588730103, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 6894 + }, + { + "epoch": 0.06895, + "grad_norm": 0.5246990023618513, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 6895 + }, + { + "epoch": 0.06896, + "grad_norm": 0.5887964150643885, + "learning_rate": 0.003, + "loss": 4.088, + "step": 6896 + }, + { + "epoch": 0.06897, + "grad_norm": 0.5241278786378765, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 6897 + }, + { + "epoch": 0.06898, + "grad_norm": 0.5243694688992354, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 6898 + }, + { + "epoch": 0.06899, + "grad_norm": 0.5729571128070601, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 6899 + }, + { + "epoch": 0.069, + "grad_norm": 0.5895390705006303, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6900 + }, + { + "epoch": 0.06901, + "grad_norm": 0.6952354448422888, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 6901 + }, + { + "epoch": 0.06902, + "grad_norm": 0.6648257134889515, + "learning_rate": 0.003, + "loss": 4.086, + "step": 6902 + }, + { + "epoch": 0.06903, + "grad_norm": 0.6781063401244306, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 6903 + }, + { + "epoch": 0.06904, + "grad_norm": 0.6750907763884256, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 6904 + }, + { + "epoch": 0.06905, + "grad_norm": 0.701961664699957, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 6905 + }, + { + "epoch": 0.06906, + "grad_norm": 0.6469621356429142, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 6906 + }, + { + "epoch": 0.06907, + "grad_norm": 0.5765290038257871, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 6907 + }, + { + "epoch": 0.06908, + "grad_norm": 0.6197899283242572, + "learning_rate": 0.003, + "loss": 4.075, + "step": 6908 + }, + { + "epoch": 0.06909, + "grad_norm": 0.6933557799843862, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 6909 + }, + { + "epoch": 0.0691, + "grad_norm": 0.7562290219169062, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 6910 + }, + { + "epoch": 0.06911, + "grad_norm": 0.7734263332293689, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 6911 + }, + { + "epoch": 0.06912, + "grad_norm": 0.8992983547552829, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 6912 + }, + { + "epoch": 0.06913, + "grad_norm": 1.0208555338096066, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 6913 + }, + { + "epoch": 0.06914, + "grad_norm": 0.9933731989818703, + "learning_rate": 0.003, + "loss": 4.1472, + "step": 6914 + }, + { + "epoch": 0.06915, + "grad_norm": 1.142744843357704, + "learning_rate": 0.003, + "loss": 4.1688, + "step": 6915 + }, + { + "epoch": 0.06916, + "grad_norm": 0.671739611249756, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 6916 + }, + { + "epoch": 0.06917, + "grad_norm": 0.7866141531687029, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 6917 + }, + { + "epoch": 0.06918, + "grad_norm": 0.9611567239474412, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 6918 + }, + { + "epoch": 0.06919, + "grad_norm": 0.9336705733194112, + "learning_rate": 0.003, + "loss": 4.1458, + "step": 6919 + }, + { + "epoch": 0.0692, + "grad_norm": 0.8550272945447894, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 6920 + }, + { + "epoch": 0.06921, + "grad_norm": 0.8255769935938679, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 6921 + }, + { + "epoch": 0.06922, + "grad_norm": 0.7887735703255148, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 6922 + }, + { + "epoch": 0.06923, + "grad_norm": 0.7586591684231498, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 6923 + }, + { + "epoch": 0.06924, + "grad_norm": 0.833512973215651, + "learning_rate": 0.003, + "loss": 4.1555, + "step": 6924 + }, + { + "epoch": 0.06925, + "grad_norm": 0.8413623073932563, + "learning_rate": 0.003, + "loss": 4.1537, + "step": 6925 + }, + { + "epoch": 0.06926, + "grad_norm": 0.7024551972837352, + "learning_rate": 0.003, + "loss": 4.1405, + "step": 6926 + }, + { + "epoch": 0.06927, + "grad_norm": 0.5829899478703182, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 6927 + }, + { + "epoch": 0.06928, + "grad_norm": 0.5794264624289892, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 6928 + }, + { + "epoch": 0.06929, + "grad_norm": 0.5549181816114577, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 6929 + }, + { + "epoch": 0.0693, + "grad_norm": 0.5430801494911696, + "learning_rate": 0.003, + "loss": 4.1566, + "step": 6930 + }, + { + "epoch": 0.06931, + "grad_norm": 0.47027424052399663, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 6931 + }, + { + "epoch": 0.06932, + "grad_norm": 0.4173009048479236, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 6932 + }, + { + "epoch": 0.06933, + "grad_norm": 0.40150485804104163, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 6933 + }, + { + "epoch": 0.06934, + "grad_norm": 0.3862168071563836, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 6934 + }, + { + "epoch": 0.06935, + "grad_norm": 0.446771899060793, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 6935 + }, + { + "epoch": 0.06936, + "grad_norm": 0.5004501018374794, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 6936 + }, + { + "epoch": 0.06937, + "grad_norm": 0.5488151795943027, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 6937 + }, + { + "epoch": 0.06938, + "grad_norm": 0.5770212398497512, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 6938 + }, + { + "epoch": 0.06939, + "grad_norm": 0.558653999723117, + "learning_rate": 0.003, + "loss": 4.104, + "step": 6939 + }, + { + "epoch": 0.0694, + "grad_norm": 0.5466069373357318, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 6940 + }, + { + "epoch": 0.06941, + "grad_norm": 0.49853177456508263, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 6941 + }, + { + "epoch": 0.06942, + "grad_norm": 0.506244292367137, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 6942 + }, + { + "epoch": 0.06943, + "grad_norm": 0.49638439608831303, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 6943 + }, + { + "epoch": 0.06944, + "grad_norm": 0.5489964428142321, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 6944 + }, + { + "epoch": 0.06945, + "grad_norm": 0.7004086562916291, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 6945 + }, + { + "epoch": 0.06946, + "grad_norm": 0.8618988339111132, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 6946 + }, + { + "epoch": 0.06947, + "grad_norm": 0.8898968651463349, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 6947 + }, + { + "epoch": 0.06948, + "grad_norm": 0.8891675654883304, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 6948 + }, + { + "epoch": 0.06949, + "grad_norm": 0.8415925076980938, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6949 + }, + { + "epoch": 0.0695, + "grad_norm": 0.8358314389372723, + "learning_rate": 0.003, + "loss": 4.106, + "step": 6950 + }, + { + "epoch": 0.06951, + "grad_norm": 0.8991779236158414, + "learning_rate": 0.003, + "loss": 4.1444, + "step": 6951 + }, + { + "epoch": 0.06952, + "grad_norm": 0.9143145994663924, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 6952 + }, + { + "epoch": 0.06953, + "grad_norm": 0.8582198172305813, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 6953 + }, + { + "epoch": 0.06954, + "grad_norm": 0.8532675006476708, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 6954 + }, + { + "epoch": 0.06955, + "grad_norm": 0.8361880964205014, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 6955 + }, + { + "epoch": 0.06956, + "grad_norm": 0.7160992886820927, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 6956 + }, + { + "epoch": 0.06957, + "grad_norm": 0.6572004040022867, + "learning_rate": 0.003, + "loss": 4.112, + "step": 6957 + }, + { + "epoch": 0.06958, + "grad_norm": 0.6687661358163699, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 6958 + }, + { + "epoch": 0.06959, + "grad_norm": 0.6817550869048531, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 6959 + }, + { + "epoch": 0.0696, + "grad_norm": 0.7535821582695273, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 6960 + }, + { + "epoch": 0.06961, + "grad_norm": 0.8825707617170178, + "learning_rate": 0.003, + "loss": 4.138, + "step": 6961 + }, + { + "epoch": 0.06962, + "grad_norm": 0.8710338520388163, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 6962 + }, + { + "epoch": 0.06963, + "grad_norm": 0.816650176387816, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 6963 + }, + { + "epoch": 0.06964, + "grad_norm": 0.7260316746191819, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 6964 + }, + { + "epoch": 0.06965, + "grad_norm": 0.6648041497655378, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 6965 + }, + { + "epoch": 0.06966, + "grad_norm": 0.6784510161564018, + "learning_rate": 0.003, + "loss": 4.131, + "step": 6966 + }, + { + "epoch": 0.06967, + "grad_norm": 0.6452607728019111, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 6967 + }, + { + "epoch": 0.06968, + "grad_norm": 0.6001265248194674, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 6968 + }, + { + "epoch": 0.06969, + "grad_norm": 0.5934642265206468, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 6969 + }, + { + "epoch": 0.0697, + "grad_norm": 0.5831037045271329, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 6970 + }, + { + "epoch": 0.06971, + "grad_norm": 0.6199075375261113, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 6971 + }, + { + "epoch": 0.06972, + "grad_norm": 0.6436556062187045, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 6972 + }, + { + "epoch": 0.06973, + "grad_norm": 0.5645264329417377, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 6973 + }, + { + "epoch": 0.06974, + "grad_norm": 0.6421089807938115, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 6974 + }, + { + "epoch": 0.06975, + "grad_norm": 0.7540210531000154, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 6975 + }, + { + "epoch": 0.06976, + "grad_norm": 0.7638332625980192, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 6976 + }, + { + "epoch": 0.06977, + "grad_norm": 0.8335885386349529, + "learning_rate": 0.003, + "loss": 4.1551, + "step": 6977 + }, + { + "epoch": 0.06978, + "grad_norm": 0.8006226145225701, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 6978 + }, + { + "epoch": 0.06979, + "grad_norm": 0.7348797862910301, + "learning_rate": 0.003, + "loss": 4.1325, + "step": 6979 + }, + { + "epoch": 0.0698, + "grad_norm": 0.8010250875166668, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 6980 + }, + { + "epoch": 0.06981, + "grad_norm": 0.7496972705067594, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 6981 + }, + { + "epoch": 0.06982, + "grad_norm": 0.7126647429947338, + "learning_rate": 0.003, + "loss": 4.105, + "step": 6982 + }, + { + "epoch": 0.06983, + "grad_norm": 0.676360984622482, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 6983 + }, + { + "epoch": 0.06984, + "grad_norm": 0.6167843107245039, + "learning_rate": 0.003, + "loss": 4.092, + "step": 6984 + }, + { + "epoch": 0.06985, + "grad_norm": 0.5226662640709087, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 6985 + }, + { + "epoch": 0.06986, + "grad_norm": 0.4870800595121638, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 6986 + }, + { + "epoch": 0.06987, + "grad_norm": 0.4732107972316138, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 6987 + }, + { + "epoch": 0.06988, + "grad_norm": 0.5060749910774011, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 6988 + }, + { + "epoch": 0.06989, + "grad_norm": 0.6140987263407802, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 6989 + }, + { + "epoch": 0.0699, + "grad_norm": 0.6629504376964789, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 6990 + }, + { + "epoch": 0.06991, + "grad_norm": 0.7586325106957013, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 6991 + }, + { + "epoch": 0.06992, + "grad_norm": 0.6309648702561338, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 6992 + }, + { + "epoch": 0.06993, + "grad_norm": 0.5315699899335162, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 6993 + }, + { + "epoch": 0.06994, + "grad_norm": 0.48772801302005414, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 6994 + }, + { + "epoch": 0.06995, + "grad_norm": 0.5131657989899862, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 6995 + }, + { + "epoch": 0.06996, + "grad_norm": 0.5515157780044487, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 6996 + }, + { + "epoch": 0.06997, + "grad_norm": 0.5525209598406478, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 6997 + }, + { + "epoch": 0.06998, + "grad_norm": 0.479008437370693, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 6998 + }, + { + "epoch": 0.06999, + "grad_norm": 0.4505882666020481, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 6999 + }, + { + "epoch": 0.07, + "grad_norm": 0.520401586803475, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 7000 + }, + { + "epoch": 0.07001, + "grad_norm": 0.5963833247005839, + "learning_rate": 0.003, + "loss": 4.1418, + "step": 7001 + }, + { + "epoch": 0.07002, + "grad_norm": 0.7816965535130395, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 7002 + }, + { + "epoch": 0.07003, + "grad_norm": 0.8103989079782393, + "learning_rate": 0.003, + "loss": 4.09, + "step": 7003 + }, + { + "epoch": 0.07004, + "grad_norm": 0.7881315831755371, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 7004 + }, + { + "epoch": 0.07005, + "grad_norm": 0.8308231793794756, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 7005 + }, + { + "epoch": 0.07006, + "grad_norm": 0.8109007968000713, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 7006 + }, + { + "epoch": 0.07007, + "grad_norm": 0.6344467493229883, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7007 + }, + { + "epoch": 0.07008, + "grad_norm": 0.6334215629519069, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 7008 + }, + { + "epoch": 0.07009, + "grad_norm": 0.6472001020552638, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 7009 + }, + { + "epoch": 0.0701, + "grad_norm": 0.6243314055174884, + "learning_rate": 0.003, + "loss": 4.09, + "step": 7010 + }, + { + "epoch": 0.07011, + "grad_norm": 0.6932842423649617, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 7011 + }, + { + "epoch": 0.07012, + "grad_norm": 0.8521696913542735, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 7012 + }, + { + "epoch": 0.07013, + "grad_norm": 1.0813751987049736, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 7013 + }, + { + "epoch": 0.07014, + "grad_norm": 0.8685320890891782, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 7014 + }, + { + "epoch": 0.07015, + "grad_norm": 0.714165389488949, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 7015 + }, + { + "epoch": 0.07016, + "grad_norm": 0.6240822477720798, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7016 + }, + { + "epoch": 0.07017, + "grad_norm": 0.6300786532998714, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7017 + }, + { + "epoch": 0.07018, + "grad_norm": 0.6076108348038608, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7018 + }, + { + "epoch": 0.07019, + "grad_norm": 0.6908646612154652, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 7019 + }, + { + "epoch": 0.0702, + "grad_norm": 0.6690787244309745, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 7020 + }, + { + "epoch": 0.07021, + "grad_norm": 0.6214099299934482, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7021 + }, + { + "epoch": 0.07022, + "grad_norm": 0.5406773132983848, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 7022 + }, + { + "epoch": 0.07023, + "grad_norm": 0.5564313613572602, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 7023 + }, + { + "epoch": 0.07024, + "grad_norm": 0.565839780513049, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 7024 + }, + { + "epoch": 0.07025, + "grad_norm": 0.6209168576950462, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 7025 + }, + { + "epoch": 0.07026, + "grad_norm": 0.6750589310489371, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 7026 + }, + { + "epoch": 0.07027, + "grad_norm": 0.683143198082187, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 7027 + }, + { + "epoch": 0.07028, + "grad_norm": 0.6337869888733688, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 7028 + }, + { + "epoch": 0.07029, + "grad_norm": 0.6399163304203913, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 7029 + }, + { + "epoch": 0.0703, + "grad_norm": 0.6748721551610196, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 7030 + }, + { + "epoch": 0.07031, + "grad_norm": 0.7321679893744895, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 7031 + }, + { + "epoch": 0.07032, + "grad_norm": 0.8299726001491252, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 7032 + }, + { + "epoch": 0.07033, + "grad_norm": 0.9060063098737187, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 7033 + }, + { + "epoch": 0.07034, + "grad_norm": 0.9522354378897846, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 7034 + }, + { + "epoch": 0.07035, + "grad_norm": 0.972269290200196, + "learning_rate": 0.003, + "loss": 4.128, + "step": 7035 + }, + { + "epoch": 0.07036, + "grad_norm": 0.924567971293714, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 7036 + }, + { + "epoch": 0.07037, + "grad_norm": 0.8651360365392121, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 7037 + }, + { + "epoch": 0.07038, + "grad_norm": 0.8060967376427206, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 7038 + }, + { + "epoch": 0.07039, + "grad_norm": 0.7677552044419784, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 7039 + }, + { + "epoch": 0.0704, + "grad_norm": 0.7934246766415343, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 7040 + }, + { + "epoch": 0.07041, + "grad_norm": 0.7702031563514626, + "learning_rate": 0.003, + "loss": 4.116, + "step": 7041 + }, + { + "epoch": 0.07042, + "grad_norm": 0.8294171357357263, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 7042 + }, + { + "epoch": 0.07043, + "grad_norm": 0.8699931370635664, + "learning_rate": 0.003, + "loss": 4.1423, + "step": 7043 + }, + { + "epoch": 0.07044, + "grad_norm": 0.8662992135149028, + "learning_rate": 0.003, + "loss": 4.1445, + "step": 7044 + }, + { + "epoch": 0.07045, + "grad_norm": 0.8556015774986996, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 7045 + }, + { + "epoch": 0.07046, + "grad_norm": 0.8195747843665967, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 7046 + }, + { + "epoch": 0.07047, + "grad_norm": 0.7213877816923308, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 7047 + }, + { + "epoch": 0.07048, + "grad_norm": 0.6149100905732247, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 7048 + }, + { + "epoch": 0.07049, + "grad_norm": 0.6732224266829641, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 7049 + }, + { + "epoch": 0.0705, + "grad_norm": 0.7014634218429036, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 7050 + }, + { + "epoch": 0.07051, + "grad_norm": 0.6522225830743739, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 7051 + }, + { + "epoch": 0.07052, + "grad_norm": 0.6500823083801979, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 7052 + }, + { + "epoch": 0.07053, + "grad_norm": 0.6368059007709821, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7053 + }, + { + "epoch": 0.07054, + "grad_norm": 0.6937824763862059, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 7054 + }, + { + "epoch": 0.07055, + "grad_norm": 0.6678203074975709, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 7055 + }, + { + "epoch": 0.07056, + "grad_norm": 0.5999332348304974, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 7056 + }, + { + "epoch": 0.07057, + "grad_norm": 0.6207684817645435, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 7057 + }, + { + "epoch": 0.07058, + "grad_norm": 0.6334942997590186, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 7058 + }, + { + "epoch": 0.07059, + "grad_norm": 0.6248499550694564, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 7059 + }, + { + "epoch": 0.0706, + "grad_norm": 0.6690971751004321, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 7060 + }, + { + "epoch": 0.07061, + "grad_norm": 0.6681006456986236, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 7061 + }, + { + "epoch": 0.07062, + "grad_norm": 0.5873099188423571, + "learning_rate": 0.003, + "loss": 4.133, + "step": 7062 + }, + { + "epoch": 0.07063, + "grad_norm": 0.5660315602781396, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 7063 + }, + { + "epoch": 0.07064, + "grad_norm": 0.5354212695010325, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 7064 + }, + { + "epoch": 0.07065, + "grad_norm": 0.5541650096697861, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 7065 + }, + { + "epoch": 0.07066, + "grad_norm": 0.5737529936227163, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 7066 + }, + { + "epoch": 0.07067, + "grad_norm": 0.6443507476053769, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 7067 + }, + { + "epoch": 0.07068, + "grad_norm": 0.7079051560187819, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 7068 + }, + { + "epoch": 0.07069, + "grad_norm": 0.5681815389061051, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 7069 + }, + { + "epoch": 0.0707, + "grad_norm": 0.5054781241764719, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7070 + }, + { + "epoch": 0.07071, + "grad_norm": 0.5363029539486839, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 7071 + }, + { + "epoch": 0.07072, + "grad_norm": 0.5662713105012626, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 7072 + }, + { + "epoch": 0.07073, + "grad_norm": 0.5886362212149604, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 7073 + }, + { + "epoch": 0.07074, + "grad_norm": 0.5383952438154733, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 7074 + }, + { + "epoch": 0.07075, + "grad_norm": 0.49108475293469883, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 7075 + }, + { + "epoch": 0.07076, + "grad_norm": 0.46684864123512826, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 7076 + }, + { + "epoch": 0.07077, + "grad_norm": 0.5465447445092506, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 7077 + }, + { + "epoch": 0.07078, + "grad_norm": 0.6015521724166992, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 7078 + }, + { + "epoch": 0.07079, + "grad_norm": 0.85890424119227, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 7079 + }, + { + "epoch": 0.0708, + "grad_norm": 1.282741314578649, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 7080 + }, + { + "epoch": 0.07081, + "grad_norm": 0.6908289270870185, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 7081 + }, + { + "epoch": 0.07082, + "grad_norm": 0.6179688734599202, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 7082 + }, + { + "epoch": 0.07083, + "grad_norm": 0.7995600194026101, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 7083 + }, + { + "epoch": 0.07084, + "grad_norm": 0.8857457001103319, + "learning_rate": 0.003, + "loss": 4.113, + "step": 7084 + }, + { + "epoch": 0.07085, + "grad_norm": 0.9143259065826967, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 7085 + }, + { + "epoch": 0.07086, + "grad_norm": 0.9810070731706124, + "learning_rate": 0.003, + "loss": 4.12, + "step": 7086 + }, + { + "epoch": 0.07087, + "grad_norm": 0.9123121727972462, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 7087 + }, + { + "epoch": 0.07088, + "grad_norm": 0.8624587235237147, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 7088 + }, + { + "epoch": 0.07089, + "grad_norm": 0.7897069970288599, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7089 + }, + { + "epoch": 0.0709, + "grad_norm": 0.6952949093057026, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7090 + }, + { + "epoch": 0.07091, + "grad_norm": 0.6864230472842601, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7091 + }, + { + "epoch": 0.07092, + "grad_norm": 0.6649365850217969, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 7092 + }, + { + "epoch": 0.07093, + "grad_norm": 0.6841019927195824, + "learning_rate": 0.003, + "loss": 4.1305, + "step": 7093 + }, + { + "epoch": 0.07094, + "grad_norm": 0.6127976340559059, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 7094 + }, + { + "epoch": 0.07095, + "grad_norm": 0.616886834283065, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 7095 + }, + { + "epoch": 0.07096, + "grad_norm": 0.6971966421797507, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 7096 + }, + { + "epoch": 0.07097, + "grad_norm": 0.6710301285847851, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 7097 + }, + { + "epoch": 0.07098, + "grad_norm": 0.61562136549296, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 7098 + }, + { + "epoch": 0.07099, + "grad_norm": 0.6419148521913456, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 7099 + }, + { + "epoch": 0.071, + "grad_norm": 0.6323659270253572, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 7100 + }, + { + "epoch": 0.07101, + "grad_norm": 0.6790975940780517, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 7101 + }, + { + "epoch": 0.07102, + "grad_norm": 0.7012073427223774, + "learning_rate": 0.003, + "loss": 4.112, + "step": 7102 + }, + { + "epoch": 0.07103, + "grad_norm": 0.8170376783966959, + "learning_rate": 0.003, + "loss": 4.1282, + "step": 7103 + }, + { + "epoch": 0.07104, + "grad_norm": 0.9864196954449765, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 7104 + }, + { + "epoch": 0.07105, + "grad_norm": 0.9605522303404178, + "learning_rate": 0.003, + "loss": 4.123, + "step": 7105 + }, + { + "epoch": 0.07106, + "grad_norm": 0.8886566459127305, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 7106 + }, + { + "epoch": 0.07107, + "grad_norm": 0.8973256822058434, + "learning_rate": 0.003, + "loss": 4.1364, + "step": 7107 + }, + { + "epoch": 0.07108, + "grad_norm": 0.8957355256957436, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 7108 + }, + { + "epoch": 0.07109, + "grad_norm": 0.9603444798838409, + "learning_rate": 0.003, + "loss": 4.1331, + "step": 7109 + }, + { + "epoch": 0.0711, + "grad_norm": 1.071506569773605, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 7110 + }, + { + "epoch": 0.07111, + "grad_norm": 0.8800294655537059, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 7111 + }, + { + "epoch": 0.07112, + "grad_norm": 0.651480864700506, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 7112 + }, + { + "epoch": 0.07113, + "grad_norm": 0.5845950738429475, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 7113 + }, + { + "epoch": 0.07114, + "grad_norm": 0.5822849601468679, + "learning_rate": 0.003, + "loss": 4.1284, + "step": 7114 + }, + { + "epoch": 0.07115, + "grad_norm": 0.5474586592685721, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 7115 + }, + { + "epoch": 0.07116, + "grad_norm": 0.5187520875086112, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 7116 + }, + { + "epoch": 0.07117, + "grad_norm": 0.4394900872650167, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 7117 + }, + { + "epoch": 0.07118, + "grad_norm": 0.3463005557935632, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 7118 + }, + { + "epoch": 0.07119, + "grad_norm": 0.3752207233130492, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 7119 + }, + { + "epoch": 0.0712, + "grad_norm": 0.4024093721623634, + "learning_rate": 0.003, + "loss": 4.102, + "step": 7120 + }, + { + "epoch": 0.07121, + "grad_norm": 0.4320668079849204, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 7121 + }, + { + "epoch": 0.07122, + "grad_norm": 0.43814596025832847, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 7122 + }, + { + "epoch": 0.07123, + "grad_norm": 0.4660611414965408, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 7123 + }, + { + "epoch": 0.07124, + "grad_norm": 0.4926215620935902, + "learning_rate": 0.003, + "loss": 4.083, + "step": 7124 + }, + { + "epoch": 0.07125, + "grad_norm": 0.4996717729986805, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 7125 + }, + { + "epoch": 0.07126, + "grad_norm": 0.5425463804440391, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 7126 + }, + { + "epoch": 0.07127, + "grad_norm": 0.6028489984656705, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 7127 + }, + { + "epoch": 0.07128, + "grad_norm": 0.6414630429602017, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 7128 + }, + { + "epoch": 0.07129, + "grad_norm": 0.6306384667736967, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 7129 + }, + { + "epoch": 0.0713, + "grad_norm": 0.6762556869839564, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 7130 + }, + { + "epoch": 0.07131, + "grad_norm": 0.782955807754172, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 7131 + }, + { + "epoch": 0.07132, + "grad_norm": 0.8150492320482005, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 7132 + }, + { + "epoch": 0.07133, + "grad_norm": 0.7371939952383894, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 7133 + }, + { + "epoch": 0.07134, + "grad_norm": 0.7676831192441333, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 7134 + }, + { + "epoch": 0.07135, + "grad_norm": 0.7690673723760045, + "learning_rate": 0.003, + "loss": 4.11, + "step": 7135 + }, + { + "epoch": 0.07136, + "grad_norm": 0.7766203756298646, + "learning_rate": 0.003, + "loss": 4.1474, + "step": 7136 + }, + { + "epoch": 0.07137, + "grad_norm": 0.807723913043927, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 7137 + }, + { + "epoch": 0.07138, + "grad_norm": 0.7120609852111736, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 7138 + }, + { + "epoch": 0.07139, + "grad_norm": 0.6409244184377964, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 7139 + }, + { + "epoch": 0.0714, + "grad_norm": 0.6983765449622829, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 7140 + }, + { + "epoch": 0.07141, + "grad_norm": 0.6749975641693673, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 7141 + }, + { + "epoch": 0.07142, + "grad_norm": 0.7443875679801981, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 7142 + }, + { + "epoch": 0.07143, + "grad_norm": 0.8036162553734473, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 7143 + }, + { + "epoch": 0.07144, + "grad_norm": 0.8238780582102404, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 7144 + }, + { + "epoch": 0.07145, + "grad_norm": 0.7818657301386684, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 7145 + }, + { + "epoch": 0.07146, + "grad_norm": 0.589772103883361, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 7146 + }, + { + "epoch": 0.07147, + "grad_norm": 0.5061896394583955, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 7147 + }, + { + "epoch": 0.07148, + "grad_norm": 0.5563222484565196, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 7148 + }, + { + "epoch": 0.07149, + "grad_norm": 0.6306367666731154, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 7149 + }, + { + "epoch": 0.0715, + "grad_norm": 0.6045730081269641, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7150 + }, + { + "epoch": 0.07151, + "grad_norm": 0.5783897582471499, + "learning_rate": 0.003, + "loss": 4.101, + "step": 7151 + }, + { + "epoch": 0.07152, + "grad_norm": 0.5932667743048016, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 7152 + }, + { + "epoch": 0.07153, + "grad_norm": 0.60473248923499, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 7153 + }, + { + "epoch": 0.07154, + "grad_norm": 0.6236732709390373, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 7154 + }, + { + "epoch": 0.07155, + "grad_norm": 0.7143317986692795, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 7155 + }, + { + "epoch": 0.07156, + "grad_norm": 0.806274009051711, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 7156 + }, + { + "epoch": 0.07157, + "grad_norm": 0.8013068941242575, + "learning_rate": 0.003, + "loss": 4.105, + "step": 7157 + }, + { + "epoch": 0.07158, + "grad_norm": 0.8062400294331641, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 7158 + }, + { + "epoch": 0.07159, + "grad_norm": 0.901901589441292, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 7159 + }, + { + "epoch": 0.0716, + "grad_norm": 0.9668482710989087, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 7160 + }, + { + "epoch": 0.07161, + "grad_norm": 0.905245384275865, + "learning_rate": 0.003, + "loss": 4.1265, + "step": 7161 + }, + { + "epoch": 0.07162, + "grad_norm": 0.9280503905267976, + "learning_rate": 0.003, + "loss": 4.1358, + "step": 7162 + }, + { + "epoch": 0.07163, + "grad_norm": 0.9016130134973356, + "learning_rate": 0.003, + "loss": 4.126, + "step": 7163 + }, + { + "epoch": 0.07164, + "grad_norm": 0.8238069946322247, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 7164 + }, + { + "epoch": 0.07165, + "grad_norm": 0.9229684257907796, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 7165 + }, + { + "epoch": 0.07166, + "grad_norm": 0.8602566555422861, + "learning_rate": 0.003, + "loss": 4.121, + "step": 7166 + }, + { + "epoch": 0.07167, + "grad_norm": 0.7912994410141391, + "learning_rate": 0.003, + "loss": 4.1337, + "step": 7167 + }, + { + "epoch": 0.07168, + "grad_norm": 0.6645028530506353, + "learning_rate": 0.003, + "loss": 4.1462, + "step": 7168 + }, + { + "epoch": 0.07169, + "grad_norm": 0.6619833308914435, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7169 + }, + { + "epoch": 0.0717, + "grad_norm": 0.6040314764385558, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 7170 + }, + { + "epoch": 0.07171, + "grad_norm": 0.6833108775226145, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 7171 + }, + { + "epoch": 0.07172, + "grad_norm": 0.7725987913634927, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 7172 + }, + { + "epoch": 0.07173, + "grad_norm": 0.7147302013430032, + "learning_rate": 0.003, + "loss": 4.1332, + "step": 7173 + }, + { + "epoch": 0.07174, + "grad_norm": 0.7180021296498525, + "learning_rate": 0.003, + "loss": 4.102, + "step": 7174 + }, + { + "epoch": 0.07175, + "grad_norm": 0.7457543221263961, + "learning_rate": 0.003, + "loss": 4.1433, + "step": 7175 + }, + { + "epoch": 0.07176, + "grad_norm": 0.6924904976395436, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7176 + }, + { + "epoch": 0.07177, + "grad_norm": 0.5304650992546365, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 7177 + }, + { + "epoch": 0.07178, + "grad_norm": 0.5008427987279399, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 7178 + }, + { + "epoch": 0.07179, + "grad_norm": 0.4679265582031314, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 7179 + }, + { + "epoch": 0.0718, + "grad_norm": 0.43745452895304754, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 7180 + }, + { + "epoch": 0.07181, + "grad_norm": 0.45907357705620727, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 7181 + }, + { + "epoch": 0.07182, + "grad_norm": 0.49846152510871444, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 7182 + }, + { + "epoch": 0.07183, + "grad_norm": 0.5359985839562037, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 7183 + }, + { + "epoch": 0.07184, + "grad_norm": 0.6563790510844429, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 7184 + }, + { + "epoch": 0.07185, + "grad_norm": 0.6720180437894167, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 7185 + }, + { + "epoch": 0.07186, + "grad_norm": 0.5587772637850045, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7186 + }, + { + "epoch": 0.07187, + "grad_norm": 0.5401622815209526, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 7187 + }, + { + "epoch": 0.07188, + "grad_norm": 0.5467984886698021, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 7188 + }, + { + "epoch": 0.07189, + "grad_norm": 0.5726158372896095, + "learning_rate": 0.003, + "loss": 4.11, + "step": 7189 + }, + { + "epoch": 0.0719, + "grad_norm": 0.7251779521154695, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 7190 + }, + { + "epoch": 0.07191, + "grad_norm": 0.8469754288010317, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 7191 + }, + { + "epoch": 0.07192, + "grad_norm": 0.9128848705622925, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 7192 + }, + { + "epoch": 0.07193, + "grad_norm": 0.8310324783740308, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 7193 + }, + { + "epoch": 0.07194, + "grad_norm": 0.6879939761909492, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7194 + }, + { + "epoch": 0.07195, + "grad_norm": 0.5970160052475568, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 7195 + }, + { + "epoch": 0.07196, + "grad_norm": 0.5894178919524454, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 7196 + }, + { + "epoch": 0.07197, + "grad_norm": 0.5262073910189121, + "learning_rate": 0.003, + "loss": 4.114, + "step": 7197 + }, + { + "epoch": 0.07198, + "grad_norm": 0.5265600947376748, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 7198 + }, + { + "epoch": 0.07199, + "grad_norm": 0.541452819187127, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 7199 + }, + { + "epoch": 0.072, + "grad_norm": 0.6458006332096501, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 7200 + }, + { + "epoch": 0.07201, + "grad_norm": 0.8094071517689202, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 7201 + }, + { + "epoch": 0.07202, + "grad_norm": 0.7787699581874201, + "learning_rate": 0.003, + "loss": 4.1254, + "step": 7202 + }, + { + "epoch": 0.07203, + "grad_norm": 0.668461238530481, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 7203 + }, + { + "epoch": 0.07204, + "grad_norm": 0.6519472171530598, + "learning_rate": 0.003, + "loss": 4.1263, + "step": 7204 + }, + { + "epoch": 0.07205, + "grad_norm": 0.553477955345092, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 7205 + }, + { + "epoch": 0.07206, + "grad_norm": 0.576969241800161, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 7206 + }, + { + "epoch": 0.07207, + "grad_norm": 0.6130153388967977, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 7207 + }, + { + "epoch": 0.07208, + "grad_norm": 0.6120567702128528, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 7208 + }, + { + "epoch": 0.07209, + "grad_norm": 0.5925349870606897, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 7209 + }, + { + "epoch": 0.0721, + "grad_norm": 0.5460934958885093, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 7210 + }, + { + "epoch": 0.07211, + "grad_norm": 0.5617487245207603, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7211 + }, + { + "epoch": 0.07212, + "grad_norm": 0.5415277872929839, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 7212 + }, + { + "epoch": 0.07213, + "grad_norm": 0.5502607732589133, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 7213 + }, + { + "epoch": 0.07214, + "grad_norm": 0.6409863289739841, + "learning_rate": 0.003, + "loss": 4.072, + "step": 7214 + }, + { + "epoch": 0.07215, + "grad_norm": 0.7570834086211581, + "learning_rate": 0.003, + "loss": 4.104, + "step": 7215 + }, + { + "epoch": 0.07216, + "grad_norm": 0.8132567876934826, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 7216 + }, + { + "epoch": 0.07217, + "grad_norm": 0.9345387667787567, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7217 + }, + { + "epoch": 0.07218, + "grad_norm": 1.0833907300880954, + "learning_rate": 0.003, + "loss": 4.127, + "step": 7218 + }, + { + "epoch": 0.07219, + "grad_norm": 0.9260287293122886, + "learning_rate": 0.003, + "loss": 4.101, + "step": 7219 + }, + { + "epoch": 0.0722, + "grad_norm": 1.038503139941199, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 7220 + }, + { + "epoch": 0.07221, + "grad_norm": 0.9210746523660105, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 7221 + }, + { + "epoch": 0.07222, + "grad_norm": 0.9858808371441633, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 7222 + }, + { + "epoch": 0.07223, + "grad_norm": 0.9836284417759733, + "learning_rate": 0.003, + "loss": 4.1506, + "step": 7223 + }, + { + "epoch": 0.07224, + "grad_norm": 0.985172160848877, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 7224 + }, + { + "epoch": 0.07225, + "grad_norm": 1.0092262989567715, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 7225 + }, + { + "epoch": 0.07226, + "grad_norm": 1.0380410348021303, + "learning_rate": 0.003, + "loss": 4.1487, + "step": 7226 + }, + { + "epoch": 0.07227, + "grad_norm": 0.897496931177557, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 7227 + }, + { + "epoch": 0.07228, + "grad_norm": 0.8371397499199325, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 7228 + }, + { + "epoch": 0.07229, + "grad_norm": 0.68089602479955, + "learning_rate": 0.003, + "loss": 4.13, + "step": 7229 + }, + { + "epoch": 0.0723, + "grad_norm": 0.7440697649975428, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 7230 + }, + { + "epoch": 0.07231, + "grad_norm": 0.8546390577858282, + "learning_rate": 0.003, + "loss": 4.1437, + "step": 7231 + }, + { + "epoch": 0.07232, + "grad_norm": 0.9911495928687625, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 7232 + }, + { + "epoch": 0.07233, + "grad_norm": 1.0402994652801865, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 7233 + }, + { + "epoch": 0.07234, + "grad_norm": 0.7834142777445765, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 7234 + }, + { + "epoch": 0.07235, + "grad_norm": 0.79697526236287, + "learning_rate": 0.003, + "loss": 4.1635, + "step": 7235 + }, + { + "epoch": 0.07236, + "grad_norm": 0.8580397156652976, + "learning_rate": 0.003, + "loss": 4.1617, + "step": 7236 + }, + { + "epoch": 0.07237, + "grad_norm": 0.8311156048197793, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 7237 + }, + { + "epoch": 0.07238, + "grad_norm": 0.7018648916636654, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 7238 + }, + { + "epoch": 0.07239, + "grad_norm": 0.6039008355156773, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 7239 + }, + { + "epoch": 0.0724, + "grad_norm": 0.5161419422417888, + "learning_rate": 0.003, + "loss": 4.093, + "step": 7240 + }, + { + "epoch": 0.07241, + "grad_norm": 0.4874109183137781, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 7241 + }, + { + "epoch": 0.07242, + "grad_norm": 0.6217288908705515, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 7242 + }, + { + "epoch": 0.07243, + "grad_norm": 0.6488298417249205, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 7243 + }, + { + "epoch": 0.07244, + "grad_norm": 0.6010925037781651, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 7244 + }, + { + "epoch": 0.07245, + "grad_norm": 0.5324768587577942, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 7245 + }, + { + "epoch": 0.07246, + "grad_norm": 0.49764740145745306, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7246 + }, + { + "epoch": 0.07247, + "grad_norm": 0.4879501046529579, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 7247 + }, + { + "epoch": 0.07248, + "grad_norm": 0.44818383989320487, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7248 + }, + { + "epoch": 0.07249, + "grad_norm": 0.4019767982051826, + "learning_rate": 0.003, + "loss": 4.072, + "step": 7249 + }, + { + "epoch": 0.0725, + "grad_norm": 0.37656345224727716, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 7250 + }, + { + "epoch": 0.07251, + "grad_norm": 0.39765646327210397, + "learning_rate": 0.003, + "loss": 4.1, + "step": 7251 + }, + { + "epoch": 0.07252, + "grad_norm": 0.4087153991886569, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 7252 + }, + { + "epoch": 0.07253, + "grad_norm": 0.4413249795378963, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 7253 + }, + { + "epoch": 0.07254, + "grad_norm": 0.49658754339700917, + "learning_rate": 0.003, + "loss": 4.117, + "step": 7254 + }, + { + "epoch": 0.07255, + "grad_norm": 0.6065287271274902, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7255 + }, + { + "epoch": 0.07256, + "grad_norm": 0.7783935834491852, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 7256 + }, + { + "epoch": 0.07257, + "grad_norm": 0.8736962562673857, + "learning_rate": 0.003, + "loss": 4.11, + "step": 7257 + }, + { + "epoch": 0.07258, + "grad_norm": 0.7615763185801613, + "learning_rate": 0.003, + "loss": 4.076, + "step": 7258 + }, + { + "epoch": 0.07259, + "grad_norm": 0.6225198130561953, + "learning_rate": 0.003, + "loss": 4.102, + "step": 7259 + }, + { + "epoch": 0.0726, + "grad_norm": 0.6660945346455424, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 7260 + }, + { + "epoch": 0.07261, + "grad_norm": 0.6994963663146176, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 7261 + }, + { + "epoch": 0.07262, + "grad_norm": 0.7558699256531053, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7262 + }, + { + "epoch": 0.07263, + "grad_norm": 0.8389060291693147, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 7263 + }, + { + "epoch": 0.07264, + "grad_norm": 0.9199074585746712, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 7264 + }, + { + "epoch": 0.07265, + "grad_norm": 0.9046795786100689, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 7265 + }, + { + "epoch": 0.07266, + "grad_norm": 0.8984966553892594, + "learning_rate": 0.003, + "loss": 4.1462, + "step": 7266 + }, + { + "epoch": 0.07267, + "grad_norm": 0.8779492964239948, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 7267 + }, + { + "epoch": 0.07268, + "grad_norm": 0.9458771160182521, + "learning_rate": 0.003, + "loss": 4.1426, + "step": 7268 + }, + { + "epoch": 0.07269, + "grad_norm": 1.0021477216087307, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 7269 + }, + { + "epoch": 0.0727, + "grad_norm": 1.0517837764583857, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 7270 + }, + { + "epoch": 0.07271, + "grad_norm": 0.7757058905695857, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 7271 + }, + { + "epoch": 0.07272, + "grad_norm": 0.6955324357433823, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 7272 + }, + { + "epoch": 0.07273, + "grad_norm": 0.727966212131769, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 7273 + }, + { + "epoch": 0.07274, + "grad_norm": 0.6894516072772396, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 7274 + }, + { + "epoch": 0.07275, + "grad_norm": 0.6466287667559749, + "learning_rate": 0.003, + "loss": 4.104, + "step": 7275 + }, + { + "epoch": 0.07276, + "grad_norm": 0.5806437052644794, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 7276 + }, + { + "epoch": 0.07277, + "grad_norm": 0.5491496894626376, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 7277 + }, + { + "epoch": 0.07278, + "grad_norm": 0.5807535736273424, + "learning_rate": 0.003, + "loss": 4.118, + "step": 7278 + }, + { + "epoch": 0.07279, + "grad_norm": 0.5810337347879152, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 7279 + }, + { + "epoch": 0.0728, + "grad_norm": 0.593882083760101, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 7280 + }, + { + "epoch": 0.07281, + "grad_norm": 0.5145677719892531, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 7281 + }, + { + "epoch": 0.07282, + "grad_norm": 0.5310247033119996, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 7282 + }, + { + "epoch": 0.07283, + "grad_norm": 0.5752791694559306, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 7283 + }, + { + "epoch": 0.07284, + "grad_norm": 0.6120341790052407, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 7284 + }, + { + "epoch": 0.07285, + "grad_norm": 0.7608771862935181, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 7285 + }, + { + "epoch": 0.07286, + "grad_norm": 0.8881767066517879, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 7286 + }, + { + "epoch": 0.07287, + "grad_norm": 0.929008736891511, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 7287 + }, + { + "epoch": 0.07288, + "grad_norm": 0.7448189338903293, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 7288 + }, + { + "epoch": 0.07289, + "grad_norm": 0.6196893187777653, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 7289 + }, + { + "epoch": 0.0729, + "grad_norm": 0.6617627631781875, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 7290 + }, + { + "epoch": 0.07291, + "grad_norm": 0.5587104110160175, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 7291 + }, + { + "epoch": 0.07292, + "grad_norm": 0.5227848078058569, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7292 + }, + { + "epoch": 0.07293, + "grad_norm": 0.4287989659731113, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 7293 + }, + { + "epoch": 0.07294, + "grad_norm": 0.39434672478777266, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 7294 + }, + { + "epoch": 0.07295, + "grad_norm": 0.35908636579555575, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 7295 + }, + { + "epoch": 0.07296, + "grad_norm": 0.3644756637139822, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 7296 + }, + { + "epoch": 0.07297, + "grad_norm": 0.3770629664236333, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 7297 + }, + { + "epoch": 0.07298, + "grad_norm": 0.48678207336741464, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 7298 + }, + { + "epoch": 0.07299, + "grad_norm": 0.5846170384424557, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 7299 + }, + { + "epoch": 0.073, + "grad_norm": 0.6595284998416144, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 7300 + }, + { + "epoch": 0.07301, + "grad_norm": 0.6211347463265787, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 7301 + }, + { + "epoch": 0.07302, + "grad_norm": 0.48150627502231, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 7302 + }, + { + "epoch": 0.07303, + "grad_norm": 0.5489707911727371, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 7303 + }, + { + "epoch": 0.07304, + "grad_norm": 0.5974812098818115, + "learning_rate": 0.003, + "loss": 4.085, + "step": 7304 + }, + { + "epoch": 0.07305, + "grad_norm": 0.7024670978584663, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 7305 + }, + { + "epoch": 0.07306, + "grad_norm": 0.7638676834438926, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 7306 + }, + { + "epoch": 0.07307, + "grad_norm": 0.8266201952032484, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 7307 + }, + { + "epoch": 0.07308, + "grad_norm": 0.8513186722362418, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7308 + }, + { + "epoch": 0.07309, + "grad_norm": 1.0336678643068749, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7309 + }, + { + "epoch": 0.0731, + "grad_norm": 1.1463223547284502, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 7310 + }, + { + "epoch": 0.07311, + "grad_norm": 0.7757914479144218, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7311 + }, + { + "epoch": 0.07312, + "grad_norm": 0.7055116339180564, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 7312 + }, + { + "epoch": 0.07313, + "grad_norm": 0.6548456604315757, + "learning_rate": 0.003, + "loss": 4.11, + "step": 7313 + }, + { + "epoch": 0.07314, + "grad_norm": 0.7320846700471259, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 7314 + }, + { + "epoch": 0.07315, + "grad_norm": 0.8166597914868653, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 7315 + }, + { + "epoch": 0.07316, + "grad_norm": 0.7595697268852177, + "learning_rate": 0.003, + "loss": 4.1299, + "step": 7316 + }, + { + "epoch": 0.07317, + "grad_norm": 0.6198775441688376, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 7317 + }, + { + "epoch": 0.07318, + "grad_norm": 0.5723028866588152, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 7318 + }, + { + "epoch": 0.07319, + "grad_norm": 0.6057245627345597, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 7319 + }, + { + "epoch": 0.0732, + "grad_norm": 0.7239900990155307, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7320 + }, + { + "epoch": 0.07321, + "grad_norm": 0.6568845099790048, + "learning_rate": 0.003, + "loss": 4.098, + "step": 7321 + }, + { + "epoch": 0.07322, + "grad_norm": 0.5961986038574478, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 7322 + }, + { + "epoch": 0.07323, + "grad_norm": 0.6111250018556825, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 7323 + }, + { + "epoch": 0.07324, + "grad_norm": 0.6314202107843266, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 7324 + }, + { + "epoch": 0.07325, + "grad_norm": 0.5603304228911395, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 7325 + }, + { + "epoch": 0.07326, + "grad_norm": 0.6198242641884363, + "learning_rate": 0.003, + "loss": 4.112, + "step": 7326 + }, + { + "epoch": 0.07327, + "grad_norm": 0.6180461035522653, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 7327 + }, + { + "epoch": 0.07328, + "grad_norm": 0.5997168347855873, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 7328 + }, + { + "epoch": 0.07329, + "grad_norm": 0.7505933625758654, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 7329 + }, + { + "epoch": 0.0733, + "grad_norm": 0.95882955808798, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 7330 + }, + { + "epoch": 0.07331, + "grad_norm": 1.1625555883916676, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 7331 + }, + { + "epoch": 0.07332, + "grad_norm": 0.713532800458944, + "learning_rate": 0.003, + "loss": 4.118, + "step": 7332 + }, + { + "epoch": 0.07333, + "grad_norm": 0.6381195482238602, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 7333 + }, + { + "epoch": 0.07334, + "grad_norm": 0.5803769095988841, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 7334 + }, + { + "epoch": 0.07335, + "grad_norm": 0.5486263031540451, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 7335 + }, + { + "epoch": 0.07336, + "grad_norm": 0.5662029197744988, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 7336 + }, + { + "epoch": 0.07337, + "grad_norm": 0.4935245740209676, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 7337 + }, + { + "epoch": 0.07338, + "grad_norm": 0.5251534080906687, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 7338 + }, + { + "epoch": 0.07339, + "grad_norm": 0.5426405338587877, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 7339 + }, + { + "epoch": 0.0734, + "grad_norm": 0.572679387243017, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 7340 + }, + { + "epoch": 0.07341, + "grad_norm": 0.592683237309645, + "learning_rate": 0.003, + "loss": 4.1, + "step": 7341 + }, + { + "epoch": 0.07342, + "grad_norm": 0.6784287433146412, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 7342 + }, + { + "epoch": 0.07343, + "grad_norm": 0.8466966833790952, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7343 + }, + { + "epoch": 0.07344, + "grad_norm": 0.9627040021124741, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 7344 + }, + { + "epoch": 0.07345, + "grad_norm": 0.8840562235128062, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 7345 + }, + { + "epoch": 0.07346, + "grad_norm": 0.7548058738161189, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 7346 + }, + { + "epoch": 0.07347, + "grad_norm": 0.6631446634048735, + "learning_rate": 0.003, + "loss": 4.096, + "step": 7347 + }, + { + "epoch": 0.07348, + "grad_norm": 0.7265237879004127, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 7348 + }, + { + "epoch": 0.07349, + "grad_norm": 0.7506619044246161, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7349 + }, + { + "epoch": 0.0735, + "grad_norm": 0.8461103544372488, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 7350 + }, + { + "epoch": 0.07351, + "grad_norm": 0.990138732808367, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7351 + }, + { + "epoch": 0.07352, + "grad_norm": 1.033284964426441, + "learning_rate": 0.003, + "loss": 4.121, + "step": 7352 + }, + { + "epoch": 0.07353, + "grad_norm": 0.9900655416873149, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 7353 + }, + { + "epoch": 0.07354, + "grad_norm": 0.8494895382662214, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 7354 + }, + { + "epoch": 0.07355, + "grad_norm": 0.8514735721257103, + "learning_rate": 0.003, + "loss": 4.1326, + "step": 7355 + }, + { + "epoch": 0.07356, + "grad_norm": 0.7642132298715535, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 7356 + }, + { + "epoch": 0.07357, + "grad_norm": 0.7217630714630613, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 7357 + }, + { + "epoch": 0.07358, + "grad_norm": 0.6372792070079403, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 7358 + }, + { + "epoch": 0.07359, + "grad_norm": 0.6640339672379827, + "learning_rate": 0.003, + "loss": 4.126, + "step": 7359 + }, + { + "epoch": 0.0736, + "grad_norm": 0.6688944780134485, + "learning_rate": 0.003, + "loss": 4.1342, + "step": 7360 + }, + { + "epoch": 0.07361, + "grad_norm": 0.6165260490708869, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 7361 + }, + { + "epoch": 0.07362, + "grad_norm": 0.6046582282651884, + "learning_rate": 0.003, + "loss": 4.1271, + "step": 7362 + }, + { + "epoch": 0.07363, + "grad_norm": 0.5897978442551399, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7363 + }, + { + "epoch": 0.07364, + "grad_norm": 0.5332996805238572, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 7364 + }, + { + "epoch": 0.07365, + "grad_norm": 0.46010756257859925, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 7365 + }, + { + "epoch": 0.07366, + "grad_norm": 0.5374848389207422, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 7366 + }, + { + "epoch": 0.07367, + "grad_norm": 0.6597380812902667, + "learning_rate": 0.003, + "loss": 4.115, + "step": 7367 + }, + { + "epoch": 0.07368, + "grad_norm": 0.8518139437419292, + "learning_rate": 0.003, + "loss": 4.1366, + "step": 7368 + }, + { + "epoch": 0.07369, + "grad_norm": 0.9618450628071217, + "learning_rate": 0.003, + "loss": 4.09, + "step": 7369 + }, + { + "epoch": 0.0737, + "grad_norm": 0.8620882223010047, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 7370 + }, + { + "epoch": 0.07371, + "grad_norm": 0.6611222884366442, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 7371 + }, + { + "epoch": 0.07372, + "grad_norm": 0.7543698836104294, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 7372 + }, + { + "epoch": 0.07373, + "grad_norm": 0.7294801364711708, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 7373 + }, + { + "epoch": 0.07374, + "grad_norm": 0.6703748242790802, + "learning_rate": 0.003, + "loss": 4.106, + "step": 7374 + }, + { + "epoch": 0.07375, + "grad_norm": 0.6644624708027584, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 7375 + }, + { + "epoch": 0.07376, + "grad_norm": 0.7311681812607845, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 7376 + }, + { + "epoch": 0.07377, + "grad_norm": 0.7075965457515309, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 7377 + }, + { + "epoch": 0.07378, + "grad_norm": 0.6352699526430687, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 7378 + }, + { + "epoch": 0.07379, + "grad_norm": 0.6697856496315371, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 7379 + }, + { + "epoch": 0.0738, + "grad_norm": 0.6784741265654184, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 7380 + }, + { + "epoch": 0.07381, + "grad_norm": 0.5900914351363498, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 7381 + }, + { + "epoch": 0.07382, + "grad_norm": 0.608569663017179, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 7382 + }, + { + "epoch": 0.07383, + "grad_norm": 0.6350237567101997, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 7383 + }, + { + "epoch": 0.07384, + "grad_norm": 0.6424806458733962, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 7384 + }, + { + "epoch": 0.07385, + "grad_norm": 0.6720061518073006, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 7385 + }, + { + "epoch": 0.07386, + "grad_norm": 0.7795027909968454, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 7386 + }, + { + "epoch": 0.07387, + "grad_norm": 0.8143599209673019, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 7387 + }, + { + "epoch": 0.07388, + "grad_norm": 0.7807403224128945, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 7388 + }, + { + "epoch": 0.07389, + "grad_norm": 0.6943770705390825, + "learning_rate": 0.003, + "loss": 4.097, + "step": 7389 + }, + { + "epoch": 0.0739, + "grad_norm": 0.6749062613953275, + "learning_rate": 0.003, + "loss": 4.089, + "step": 7390 + }, + { + "epoch": 0.07391, + "grad_norm": 0.6980025211034937, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7391 + }, + { + "epoch": 0.07392, + "grad_norm": 0.8142287222162783, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 7392 + }, + { + "epoch": 0.07393, + "grad_norm": 0.848080762828504, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 7393 + }, + { + "epoch": 0.07394, + "grad_norm": 0.7052754295757714, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 7394 + }, + { + "epoch": 0.07395, + "grad_norm": 0.6918679030912211, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 7395 + }, + { + "epoch": 0.07396, + "grad_norm": 0.6396360507546505, + "learning_rate": 0.003, + "loss": 4.101, + "step": 7396 + }, + { + "epoch": 0.07397, + "grad_norm": 0.5575428300488946, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 7397 + }, + { + "epoch": 0.07398, + "grad_norm": 0.47819343358592886, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 7398 + }, + { + "epoch": 0.07399, + "grad_norm": 0.49123569869531275, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 7399 + }, + { + "epoch": 0.074, + "grad_norm": 0.4873818583702739, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 7400 + }, + { + "epoch": 0.07401, + "grad_norm": 0.5338185744914158, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 7401 + }, + { + "epoch": 0.07402, + "grad_norm": 0.6259981261871912, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 7402 + }, + { + "epoch": 0.07403, + "grad_norm": 0.7875202985260396, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 7403 + }, + { + "epoch": 0.07404, + "grad_norm": 0.9375888555588073, + "learning_rate": 0.003, + "loss": 4.148, + "step": 7404 + }, + { + "epoch": 0.07405, + "grad_norm": 0.916338808307593, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 7405 + }, + { + "epoch": 0.07406, + "grad_norm": 0.7809170835167528, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 7406 + }, + { + "epoch": 0.07407, + "grad_norm": 0.6654944055402259, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 7407 + }, + { + "epoch": 0.07408, + "grad_norm": 0.6915372132184653, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7408 + }, + { + "epoch": 0.07409, + "grad_norm": 0.7828828066535588, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 7409 + }, + { + "epoch": 0.0741, + "grad_norm": 0.8226790763634696, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 7410 + }, + { + "epoch": 0.07411, + "grad_norm": 0.8943168078020164, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 7411 + }, + { + "epoch": 0.07412, + "grad_norm": 0.8016790470869424, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 7412 + }, + { + "epoch": 0.07413, + "grad_norm": 0.7242923730785583, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 7413 + }, + { + "epoch": 0.07414, + "grad_norm": 0.7803236835873817, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 7414 + }, + { + "epoch": 0.07415, + "grad_norm": 0.809440770991952, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 7415 + }, + { + "epoch": 0.07416, + "grad_norm": 0.814374589546072, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7416 + }, + { + "epoch": 0.07417, + "grad_norm": 0.7472307812051249, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 7417 + }, + { + "epoch": 0.07418, + "grad_norm": 0.668144266675147, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 7418 + }, + { + "epoch": 0.07419, + "grad_norm": 0.7132902738686082, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 7419 + }, + { + "epoch": 0.0742, + "grad_norm": 0.6426183694835322, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 7420 + }, + { + "epoch": 0.07421, + "grad_norm": 0.6842452822488992, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 7421 + }, + { + "epoch": 0.07422, + "grad_norm": 0.6935109936445435, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7422 + }, + { + "epoch": 0.07423, + "grad_norm": 0.6977420494019977, + "learning_rate": 0.003, + "loss": 4.092, + "step": 7423 + }, + { + "epoch": 0.07424, + "grad_norm": 0.66836051737428, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 7424 + }, + { + "epoch": 0.07425, + "grad_norm": 0.6071918274550937, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 7425 + }, + { + "epoch": 0.07426, + "grad_norm": 0.5672292395284034, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 7426 + }, + { + "epoch": 0.07427, + "grad_norm": 0.6881340403118316, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 7427 + }, + { + "epoch": 0.07428, + "grad_norm": 0.8295995612479321, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 7428 + }, + { + "epoch": 0.07429, + "grad_norm": 1.0310591530396886, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 7429 + }, + { + "epoch": 0.0743, + "grad_norm": 1.1113844190744362, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 7430 + }, + { + "epoch": 0.07431, + "grad_norm": 0.8368950095459845, + "learning_rate": 0.003, + "loss": 4.101, + "step": 7431 + }, + { + "epoch": 0.07432, + "grad_norm": 0.7370171566843224, + "learning_rate": 0.003, + "loss": 4.121, + "step": 7432 + }, + { + "epoch": 0.07433, + "grad_norm": 0.7186305212736634, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 7433 + }, + { + "epoch": 0.07434, + "grad_norm": 0.6592511244859814, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 7434 + }, + { + "epoch": 0.07435, + "grad_norm": 0.6659908164403538, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 7435 + }, + { + "epoch": 0.07436, + "grad_norm": 0.7024784942727049, + "learning_rate": 0.003, + "loss": 4.134, + "step": 7436 + }, + { + "epoch": 0.07437, + "grad_norm": 0.750590282273592, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 7437 + }, + { + "epoch": 0.07438, + "grad_norm": 0.7342624547457341, + "learning_rate": 0.003, + "loss": 4.117, + "step": 7438 + }, + { + "epoch": 0.07439, + "grad_norm": 0.6552175598983562, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7439 + }, + { + "epoch": 0.0744, + "grad_norm": 0.6670994817257487, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 7440 + }, + { + "epoch": 0.07441, + "grad_norm": 0.5868493465333204, + "learning_rate": 0.003, + "loss": 4.1391, + "step": 7441 + }, + { + "epoch": 0.07442, + "grad_norm": 0.5749846079049408, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7442 + }, + { + "epoch": 0.07443, + "grad_norm": 0.616246467740031, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 7443 + }, + { + "epoch": 0.07444, + "grad_norm": 0.6314116396577024, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 7444 + }, + { + "epoch": 0.07445, + "grad_norm": 0.6891618905408962, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 7445 + }, + { + "epoch": 0.07446, + "grad_norm": 0.7665111909100017, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 7446 + }, + { + "epoch": 0.07447, + "grad_norm": 0.7960586954783623, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 7447 + }, + { + "epoch": 0.07448, + "grad_norm": 0.6932171237233758, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 7448 + }, + { + "epoch": 0.07449, + "grad_norm": 0.555157340249593, + "learning_rate": 0.003, + "loss": 4.099, + "step": 7449 + }, + { + "epoch": 0.0745, + "grad_norm": 0.5727726738711602, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 7450 + }, + { + "epoch": 0.07451, + "grad_norm": 0.6677189755758194, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 7451 + }, + { + "epoch": 0.07452, + "grad_norm": 0.7461108960326893, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 7452 + }, + { + "epoch": 0.07453, + "grad_norm": 0.7689716942440091, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 7453 + }, + { + "epoch": 0.07454, + "grad_norm": 0.6847633220363298, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 7454 + }, + { + "epoch": 0.07455, + "grad_norm": 0.6834076703366209, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 7455 + }, + { + "epoch": 0.07456, + "grad_norm": 0.5915374732547324, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 7456 + }, + { + "epoch": 0.07457, + "grad_norm": 0.5353509342865896, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 7457 + }, + { + "epoch": 0.07458, + "grad_norm": 0.6021411618752377, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 7458 + }, + { + "epoch": 0.07459, + "grad_norm": 0.6164057139569667, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7459 + }, + { + "epoch": 0.0746, + "grad_norm": 0.7212773204762993, + "learning_rate": 0.003, + "loss": 4.104, + "step": 7460 + }, + { + "epoch": 0.07461, + "grad_norm": 0.7976317064628464, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 7461 + }, + { + "epoch": 0.07462, + "grad_norm": 0.8064167886203844, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 7462 + }, + { + "epoch": 0.07463, + "grad_norm": 0.7632409751906114, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 7463 + }, + { + "epoch": 0.07464, + "grad_norm": 0.7512717490057481, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 7464 + }, + { + "epoch": 0.07465, + "grad_norm": 0.7943363392230436, + "learning_rate": 0.003, + "loss": 4.107, + "step": 7465 + }, + { + "epoch": 0.07466, + "grad_norm": 0.8637427260824427, + "learning_rate": 0.003, + "loss": 4.1269, + "step": 7466 + }, + { + "epoch": 0.07467, + "grad_norm": 0.8498740214887592, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 7467 + }, + { + "epoch": 0.07468, + "grad_norm": 0.8357355406639847, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 7468 + }, + { + "epoch": 0.07469, + "grad_norm": 0.8142934568301395, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 7469 + }, + { + "epoch": 0.0747, + "grad_norm": 0.8503882150265013, + "learning_rate": 0.003, + "loss": 4.1361, + "step": 7470 + }, + { + "epoch": 0.07471, + "grad_norm": 0.8925914077857183, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 7471 + }, + { + "epoch": 0.07472, + "grad_norm": 0.9101180407681418, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 7472 + }, + { + "epoch": 0.07473, + "grad_norm": 0.8580441555622639, + "learning_rate": 0.003, + "loss": 4.1278, + "step": 7473 + }, + { + "epoch": 0.07474, + "grad_norm": 0.8896005983371758, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7474 + }, + { + "epoch": 0.07475, + "grad_norm": 0.7824169038965194, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 7475 + }, + { + "epoch": 0.07476, + "grad_norm": 0.7710890200443516, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 7476 + }, + { + "epoch": 0.07477, + "grad_norm": 0.7900469190579128, + "learning_rate": 0.003, + "loss": 4.127, + "step": 7477 + }, + { + "epoch": 0.07478, + "grad_norm": 0.6977697020555949, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 7478 + }, + { + "epoch": 0.07479, + "grad_norm": 0.6860967351037519, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 7479 + }, + { + "epoch": 0.0748, + "grad_norm": 0.6740365801183335, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7480 + }, + { + "epoch": 0.07481, + "grad_norm": 0.7756716209864886, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 7481 + }, + { + "epoch": 0.07482, + "grad_norm": 0.7133092139283264, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 7482 + }, + { + "epoch": 0.07483, + "grad_norm": 0.6326111223603825, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 7483 + }, + { + "epoch": 0.07484, + "grad_norm": 0.5282476293532727, + "learning_rate": 0.003, + "loss": 4.1102, + "step": 7484 + }, + { + "epoch": 0.07485, + "grad_norm": 0.5706931649668086, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 7485 + }, + { + "epoch": 0.07486, + "grad_norm": 0.606269213255391, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 7486 + }, + { + "epoch": 0.07487, + "grad_norm": 0.6105645025729305, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 7487 + }, + { + "epoch": 0.07488, + "grad_norm": 0.6237051677171727, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 7488 + }, + { + "epoch": 0.07489, + "grad_norm": 0.7811835813919411, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 7489 + }, + { + "epoch": 0.0749, + "grad_norm": 0.7579316329950212, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 7490 + }, + { + "epoch": 0.07491, + "grad_norm": 0.6336904857400839, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 7491 + }, + { + "epoch": 0.07492, + "grad_norm": 0.5762678308899742, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 7492 + }, + { + "epoch": 0.07493, + "grad_norm": 0.5896176068966966, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 7493 + }, + { + "epoch": 0.07494, + "grad_norm": 0.5953859444943526, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 7494 + }, + { + "epoch": 0.07495, + "grad_norm": 0.6189501827559954, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 7495 + }, + { + "epoch": 0.07496, + "grad_norm": 0.6476212217965059, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 7496 + }, + { + "epoch": 0.07497, + "grad_norm": 0.590017860504856, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 7497 + }, + { + "epoch": 0.07498, + "grad_norm": 0.5711936608923054, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 7498 + }, + { + "epoch": 0.07499, + "grad_norm": 0.6570356392413452, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 7499 + }, + { + "epoch": 0.075, + "grad_norm": 0.6759155863873976, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 7500 + }, + { + "epoch": 0.07501, + "grad_norm": 0.8110049470072964, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 7501 + }, + { + "epoch": 0.07502, + "grad_norm": 0.9727707162810135, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 7502 + }, + { + "epoch": 0.07503, + "grad_norm": 1.1150877882227244, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 7503 + }, + { + "epoch": 0.07504, + "grad_norm": 0.7577757721039465, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 7504 + }, + { + "epoch": 0.07505, + "grad_norm": 0.6742452888810195, + "learning_rate": 0.003, + "loss": 4.085, + "step": 7505 + }, + { + "epoch": 0.07506, + "grad_norm": 0.6189390428317627, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 7506 + }, + { + "epoch": 0.07507, + "grad_norm": 0.6022087392016671, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 7507 + }, + { + "epoch": 0.07508, + "grad_norm": 0.5309580523879476, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 7508 + }, + { + "epoch": 0.07509, + "grad_norm": 0.5484502948250853, + "learning_rate": 0.003, + "loss": 4.096, + "step": 7509 + }, + { + "epoch": 0.0751, + "grad_norm": 0.6594506505512988, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7510 + }, + { + "epoch": 0.07511, + "grad_norm": 0.6599050252229222, + "learning_rate": 0.003, + "loss": 4.084, + "step": 7511 + }, + { + "epoch": 0.07512, + "grad_norm": 0.5970442662703767, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 7512 + }, + { + "epoch": 0.07513, + "grad_norm": 0.5770492421013043, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 7513 + }, + { + "epoch": 0.07514, + "grad_norm": 0.5972414190275317, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 7514 + }, + { + "epoch": 0.07515, + "grad_norm": 0.5236448183039198, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 7515 + }, + { + "epoch": 0.07516, + "grad_norm": 0.491529472938158, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 7516 + }, + { + "epoch": 0.07517, + "grad_norm": 0.5269178228921855, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7517 + }, + { + "epoch": 0.07518, + "grad_norm": 0.49875426784289123, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 7518 + }, + { + "epoch": 0.07519, + "grad_norm": 0.4804224094730234, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 7519 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4238021009088441, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 7520 + }, + { + "epoch": 0.07521, + "grad_norm": 0.4356047629744244, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 7521 + }, + { + "epoch": 0.07522, + "grad_norm": 0.4848019612577865, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 7522 + }, + { + "epoch": 0.07523, + "grad_norm": 0.5676769876951443, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 7523 + }, + { + "epoch": 0.07524, + "grad_norm": 0.7969402415607215, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 7524 + }, + { + "epoch": 0.07525, + "grad_norm": 1.158159401601464, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 7525 + }, + { + "epoch": 0.07526, + "grad_norm": 0.9956964848184818, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 7526 + }, + { + "epoch": 0.07527, + "grad_norm": 1.040836474640964, + "learning_rate": 0.003, + "loss": 4.1372, + "step": 7527 + }, + { + "epoch": 0.07528, + "grad_norm": 0.9140741004270626, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 7528 + }, + { + "epoch": 0.07529, + "grad_norm": 1.0052666855509356, + "learning_rate": 0.003, + "loss": 4.114, + "step": 7529 + }, + { + "epoch": 0.0753, + "grad_norm": 0.9481670539724812, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 7530 + }, + { + "epoch": 0.07531, + "grad_norm": 0.9918379168573455, + "learning_rate": 0.003, + "loss": 4.1468, + "step": 7531 + }, + { + "epoch": 0.07532, + "grad_norm": 1.0229542004769954, + "learning_rate": 0.003, + "loss": 4.1396, + "step": 7532 + }, + { + "epoch": 0.07533, + "grad_norm": 1.0918782376363978, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 7533 + }, + { + "epoch": 0.07534, + "grad_norm": 0.9604541131474513, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 7534 + }, + { + "epoch": 0.07535, + "grad_norm": 1.0009685102812784, + "learning_rate": 0.003, + "loss": 4.1491, + "step": 7535 + }, + { + "epoch": 0.07536, + "grad_norm": 1.0487571004898342, + "learning_rate": 0.003, + "loss": 4.1362, + "step": 7536 + }, + { + "epoch": 0.07537, + "grad_norm": 0.9991469202887211, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 7537 + }, + { + "epoch": 0.07538, + "grad_norm": 0.8819852519452202, + "learning_rate": 0.003, + "loss": 4.1478, + "step": 7538 + }, + { + "epoch": 0.07539, + "grad_norm": 0.8807197730769387, + "learning_rate": 0.003, + "loss": 4.119, + "step": 7539 + }, + { + "epoch": 0.0754, + "grad_norm": 0.877244025946411, + "learning_rate": 0.003, + "loss": 4.1383, + "step": 7540 + }, + { + "epoch": 0.07541, + "grad_norm": 0.9740029088436687, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 7541 + }, + { + "epoch": 0.07542, + "grad_norm": 0.8600848931909216, + "learning_rate": 0.003, + "loss": 4.1355, + "step": 7542 + }, + { + "epoch": 0.07543, + "grad_norm": 0.8280886875133882, + "learning_rate": 0.003, + "loss": 4.1584, + "step": 7543 + }, + { + "epoch": 0.07544, + "grad_norm": 0.8071984724777687, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 7544 + }, + { + "epoch": 0.07545, + "grad_norm": 0.698095628354796, + "learning_rate": 0.003, + "loss": 4.1652, + "step": 7545 + }, + { + "epoch": 0.07546, + "grad_norm": 0.6402526854429936, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 7546 + }, + { + "epoch": 0.07547, + "grad_norm": 0.6536241524830555, + "learning_rate": 0.003, + "loss": 4.1416, + "step": 7547 + }, + { + "epoch": 0.07548, + "grad_norm": 0.5657943886646665, + "learning_rate": 0.003, + "loss": 4.1467, + "step": 7548 + }, + { + "epoch": 0.07549, + "grad_norm": 0.6076370507493793, + "learning_rate": 0.003, + "loss": 4.1471, + "step": 7549 + }, + { + "epoch": 0.0755, + "grad_norm": 0.5883507720311228, + "learning_rate": 0.003, + "loss": 4.1397, + "step": 7550 + }, + { + "epoch": 0.07551, + "grad_norm": 0.619610373235812, + "learning_rate": 0.003, + "loss": 4.127, + "step": 7551 + }, + { + "epoch": 0.07552, + "grad_norm": 0.7123411482561006, + "learning_rate": 0.003, + "loss": 4.13, + "step": 7552 + }, + { + "epoch": 0.07553, + "grad_norm": 0.6469950236749169, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 7553 + }, + { + "epoch": 0.07554, + "grad_norm": 0.6316235881426905, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 7554 + }, + { + "epoch": 0.07555, + "grad_norm": 0.7256901233443528, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 7555 + }, + { + "epoch": 0.07556, + "grad_norm": 0.7618410256376742, + "learning_rate": 0.003, + "loss": 4.1341, + "step": 7556 + }, + { + "epoch": 0.07557, + "grad_norm": 0.7239253341070478, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 7557 + }, + { + "epoch": 0.07558, + "grad_norm": 0.6931338512410227, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 7558 + }, + { + "epoch": 0.07559, + "grad_norm": 0.6724575608591329, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 7559 + }, + { + "epoch": 0.0756, + "grad_norm": 0.7061480953108663, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 7560 + }, + { + "epoch": 0.07561, + "grad_norm": 0.7787438183497947, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7561 + }, + { + "epoch": 0.07562, + "grad_norm": 0.9044825545181286, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 7562 + }, + { + "epoch": 0.07563, + "grad_norm": 0.9934288625594392, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 7563 + }, + { + "epoch": 0.07564, + "grad_norm": 0.8635651395659641, + "learning_rate": 0.003, + "loss": 4.1388, + "step": 7564 + }, + { + "epoch": 0.07565, + "grad_norm": 0.6895282281140435, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 7565 + }, + { + "epoch": 0.07566, + "grad_norm": 0.5363646891497745, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 7566 + }, + { + "epoch": 0.07567, + "grad_norm": 0.5387866394074117, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 7567 + }, + { + "epoch": 0.07568, + "grad_norm": 0.5090019275065167, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 7568 + }, + { + "epoch": 0.07569, + "grad_norm": 0.42405385696904213, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 7569 + }, + { + "epoch": 0.0757, + "grad_norm": 0.4110011928681986, + "learning_rate": 0.003, + "loss": 4.083, + "step": 7570 + }, + { + "epoch": 0.07571, + "grad_norm": 0.4413812175592004, + "learning_rate": 0.003, + "loss": 4.075, + "step": 7571 + }, + { + "epoch": 0.07572, + "grad_norm": 0.4976749366359514, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 7572 + }, + { + "epoch": 0.07573, + "grad_norm": 0.5537972928779329, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 7573 + }, + { + "epoch": 0.07574, + "grad_norm": 0.5308733184384575, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 7574 + }, + { + "epoch": 0.07575, + "grad_norm": 0.49041822167560783, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 7575 + }, + { + "epoch": 0.07576, + "grad_norm": 0.5081385300005833, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 7576 + }, + { + "epoch": 0.07577, + "grad_norm": 0.574336572955178, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 7577 + }, + { + "epoch": 0.07578, + "grad_norm": 0.5956477110919803, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 7578 + }, + { + "epoch": 0.07579, + "grad_norm": 0.6349724495359681, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 7579 + }, + { + "epoch": 0.0758, + "grad_norm": 0.5747733937772262, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 7580 + }, + { + "epoch": 0.07581, + "grad_norm": 0.5324184467786447, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 7581 + }, + { + "epoch": 0.07582, + "grad_norm": 0.5131944070203941, + "learning_rate": 0.003, + "loss": 4.088, + "step": 7582 + }, + { + "epoch": 0.07583, + "grad_norm": 0.5085404496587494, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 7583 + }, + { + "epoch": 0.07584, + "grad_norm": 0.541822111982503, + "learning_rate": 0.003, + "loss": 4.131, + "step": 7584 + }, + { + "epoch": 0.07585, + "grad_norm": 0.61290315130478, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 7585 + }, + { + "epoch": 0.07586, + "grad_norm": 0.7597528207897615, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 7586 + }, + { + "epoch": 0.07587, + "grad_norm": 1.0042272950879196, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 7587 + }, + { + "epoch": 0.07588, + "grad_norm": 1.1991037951904149, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 7588 + }, + { + "epoch": 0.07589, + "grad_norm": 0.7285232286835313, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 7589 + }, + { + "epoch": 0.0759, + "grad_norm": 0.7148833074382797, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 7590 + }, + { + "epoch": 0.07591, + "grad_norm": 0.7201822884141769, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 7591 + }, + { + "epoch": 0.07592, + "grad_norm": 0.6076702679439346, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 7592 + }, + { + "epoch": 0.07593, + "grad_norm": 0.5718081339296276, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 7593 + }, + { + "epoch": 0.07594, + "grad_norm": 0.6158587196440879, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 7594 + }, + { + "epoch": 0.07595, + "grad_norm": 0.6174048562726181, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 7595 + }, + { + "epoch": 0.07596, + "grad_norm": 0.6776269755619764, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 7596 + }, + { + "epoch": 0.07597, + "grad_norm": 0.7579861091139625, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 7597 + }, + { + "epoch": 0.07598, + "grad_norm": 0.821293646598518, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 7598 + }, + { + "epoch": 0.07599, + "grad_norm": 0.9172481452747592, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 7599 + }, + { + "epoch": 0.076, + "grad_norm": 0.9188185947900437, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 7600 + }, + { + "epoch": 0.07601, + "grad_norm": 0.8582002375162606, + "learning_rate": 0.003, + "loss": 4.1346, + "step": 7601 + }, + { + "epoch": 0.07602, + "grad_norm": 0.7498431640329858, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 7602 + }, + { + "epoch": 0.07603, + "grad_norm": 0.727413692498518, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 7603 + }, + { + "epoch": 0.07604, + "grad_norm": 0.7128563866226183, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 7604 + }, + { + "epoch": 0.07605, + "grad_norm": 0.6925666833494799, + "learning_rate": 0.003, + "loss": 4.108, + "step": 7605 + }, + { + "epoch": 0.07606, + "grad_norm": 0.8051319465023743, + "learning_rate": 0.003, + "loss": 4.1653, + "step": 7606 + }, + { + "epoch": 0.07607, + "grad_norm": 0.8182132405488027, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 7607 + }, + { + "epoch": 0.07608, + "grad_norm": 0.8255552624586233, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 7608 + }, + { + "epoch": 0.07609, + "grad_norm": 0.7970747267263344, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 7609 + }, + { + "epoch": 0.0761, + "grad_norm": 0.8035430769234121, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 7610 + }, + { + "epoch": 0.07611, + "grad_norm": 0.7414024082233376, + "learning_rate": 0.003, + "loss": 4.1343, + "step": 7611 + }, + { + "epoch": 0.07612, + "grad_norm": 0.6738484686209927, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 7612 + }, + { + "epoch": 0.07613, + "grad_norm": 0.6313697370107805, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 7613 + }, + { + "epoch": 0.07614, + "grad_norm": 0.6779404296006304, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 7614 + }, + { + "epoch": 0.07615, + "grad_norm": 0.671043709478893, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 7615 + }, + { + "epoch": 0.07616, + "grad_norm": 0.648522037704203, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7616 + }, + { + "epoch": 0.07617, + "grad_norm": 0.6992220289841787, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 7617 + }, + { + "epoch": 0.07618, + "grad_norm": 0.7961453102996263, + "learning_rate": 0.003, + "loss": 4.1408, + "step": 7618 + }, + { + "epoch": 0.07619, + "grad_norm": 0.6675342147329035, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 7619 + }, + { + "epoch": 0.0762, + "grad_norm": 0.5717583537378645, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 7620 + }, + { + "epoch": 0.07621, + "grad_norm": 0.5677927273573751, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 7621 + }, + { + "epoch": 0.07622, + "grad_norm": 0.6433993271028621, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 7622 + }, + { + "epoch": 0.07623, + "grad_norm": 0.6940232002524971, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 7623 + }, + { + "epoch": 0.07624, + "grad_norm": 0.7880990684776099, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 7624 + }, + { + "epoch": 0.07625, + "grad_norm": 0.8639480293754638, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 7625 + }, + { + "epoch": 0.07626, + "grad_norm": 0.8696121558470246, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 7626 + }, + { + "epoch": 0.07627, + "grad_norm": 0.6599012014594735, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 7627 + }, + { + "epoch": 0.07628, + "grad_norm": 0.6055187589728076, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 7628 + }, + { + "epoch": 0.07629, + "grad_norm": 0.5602974302890468, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 7629 + }, + { + "epoch": 0.0763, + "grad_norm": 0.5483897493872879, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 7630 + }, + { + "epoch": 0.07631, + "grad_norm": 0.5313336121851904, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 7631 + }, + { + "epoch": 0.07632, + "grad_norm": 0.4946584392262865, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 7632 + }, + { + "epoch": 0.07633, + "grad_norm": 0.4679730906531611, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 7633 + }, + { + "epoch": 0.07634, + "grad_norm": 0.5163571664886781, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 7634 + }, + { + "epoch": 0.07635, + "grad_norm": 0.5766441343095514, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 7635 + }, + { + "epoch": 0.07636, + "grad_norm": 0.6356527311929989, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 7636 + }, + { + "epoch": 0.07637, + "grad_norm": 0.7120458591171308, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 7637 + }, + { + "epoch": 0.07638, + "grad_norm": 0.7256150029226924, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 7638 + }, + { + "epoch": 0.07639, + "grad_norm": 0.6685928629135915, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 7639 + }, + { + "epoch": 0.0764, + "grad_norm": 0.7914866586728011, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 7640 + }, + { + "epoch": 0.07641, + "grad_norm": 0.9183061054040322, + "learning_rate": 0.003, + "loss": 4.1304, + "step": 7641 + }, + { + "epoch": 0.07642, + "grad_norm": 0.8815667821813041, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 7642 + }, + { + "epoch": 0.07643, + "grad_norm": 0.6789085139539063, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 7643 + }, + { + "epoch": 0.07644, + "grad_norm": 0.6670039351921899, + "learning_rate": 0.003, + "loss": 4.079, + "step": 7644 + }, + { + "epoch": 0.07645, + "grad_norm": 0.6793245893987427, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 7645 + }, + { + "epoch": 0.07646, + "grad_norm": 0.6629748478851417, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 7646 + }, + { + "epoch": 0.07647, + "grad_norm": 0.7288187662625558, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 7647 + }, + { + "epoch": 0.07648, + "grad_norm": 0.889407265953888, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 7648 + }, + { + "epoch": 0.07649, + "grad_norm": 0.9687671903338628, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 7649 + }, + { + "epoch": 0.0765, + "grad_norm": 0.8049244708990047, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 7650 + }, + { + "epoch": 0.07651, + "grad_norm": 0.749604908748907, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 7651 + }, + { + "epoch": 0.07652, + "grad_norm": 0.7585566864542461, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 7652 + }, + { + "epoch": 0.07653, + "grad_norm": 0.9516693847937671, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 7653 + }, + { + "epoch": 0.07654, + "grad_norm": 1.0530156043085126, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 7654 + }, + { + "epoch": 0.07655, + "grad_norm": 0.8372129657311617, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 7655 + }, + { + "epoch": 0.07656, + "grad_norm": 0.7407223152315993, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 7656 + }, + { + "epoch": 0.07657, + "grad_norm": 0.7336956785627904, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 7657 + }, + { + "epoch": 0.07658, + "grad_norm": 0.672687938805935, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 7658 + }, + { + "epoch": 0.07659, + "grad_norm": 0.7385569673711774, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 7659 + }, + { + "epoch": 0.0766, + "grad_norm": 0.8172458837525138, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 7660 + }, + { + "epoch": 0.07661, + "grad_norm": 0.9105650740952083, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 7661 + }, + { + "epoch": 0.07662, + "grad_norm": 0.9795402229933363, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 7662 + }, + { + "epoch": 0.07663, + "grad_norm": 0.9659565857187145, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 7663 + }, + { + "epoch": 0.07664, + "grad_norm": 0.7426641164866011, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 7664 + }, + { + "epoch": 0.07665, + "grad_norm": 0.7654812929638697, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 7665 + }, + { + "epoch": 0.07666, + "grad_norm": 0.7454710802815059, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 7666 + }, + { + "epoch": 0.07667, + "grad_norm": 0.7064554873587587, + "learning_rate": 0.003, + "loss": 4.1243, + "step": 7667 + }, + { + "epoch": 0.07668, + "grad_norm": 0.6257727862295976, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 7668 + }, + { + "epoch": 0.07669, + "grad_norm": 0.5710711731797409, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 7669 + }, + { + "epoch": 0.0767, + "grad_norm": 0.528257520997173, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 7670 + }, + { + "epoch": 0.07671, + "grad_norm": 0.4959008285709095, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 7671 + }, + { + "epoch": 0.07672, + "grad_norm": 0.45441825603578956, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 7672 + }, + { + "epoch": 0.07673, + "grad_norm": 0.4429820345077854, + "learning_rate": 0.003, + "loss": 4.1317, + "step": 7673 + }, + { + "epoch": 0.07674, + "grad_norm": 0.42319123978862466, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 7674 + }, + { + "epoch": 0.07675, + "grad_norm": 0.3845278102823501, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7675 + }, + { + "epoch": 0.07676, + "grad_norm": 0.41229574434392646, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 7676 + }, + { + "epoch": 0.07677, + "grad_norm": 0.43520964656733063, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 7677 + }, + { + "epoch": 0.07678, + "grad_norm": 0.44518003939309553, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 7678 + }, + { + "epoch": 0.07679, + "grad_norm": 0.5155228564191591, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 7679 + }, + { + "epoch": 0.0768, + "grad_norm": 0.5596883999233748, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 7680 + }, + { + "epoch": 0.07681, + "grad_norm": 0.5906511007621373, + "learning_rate": 0.003, + "loss": 4.097, + "step": 7681 + }, + { + "epoch": 0.07682, + "grad_norm": 0.6495617626958452, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 7682 + }, + { + "epoch": 0.07683, + "grad_norm": 0.813132646521943, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 7683 + }, + { + "epoch": 0.07684, + "grad_norm": 0.9701002240236403, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 7684 + }, + { + "epoch": 0.07685, + "grad_norm": 1.0224940781449272, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 7685 + }, + { + "epoch": 0.07686, + "grad_norm": 0.8278902572767264, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7686 + }, + { + "epoch": 0.07687, + "grad_norm": 0.8042154248471763, + "learning_rate": 0.003, + "loss": 4.116, + "step": 7687 + }, + { + "epoch": 0.07688, + "grad_norm": 0.8028648725581159, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 7688 + }, + { + "epoch": 0.07689, + "grad_norm": 0.7584652571831005, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 7689 + }, + { + "epoch": 0.0769, + "grad_norm": 0.668275319980837, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 7690 + }, + { + "epoch": 0.07691, + "grad_norm": 0.7293348633493727, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 7691 + }, + { + "epoch": 0.07692, + "grad_norm": 0.72632873093583, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 7692 + }, + { + "epoch": 0.07693, + "grad_norm": 0.7772222742789282, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 7693 + }, + { + "epoch": 0.07694, + "grad_norm": 0.844518033365835, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 7694 + }, + { + "epoch": 0.07695, + "grad_norm": 1.0562783279259773, + "learning_rate": 0.003, + "loss": 4.1359, + "step": 7695 + }, + { + "epoch": 0.07696, + "grad_norm": 0.9678281861932773, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 7696 + }, + { + "epoch": 0.07697, + "grad_norm": 0.8583167779532994, + "learning_rate": 0.003, + "loss": 4.1507, + "step": 7697 + }, + { + "epoch": 0.07698, + "grad_norm": 0.7254348536667942, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 7698 + }, + { + "epoch": 0.07699, + "grad_norm": 0.7025270777281947, + "learning_rate": 0.003, + "loss": 4.147, + "step": 7699 + }, + { + "epoch": 0.077, + "grad_norm": 0.6298853792744631, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7700 + }, + { + "epoch": 0.07701, + "grad_norm": 0.6529473251497263, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 7701 + }, + { + "epoch": 0.07702, + "grad_norm": 0.6618947976576033, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 7702 + }, + { + "epoch": 0.07703, + "grad_norm": 0.720492970627025, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 7703 + }, + { + "epoch": 0.07704, + "grad_norm": 0.7521627176606427, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 7704 + }, + { + "epoch": 0.07705, + "grad_norm": 0.6974134473890328, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 7705 + }, + { + "epoch": 0.07706, + "grad_norm": 0.6519559813236994, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 7706 + }, + { + "epoch": 0.07707, + "grad_norm": 0.7138065901393107, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 7707 + }, + { + "epoch": 0.07708, + "grad_norm": 0.7603897989016979, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 7708 + }, + { + "epoch": 0.07709, + "grad_norm": 0.7728171935175017, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 7709 + }, + { + "epoch": 0.0771, + "grad_norm": 0.6846056023357585, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 7710 + }, + { + "epoch": 0.07711, + "grad_norm": 0.6649417137508592, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 7711 + }, + { + "epoch": 0.07712, + "grad_norm": 0.7242895985336029, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 7712 + }, + { + "epoch": 0.07713, + "grad_norm": 0.7939906705321689, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 7713 + }, + { + "epoch": 0.07714, + "grad_norm": 0.8212321036889528, + "learning_rate": 0.003, + "loss": 4.08, + "step": 7714 + }, + { + "epoch": 0.07715, + "grad_norm": 0.8121576436599612, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 7715 + }, + { + "epoch": 0.07716, + "grad_norm": 0.7330469856352435, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 7716 + }, + { + "epoch": 0.07717, + "grad_norm": 0.6433922965206615, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 7717 + }, + { + "epoch": 0.07718, + "grad_norm": 0.5861977103583862, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 7718 + }, + { + "epoch": 0.07719, + "grad_norm": 0.5228180617616236, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 7719 + }, + { + "epoch": 0.0772, + "grad_norm": 0.4876233685514838, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 7720 + }, + { + "epoch": 0.07721, + "grad_norm": 0.5225866371833879, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 7721 + }, + { + "epoch": 0.07722, + "grad_norm": 0.49420226274527473, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 7722 + }, + { + "epoch": 0.07723, + "grad_norm": 0.5163143220392981, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 7723 + }, + { + "epoch": 0.07724, + "grad_norm": 0.5183943056733131, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 7724 + }, + { + "epoch": 0.07725, + "grad_norm": 0.544712728586757, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 7725 + }, + { + "epoch": 0.07726, + "grad_norm": 0.48659381970361326, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 7726 + }, + { + "epoch": 0.07727, + "grad_norm": 0.48012804652416147, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 7727 + }, + { + "epoch": 0.07728, + "grad_norm": 0.611442740466274, + "learning_rate": 0.003, + "loss": 4.084, + "step": 7728 + }, + { + "epoch": 0.07729, + "grad_norm": 0.7927419006950845, + "learning_rate": 0.003, + "loss": 4.115, + "step": 7729 + }, + { + "epoch": 0.0773, + "grad_norm": 1.0647142259813651, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 7730 + }, + { + "epoch": 0.07731, + "grad_norm": 1.2555263685268876, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7731 + }, + { + "epoch": 0.07732, + "grad_norm": 0.7349062778385043, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 7732 + }, + { + "epoch": 0.07733, + "grad_norm": 0.662067738555623, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 7733 + }, + { + "epoch": 0.07734, + "grad_norm": 0.6622088276312383, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 7734 + }, + { + "epoch": 0.07735, + "grad_norm": 0.7190113835925798, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 7735 + }, + { + "epoch": 0.07736, + "grad_norm": 0.779921763945562, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 7736 + }, + { + "epoch": 0.07737, + "grad_norm": 0.7062610550634124, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 7737 + }, + { + "epoch": 0.07738, + "grad_norm": 0.781831296452957, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 7738 + }, + { + "epoch": 0.07739, + "grad_norm": 0.881490679155474, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 7739 + }, + { + "epoch": 0.0774, + "grad_norm": 0.9161291546076472, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 7740 + }, + { + "epoch": 0.07741, + "grad_norm": 0.9669089236249705, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 7741 + }, + { + "epoch": 0.07742, + "grad_norm": 0.8371183452364397, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 7742 + }, + { + "epoch": 0.07743, + "grad_norm": 0.6183081207697575, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 7743 + }, + { + "epoch": 0.07744, + "grad_norm": 0.6071723474314588, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 7744 + }, + { + "epoch": 0.07745, + "grad_norm": 0.6556013679404881, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 7745 + }, + { + "epoch": 0.07746, + "grad_norm": 0.6908482013298095, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 7746 + }, + { + "epoch": 0.07747, + "grad_norm": 0.7453245346858474, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 7747 + }, + { + "epoch": 0.07748, + "grad_norm": 0.779119331038319, + "learning_rate": 0.003, + "loss": 4.1375, + "step": 7748 + }, + { + "epoch": 0.07749, + "grad_norm": 0.6901079756719614, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 7749 + }, + { + "epoch": 0.0775, + "grad_norm": 0.7554593110119233, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 7750 + }, + { + "epoch": 0.07751, + "grad_norm": 0.6857296299894233, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 7751 + }, + { + "epoch": 0.07752, + "grad_norm": 0.6922633638861819, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 7752 + }, + { + "epoch": 0.07753, + "grad_norm": 0.6594486091898183, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 7753 + }, + { + "epoch": 0.07754, + "grad_norm": 0.5934224681021092, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 7754 + }, + { + "epoch": 0.07755, + "grad_norm": 0.5188793093092156, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 7755 + }, + { + "epoch": 0.07756, + "grad_norm": 0.48414191971528364, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 7756 + }, + { + "epoch": 0.07757, + "grad_norm": 0.47273269138835095, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 7757 + }, + { + "epoch": 0.07758, + "grad_norm": 0.49740262524638973, + "learning_rate": 0.003, + "loss": 4.075, + "step": 7758 + }, + { + "epoch": 0.07759, + "grad_norm": 0.5065560507825277, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 7759 + }, + { + "epoch": 0.0776, + "grad_norm": 0.5375106355045344, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 7760 + }, + { + "epoch": 0.07761, + "grad_norm": 0.5513605901023222, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 7761 + }, + { + "epoch": 0.07762, + "grad_norm": 0.48015165347830113, + "learning_rate": 0.003, + "loss": 4.085, + "step": 7762 + }, + { + "epoch": 0.07763, + "grad_norm": 0.5167765335801594, + "learning_rate": 0.003, + "loss": 4.103, + "step": 7763 + }, + { + "epoch": 0.07764, + "grad_norm": 0.6089153068745908, + "learning_rate": 0.003, + "loss": 4.111, + "step": 7764 + }, + { + "epoch": 0.07765, + "grad_norm": 0.7391949344092991, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 7765 + }, + { + "epoch": 0.07766, + "grad_norm": 1.072503112575427, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 7766 + }, + { + "epoch": 0.07767, + "grad_norm": 1.2741364151057324, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 7767 + }, + { + "epoch": 0.07768, + "grad_norm": 0.8964766694765297, + "learning_rate": 0.003, + "loss": 4.104, + "step": 7768 + }, + { + "epoch": 0.07769, + "grad_norm": 1.049149406706748, + "learning_rate": 0.003, + "loss": 4.1181, + "step": 7769 + }, + { + "epoch": 0.0777, + "grad_norm": 0.930557278971072, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 7770 + }, + { + "epoch": 0.07771, + "grad_norm": 0.8923728869689802, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 7771 + }, + { + "epoch": 0.07772, + "grad_norm": 0.7822123730589138, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 7772 + }, + { + "epoch": 0.07773, + "grad_norm": 0.7222823816657866, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 7773 + }, + { + "epoch": 0.07774, + "grad_norm": 0.7187204559865789, + "learning_rate": 0.003, + "loss": 4.113, + "step": 7774 + }, + { + "epoch": 0.07775, + "grad_norm": 0.7480635321015907, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 7775 + }, + { + "epoch": 0.07776, + "grad_norm": 0.760612608235149, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 7776 + }, + { + "epoch": 0.07777, + "grad_norm": 0.6882048442017037, + "learning_rate": 0.003, + "loss": 4.124, + "step": 7777 + }, + { + "epoch": 0.07778, + "grad_norm": 0.7510711884551454, + "learning_rate": 0.003, + "loss": 4.1324, + "step": 7778 + }, + { + "epoch": 0.07779, + "grad_norm": 0.9393258215990358, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7779 + }, + { + "epoch": 0.0778, + "grad_norm": 0.9513634587238046, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 7780 + }, + { + "epoch": 0.07781, + "grad_norm": 0.9481811143998107, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 7781 + }, + { + "epoch": 0.07782, + "grad_norm": 0.9203734843556167, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 7782 + }, + { + "epoch": 0.07783, + "grad_norm": 0.7932108521278691, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 7783 + }, + { + "epoch": 0.07784, + "grad_norm": 0.7260500811767607, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 7784 + }, + { + "epoch": 0.07785, + "grad_norm": 0.7585529189874307, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 7785 + }, + { + "epoch": 0.07786, + "grad_norm": 0.6739329538427627, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 7786 + }, + { + "epoch": 0.07787, + "grad_norm": 0.5288700684147263, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 7787 + }, + { + "epoch": 0.07788, + "grad_norm": 0.5605208483007748, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 7788 + }, + { + "epoch": 0.07789, + "grad_norm": 0.5560131490937277, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 7789 + }, + { + "epoch": 0.0779, + "grad_norm": 0.5403302533220166, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 7790 + }, + { + "epoch": 0.07791, + "grad_norm": 0.5930794987356798, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 7791 + }, + { + "epoch": 0.07792, + "grad_norm": 0.6273285492908405, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 7792 + }, + { + "epoch": 0.07793, + "grad_norm": 0.6286748399924618, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 7793 + }, + { + "epoch": 0.07794, + "grad_norm": 0.6714313089602904, + "learning_rate": 0.003, + "loss": 4.128, + "step": 7794 + }, + { + "epoch": 0.07795, + "grad_norm": 0.7779410911099373, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 7795 + }, + { + "epoch": 0.07796, + "grad_norm": 0.9238625316835825, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 7796 + }, + { + "epoch": 0.07797, + "grad_norm": 0.9115617993434088, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 7797 + }, + { + "epoch": 0.07798, + "grad_norm": 0.8524686648853493, + "learning_rate": 0.003, + "loss": 4.1306, + "step": 7798 + }, + { + "epoch": 0.07799, + "grad_norm": 0.6927867557827436, + "learning_rate": 0.003, + "loss": 4.129, + "step": 7799 + }, + { + "epoch": 0.078, + "grad_norm": 0.6008938961247131, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7800 + }, + { + "epoch": 0.07801, + "grad_norm": 0.6422683191074112, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 7801 + }, + { + "epoch": 0.07802, + "grad_norm": 0.6032600741877926, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 7802 + }, + { + "epoch": 0.07803, + "grad_norm": 0.5863000786481797, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 7803 + }, + { + "epoch": 0.07804, + "grad_norm": 0.5317230097198795, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 7804 + }, + { + "epoch": 0.07805, + "grad_norm": 0.45647691543183516, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 7805 + }, + { + "epoch": 0.07806, + "grad_norm": 0.46699207392888487, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 7806 + }, + { + "epoch": 0.07807, + "grad_norm": 0.4393858321916459, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 7807 + }, + { + "epoch": 0.07808, + "grad_norm": 0.4646599647463825, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 7808 + }, + { + "epoch": 0.07809, + "grad_norm": 0.5658965842298878, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 7809 + }, + { + "epoch": 0.0781, + "grad_norm": 0.7181637740728304, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 7810 + }, + { + "epoch": 0.07811, + "grad_norm": 0.8109929763108267, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 7811 + }, + { + "epoch": 0.07812, + "grad_norm": 0.8368920397907048, + "learning_rate": 0.003, + "loss": 4.105, + "step": 7812 + }, + { + "epoch": 0.07813, + "grad_norm": 0.8903796018565326, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 7813 + }, + { + "epoch": 0.07814, + "grad_norm": 0.9722080091605632, + "learning_rate": 0.003, + "loss": 4.1525, + "step": 7814 + }, + { + "epoch": 0.07815, + "grad_norm": 0.9736337448735193, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 7815 + }, + { + "epoch": 0.07816, + "grad_norm": 1.0961292869832548, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 7816 + }, + { + "epoch": 0.07817, + "grad_norm": 1.1565923993465608, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 7817 + }, + { + "epoch": 0.07818, + "grad_norm": 0.8585417973652822, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 7818 + }, + { + "epoch": 0.07819, + "grad_norm": 0.7416217398177496, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 7819 + }, + { + "epoch": 0.0782, + "grad_norm": 0.6344240220641632, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 7820 + }, + { + "epoch": 0.07821, + "grad_norm": 0.6739464286664512, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 7821 + }, + { + "epoch": 0.07822, + "grad_norm": 0.5825705065071117, + "learning_rate": 0.003, + "loss": 4.082, + "step": 7822 + }, + { + "epoch": 0.07823, + "grad_norm": 0.5723600939543713, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 7823 + }, + { + "epoch": 0.07824, + "grad_norm": 0.5121389736529045, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 7824 + }, + { + "epoch": 0.07825, + "grad_norm": 0.5355001404670079, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 7825 + }, + { + "epoch": 0.07826, + "grad_norm": 0.527193374309933, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 7826 + }, + { + "epoch": 0.07827, + "grad_norm": 0.46903753993293595, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 7827 + }, + { + "epoch": 0.07828, + "grad_norm": 0.5421978536054349, + "learning_rate": 0.003, + "loss": 4.107, + "step": 7828 + }, + { + "epoch": 0.07829, + "grad_norm": 0.5249081779647955, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 7829 + }, + { + "epoch": 0.0783, + "grad_norm": 0.5049597468526409, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 7830 + }, + { + "epoch": 0.07831, + "grad_norm": 0.5692015688709412, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 7831 + }, + { + "epoch": 0.07832, + "grad_norm": 0.6743332070780026, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 7832 + }, + { + "epoch": 0.07833, + "grad_norm": 0.7173712685283925, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7833 + }, + { + "epoch": 0.07834, + "grad_norm": 0.8370657093807944, + "learning_rate": 0.003, + "loss": 4.149, + "step": 7834 + }, + { + "epoch": 0.07835, + "grad_norm": 1.136204980987355, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 7835 + }, + { + "epoch": 0.07836, + "grad_norm": 0.8577818143218462, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 7836 + }, + { + "epoch": 0.07837, + "grad_norm": 0.7112654311205949, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 7837 + }, + { + "epoch": 0.07838, + "grad_norm": 0.6569683165529128, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 7838 + }, + { + "epoch": 0.07839, + "grad_norm": 0.7565840760711471, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 7839 + }, + { + "epoch": 0.0784, + "grad_norm": 0.7482225327831992, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7840 + }, + { + "epoch": 0.07841, + "grad_norm": 0.6711480993377672, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7841 + }, + { + "epoch": 0.07842, + "grad_norm": 0.6679564294971032, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 7842 + }, + { + "epoch": 0.07843, + "grad_norm": 0.59249526868013, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 7843 + }, + { + "epoch": 0.07844, + "grad_norm": 0.5591695681252433, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 7844 + }, + { + "epoch": 0.07845, + "grad_norm": 0.6153612137689568, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 7845 + }, + { + "epoch": 0.07846, + "grad_norm": 0.6618403605569363, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 7846 + }, + { + "epoch": 0.07847, + "grad_norm": 0.6781576972563144, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 7847 + }, + { + "epoch": 0.07848, + "grad_norm": 0.6768216849272073, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 7848 + }, + { + "epoch": 0.07849, + "grad_norm": 0.7621052101869075, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 7849 + }, + { + "epoch": 0.0785, + "grad_norm": 0.7067170147953223, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 7850 + }, + { + "epoch": 0.07851, + "grad_norm": 0.7326500117988808, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 7851 + }, + { + "epoch": 0.07852, + "grad_norm": 0.7009642585608814, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 7852 + }, + { + "epoch": 0.07853, + "grad_norm": 0.7619033300792631, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 7853 + }, + { + "epoch": 0.07854, + "grad_norm": 0.8435597472657554, + "learning_rate": 0.003, + "loss": 4.1189, + "step": 7854 + }, + { + "epoch": 0.07855, + "grad_norm": 0.8978457164901018, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 7855 + }, + { + "epoch": 0.07856, + "grad_norm": 1.023677440950207, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 7856 + }, + { + "epoch": 0.07857, + "grad_norm": 0.9231166097330715, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 7857 + }, + { + "epoch": 0.07858, + "grad_norm": 0.8633661145024862, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 7858 + }, + { + "epoch": 0.07859, + "grad_norm": 0.7860454695774618, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 7859 + }, + { + "epoch": 0.0786, + "grad_norm": 0.7590473523974762, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 7860 + }, + { + "epoch": 0.07861, + "grad_norm": 0.7742422968395642, + "learning_rate": 0.003, + "loss": 4.1255, + "step": 7861 + }, + { + "epoch": 0.07862, + "grad_norm": 0.6647634614520403, + "learning_rate": 0.003, + "loss": 4.1421, + "step": 7862 + }, + { + "epoch": 0.07863, + "grad_norm": 0.6187463355818706, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 7863 + }, + { + "epoch": 0.07864, + "grad_norm": 0.5110625417422245, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 7864 + }, + { + "epoch": 0.07865, + "grad_norm": 0.5020168122945233, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 7865 + }, + { + "epoch": 0.07866, + "grad_norm": 0.5824358186306762, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 7866 + }, + { + "epoch": 0.07867, + "grad_norm": 0.6255923217504209, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 7867 + }, + { + "epoch": 0.07868, + "grad_norm": 0.7772954380441937, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 7868 + }, + { + "epoch": 0.07869, + "grad_norm": 0.9860133943188509, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 7869 + }, + { + "epoch": 0.0787, + "grad_norm": 1.0195783394153966, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7870 + }, + { + "epoch": 0.07871, + "grad_norm": 0.7559459514710412, + "learning_rate": 0.003, + "loss": 4.1424, + "step": 7871 + }, + { + "epoch": 0.07872, + "grad_norm": 0.9076157878017646, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 7872 + }, + { + "epoch": 0.07873, + "grad_norm": 0.8226175326832436, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 7873 + }, + { + "epoch": 0.07874, + "grad_norm": 0.7406414466218076, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 7874 + }, + { + "epoch": 0.07875, + "grad_norm": 0.7388682612769197, + "learning_rate": 0.003, + "loss": 4.1222, + "step": 7875 + }, + { + "epoch": 0.07876, + "grad_norm": 0.904519114854065, + "learning_rate": 0.003, + "loss": 4.127, + "step": 7876 + }, + { + "epoch": 0.07877, + "grad_norm": 0.9368363860931168, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7877 + }, + { + "epoch": 0.07878, + "grad_norm": 0.8598751903328211, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 7878 + }, + { + "epoch": 0.07879, + "grad_norm": 0.819312740767117, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 7879 + }, + { + "epoch": 0.0788, + "grad_norm": 0.7760893771538491, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 7880 + }, + { + "epoch": 0.07881, + "grad_norm": 0.7187534747264536, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 7881 + }, + { + "epoch": 0.07882, + "grad_norm": 0.6858619284030656, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 7882 + }, + { + "epoch": 0.07883, + "grad_norm": 0.6188431011405061, + "learning_rate": 0.003, + "loss": 4.083, + "step": 7883 + }, + { + "epoch": 0.07884, + "grad_norm": 0.49119386209036325, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 7884 + }, + { + "epoch": 0.07885, + "grad_norm": 0.4425644032822491, + "learning_rate": 0.003, + "loss": 4.088, + "step": 7885 + }, + { + "epoch": 0.07886, + "grad_norm": 0.45859576158776894, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 7886 + }, + { + "epoch": 0.07887, + "grad_norm": 0.4966339618463925, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 7887 + }, + { + "epoch": 0.07888, + "grad_norm": 0.5978659033312088, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 7888 + }, + { + "epoch": 0.07889, + "grad_norm": 0.7155472899570747, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 7889 + }, + { + "epoch": 0.0789, + "grad_norm": 0.7993574365094266, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 7890 + }, + { + "epoch": 0.07891, + "grad_norm": 0.7636085550101828, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 7891 + }, + { + "epoch": 0.07892, + "grad_norm": 0.6885425050961269, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 7892 + }, + { + "epoch": 0.07893, + "grad_norm": 0.6721526677142209, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 7893 + }, + { + "epoch": 0.07894, + "grad_norm": 0.6152441288738106, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 7894 + }, + { + "epoch": 0.07895, + "grad_norm": 0.561256579962152, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 7895 + }, + { + "epoch": 0.07896, + "grad_norm": 0.5735961480085187, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 7896 + }, + { + "epoch": 0.07897, + "grad_norm": 0.676934691180459, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 7897 + }, + { + "epoch": 0.07898, + "grad_norm": 0.7401825684646072, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 7898 + }, + { + "epoch": 0.07899, + "grad_norm": 0.7621955625823885, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 7899 + }, + { + "epoch": 0.079, + "grad_norm": 0.7808977446083992, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 7900 + }, + { + "epoch": 0.07901, + "grad_norm": 0.8881664429218848, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 7901 + }, + { + "epoch": 0.07902, + "grad_norm": 0.8885276958518861, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 7902 + }, + { + "epoch": 0.07903, + "grad_norm": 0.8190648495783674, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 7903 + }, + { + "epoch": 0.07904, + "grad_norm": 0.7646472367126418, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 7904 + }, + { + "epoch": 0.07905, + "grad_norm": 0.768832626042469, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 7905 + }, + { + "epoch": 0.07906, + "grad_norm": 0.7344374945822847, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 7906 + }, + { + "epoch": 0.07907, + "grad_norm": 0.5806783775411604, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 7907 + }, + { + "epoch": 0.07908, + "grad_norm": 0.5703784952596479, + "learning_rate": 0.003, + "loss": 4.107, + "step": 7908 + }, + { + "epoch": 0.07909, + "grad_norm": 0.5524374992586529, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 7909 + }, + { + "epoch": 0.0791, + "grad_norm": 0.593207520021952, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 7910 + }, + { + "epoch": 0.07911, + "grad_norm": 0.6796537564703014, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 7911 + }, + { + "epoch": 0.07912, + "grad_norm": 0.811473537909756, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 7912 + }, + { + "epoch": 0.07913, + "grad_norm": 0.9594020178558872, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 7913 + }, + { + "epoch": 0.07914, + "grad_norm": 0.8781644497544638, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 7914 + }, + { + "epoch": 0.07915, + "grad_norm": 0.7673157421848797, + "learning_rate": 0.003, + "loss": 4.086, + "step": 7915 + }, + { + "epoch": 0.07916, + "grad_norm": 0.7231720388780313, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7916 + }, + { + "epoch": 0.07917, + "grad_norm": 0.747995150418676, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 7917 + }, + { + "epoch": 0.07918, + "grad_norm": 0.7558357775396138, + "learning_rate": 0.003, + "loss": 4.106, + "step": 7918 + }, + { + "epoch": 0.07919, + "grad_norm": 0.7473819497926562, + "learning_rate": 0.003, + "loss": 4.065, + "step": 7919 + }, + { + "epoch": 0.0792, + "grad_norm": 0.7630023142885092, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 7920 + }, + { + "epoch": 0.07921, + "grad_norm": 0.661482132358115, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 7921 + }, + { + "epoch": 0.07922, + "grad_norm": 0.6198116791527155, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 7922 + }, + { + "epoch": 0.07923, + "grad_norm": 0.6551552600939399, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 7923 + }, + { + "epoch": 0.07924, + "grad_norm": 0.6696835280064822, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 7924 + }, + { + "epoch": 0.07925, + "grad_norm": 0.6925330545340292, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 7925 + }, + { + "epoch": 0.07926, + "grad_norm": 0.7284626249714012, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 7926 + }, + { + "epoch": 0.07927, + "grad_norm": 0.671892879773423, + "learning_rate": 0.003, + "loss": 4.062, + "step": 7927 + }, + { + "epoch": 0.07928, + "grad_norm": 0.6609056217998242, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 7928 + }, + { + "epoch": 0.07929, + "grad_norm": 0.6999670786024481, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 7929 + }, + { + "epoch": 0.0793, + "grad_norm": 0.6958735375819932, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 7930 + }, + { + "epoch": 0.07931, + "grad_norm": 0.7554404336036357, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 7931 + }, + { + "epoch": 0.07932, + "grad_norm": 0.8243101778229396, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 7932 + }, + { + "epoch": 0.07933, + "grad_norm": 0.9463359506341771, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 7933 + }, + { + "epoch": 0.07934, + "grad_norm": 1.013600457137422, + "learning_rate": 0.003, + "loss": 4.1352, + "step": 7934 + }, + { + "epoch": 0.07935, + "grad_norm": 0.8419500712154685, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 7935 + }, + { + "epoch": 0.07936, + "grad_norm": 0.6245467403233166, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 7936 + }, + { + "epoch": 0.07937, + "grad_norm": 0.7486505766341088, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 7937 + }, + { + "epoch": 0.07938, + "grad_norm": 0.8254464470107613, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 7938 + }, + { + "epoch": 0.07939, + "grad_norm": 0.8503311527457658, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 7939 + }, + { + "epoch": 0.0794, + "grad_norm": 0.7668471916028556, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 7940 + }, + { + "epoch": 0.07941, + "grad_norm": 0.6283204867133623, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 7941 + }, + { + "epoch": 0.07942, + "grad_norm": 0.5166138501693774, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 7942 + }, + { + "epoch": 0.07943, + "grad_norm": 0.5021949697098655, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 7943 + }, + { + "epoch": 0.07944, + "grad_norm": 0.5349404843772013, + "learning_rate": 0.003, + "loss": 4.121, + "step": 7944 + }, + { + "epoch": 0.07945, + "grad_norm": 0.5952661363698953, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 7945 + }, + { + "epoch": 0.07946, + "grad_norm": 0.5669655747114686, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 7946 + }, + { + "epoch": 0.07947, + "grad_norm": 0.5763265373404088, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 7947 + }, + { + "epoch": 0.07948, + "grad_norm": 0.5722110776640312, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 7948 + }, + { + "epoch": 0.07949, + "grad_norm": 0.6056595909010904, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 7949 + }, + { + "epoch": 0.0795, + "grad_norm": 0.6647078565086577, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 7950 + }, + { + "epoch": 0.07951, + "grad_norm": 0.6314215964588701, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 7951 + }, + { + "epoch": 0.07952, + "grad_norm": 0.5731615532451197, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 7952 + }, + { + "epoch": 0.07953, + "grad_norm": 0.5227418275474421, + "learning_rate": 0.003, + "loss": 4.066, + "step": 7953 + }, + { + "epoch": 0.07954, + "grad_norm": 0.5425211491008127, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 7954 + }, + { + "epoch": 0.07955, + "grad_norm": 0.5562460949888285, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 7955 + }, + { + "epoch": 0.07956, + "grad_norm": 0.5957051466560916, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 7956 + }, + { + "epoch": 0.07957, + "grad_norm": 0.6056607635018184, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 7957 + }, + { + "epoch": 0.07958, + "grad_norm": 0.7512975150926531, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 7958 + }, + { + "epoch": 0.07959, + "grad_norm": 1.0080192207485463, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 7959 + }, + { + "epoch": 0.0796, + "grad_norm": 1.2487463146426276, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 7960 + }, + { + "epoch": 0.07961, + "grad_norm": 0.6826790561980052, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 7961 + }, + { + "epoch": 0.07962, + "grad_norm": 0.7451214520986289, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 7962 + }, + { + "epoch": 0.07963, + "grad_norm": 0.9631217756846927, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 7963 + }, + { + "epoch": 0.07964, + "grad_norm": 1.1941093560148814, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 7964 + }, + { + "epoch": 0.07965, + "grad_norm": 0.7914311934718551, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 7965 + }, + { + "epoch": 0.07966, + "grad_norm": 0.6290523078446999, + "learning_rate": 0.003, + "loss": 4.087, + "step": 7966 + }, + { + "epoch": 0.07967, + "grad_norm": 0.6518156345361048, + "learning_rate": 0.003, + "loss": 4.111, + "step": 7967 + }, + { + "epoch": 0.07968, + "grad_norm": 0.6214813237738295, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 7968 + }, + { + "epoch": 0.07969, + "grad_norm": 0.5554977099941081, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 7969 + }, + { + "epoch": 0.0797, + "grad_norm": 0.5640600564781908, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 7970 + }, + { + "epoch": 0.07971, + "grad_norm": 0.6596519402139773, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 7971 + }, + { + "epoch": 0.07972, + "grad_norm": 0.8545861020949079, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 7972 + }, + { + "epoch": 0.07973, + "grad_norm": 0.865871265631871, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 7973 + }, + { + "epoch": 0.07974, + "grad_norm": 0.7460236260461683, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 7974 + }, + { + "epoch": 0.07975, + "grad_norm": 0.6850055866586392, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 7975 + }, + { + "epoch": 0.07976, + "grad_norm": 0.6735013134508674, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 7976 + }, + { + "epoch": 0.07977, + "grad_norm": 0.7405183514912751, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 7977 + }, + { + "epoch": 0.07978, + "grad_norm": 0.8290886310120359, + "learning_rate": 0.003, + "loss": 4.1145, + "step": 7978 + }, + { + "epoch": 0.07979, + "grad_norm": 0.8453917594571124, + "learning_rate": 0.003, + "loss": 4.093, + "step": 7979 + }, + { + "epoch": 0.0798, + "grad_norm": 0.8981121368882824, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 7980 + }, + { + "epoch": 0.07981, + "grad_norm": 0.8246540871692223, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 7981 + }, + { + "epoch": 0.07982, + "grad_norm": 0.8765802803687193, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 7982 + }, + { + "epoch": 0.07983, + "grad_norm": 0.9675677298543592, + "learning_rate": 0.003, + "loss": 4.1423, + "step": 7983 + }, + { + "epoch": 0.07984, + "grad_norm": 1.0989989237488744, + "learning_rate": 0.003, + "loss": 4.1297, + "step": 7984 + }, + { + "epoch": 0.07985, + "grad_norm": 0.865566009696521, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 7985 + }, + { + "epoch": 0.07986, + "grad_norm": 0.7217554053695046, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 7986 + }, + { + "epoch": 0.07987, + "grad_norm": 0.7223309169228254, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 7987 + }, + { + "epoch": 0.07988, + "grad_norm": 0.7601248914831606, + "learning_rate": 0.003, + "loss": 4.111, + "step": 7988 + }, + { + "epoch": 0.07989, + "grad_norm": 0.754472367102963, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 7989 + }, + { + "epoch": 0.0799, + "grad_norm": 0.7962535081025417, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 7990 + }, + { + "epoch": 0.07991, + "grad_norm": 0.7457639189267391, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 7991 + }, + { + "epoch": 0.07992, + "grad_norm": 0.6851938375119424, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 7992 + }, + { + "epoch": 0.07993, + "grad_norm": 0.6269563237493135, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 7993 + }, + { + "epoch": 0.07994, + "grad_norm": 0.6070969865870026, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 7994 + }, + { + "epoch": 0.07995, + "grad_norm": 0.5627702965519503, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 7995 + }, + { + "epoch": 0.07996, + "grad_norm": 0.5112924542761613, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 7996 + }, + { + "epoch": 0.07997, + "grad_norm": 0.598007231362175, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 7997 + }, + { + "epoch": 0.07998, + "grad_norm": 0.6410257731351261, + "learning_rate": 0.003, + "loss": 4.1248, + "step": 7998 + }, + { + "epoch": 0.07999, + "grad_norm": 0.7458379686223753, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 7999 + }, + { + "epoch": 0.08, + "grad_norm": 0.7034092807624223, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 8000 + }, + { + "epoch": 0.08001, + "grad_norm": 0.5871105361909563, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8001 + }, + { + "epoch": 0.08002, + "grad_norm": 0.5995732080621035, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 8002 + }, + { + "epoch": 0.08003, + "grad_norm": 0.6613967789238772, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 8003 + }, + { + "epoch": 0.08004, + "grad_norm": 0.6523754319361283, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 8004 + }, + { + "epoch": 0.08005, + "grad_norm": 0.7036409698174545, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 8005 + }, + { + "epoch": 0.08006, + "grad_norm": 0.7212028299745142, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 8006 + }, + { + "epoch": 0.08007, + "grad_norm": 0.6666231743781851, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 8007 + }, + { + "epoch": 0.08008, + "grad_norm": 0.7849422476214978, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 8008 + }, + { + "epoch": 0.08009, + "grad_norm": 0.8161881054836299, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 8009 + }, + { + "epoch": 0.0801, + "grad_norm": 0.7682572142792697, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 8010 + }, + { + "epoch": 0.08011, + "grad_norm": 0.777422634390519, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8011 + }, + { + "epoch": 0.08012, + "grad_norm": 0.8100758516563243, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 8012 + }, + { + "epoch": 0.08013, + "grad_norm": 0.7710777007779627, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 8013 + }, + { + "epoch": 0.08014, + "grad_norm": 0.7527995072769255, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 8014 + }, + { + "epoch": 0.08015, + "grad_norm": 0.7561651584547401, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 8015 + }, + { + "epoch": 0.08016, + "grad_norm": 0.7035897971730628, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 8016 + }, + { + "epoch": 0.08017, + "grad_norm": 0.7174150406944461, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 8017 + }, + { + "epoch": 0.08018, + "grad_norm": 0.7624585759410506, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 8018 + }, + { + "epoch": 0.08019, + "grad_norm": 0.7182168882426316, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 8019 + }, + { + "epoch": 0.0802, + "grad_norm": 0.7048358879364466, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 8020 + }, + { + "epoch": 0.08021, + "grad_norm": 0.7497958878822685, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 8021 + }, + { + "epoch": 0.08022, + "grad_norm": 0.821004036800071, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 8022 + }, + { + "epoch": 0.08023, + "grad_norm": 0.9809693423098361, + "learning_rate": 0.003, + "loss": 4.1, + "step": 8023 + }, + { + "epoch": 0.08024, + "grad_norm": 1.052787459999515, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 8024 + }, + { + "epoch": 0.08025, + "grad_norm": 0.8450811401393215, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 8025 + }, + { + "epoch": 0.08026, + "grad_norm": 0.807742847907378, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 8026 + }, + { + "epoch": 0.08027, + "grad_norm": 0.7663131570160219, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 8027 + }, + { + "epoch": 0.08028, + "grad_norm": 0.6840081645216585, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 8028 + }, + { + "epoch": 0.08029, + "grad_norm": 0.6282444638490934, + "learning_rate": 0.003, + "loss": 4.118, + "step": 8029 + }, + { + "epoch": 0.0803, + "grad_norm": 0.5930173859419846, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 8030 + }, + { + "epoch": 0.08031, + "grad_norm": 0.6043437323260166, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 8031 + }, + { + "epoch": 0.08032, + "grad_norm": 0.6211595627768416, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8032 + }, + { + "epoch": 0.08033, + "grad_norm": 0.6326856189776532, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 8033 + }, + { + "epoch": 0.08034, + "grad_norm": 0.7414480977291886, + "learning_rate": 0.003, + "loss": 4.112, + "step": 8034 + }, + { + "epoch": 0.08035, + "grad_norm": 0.847555227207347, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 8035 + }, + { + "epoch": 0.08036, + "grad_norm": 0.8976992992805168, + "learning_rate": 0.003, + "loss": 4.1415, + "step": 8036 + }, + { + "epoch": 0.08037, + "grad_norm": 0.7353446869433018, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8037 + }, + { + "epoch": 0.08038, + "grad_norm": 0.5737562105586347, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 8038 + }, + { + "epoch": 0.08039, + "grad_norm": 0.6223395377390066, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8039 + }, + { + "epoch": 0.0804, + "grad_norm": 0.6460407278049516, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 8040 + }, + { + "epoch": 0.08041, + "grad_norm": 0.6792567658949662, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 8041 + }, + { + "epoch": 0.08042, + "grad_norm": 0.6631671232762439, + "learning_rate": 0.003, + "loss": 4.098, + "step": 8042 + }, + { + "epoch": 0.08043, + "grad_norm": 0.7543589264318384, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 8043 + }, + { + "epoch": 0.08044, + "grad_norm": 0.8201149824912215, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 8044 + }, + { + "epoch": 0.08045, + "grad_norm": 0.8899252198530396, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 8045 + }, + { + "epoch": 0.08046, + "grad_norm": 0.8349673707656354, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 8046 + }, + { + "epoch": 0.08047, + "grad_norm": 0.7510934507359202, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 8047 + }, + { + "epoch": 0.08048, + "grad_norm": 0.7197955096059386, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 8048 + }, + { + "epoch": 0.08049, + "grad_norm": 0.6884550531131161, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 8049 + }, + { + "epoch": 0.0805, + "grad_norm": 0.6875283286820179, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 8050 + }, + { + "epoch": 0.08051, + "grad_norm": 0.6837157282878973, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 8051 + }, + { + "epoch": 0.08052, + "grad_norm": 0.6102356496634533, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 8052 + }, + { + "epoch": 0.08053, + "grad_norm": 0.5591025845354667, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 8053 + }, + { + "epoch": 0.08054, + "grad_norm": 0.6059431986010548, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8054 + }, + { + "epoch": 0.08055, + "grad_norm": 0.5808217412748479, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 8055 + }, + { + "epoch": 0.08056, + "grad_norm": 0.6445970901420555, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 8056 + }, + { + "epoch": 0.08057, + "grad_norm": 0.6726343017392604, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 8057 + }, + { + "epoch": 0.08058, + "grad_norm": 0.5999523335836248, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 8058 + }, + { + "epoch": 0.08059, + "grad_norm": 0.6543038111857801, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 8059 + }, + { + "epoch": 0.0806, + "grad_norm": 0.6982671331368794, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 8060 + }, + { + "epoch": 0.08061, + "grad_norm": 0.7521101541978558, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 8061 + }, + { + "epoch": 0.08062, + "grad_norm": 0.8023563510336984, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 8062 + }, + { + "epoch": 0.08063, + "grad_norm": 0.8276610456091178, + "learning_rate": 0.003, + "loss": 4.134, + "step": 8063 + }, + { + "epoch": 0.08064, + "grad_norm": 0.9822043974280728, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8064 + }, + { + "epoch": 0.08065, + "grad_norm": 1.1637487862676754, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 8065 + }, + { + "epoch": 0.08066, + "grad_norm": 0.9728114020342268, + "learning_rate": 0.003, + "loss": 4.136, + "step": 8066 + }, + { + "epoch": 0.08067, + "grad_norm": 0.9136508947005387, + "learning_rate": 0.003, + "loss": 4.1226, + "step": 8067 + }, + { + "epoch": 0.08068, + "grad_norm": 1.069372223697199, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 8068 + }, + { + "epoch": 0.08069, + "grad_norm": 1.045484111398971, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 8069 + }, + { + "epoch": 0.0807, + "grad_norm": 0.9148751637475432, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 8070 + }, + { + "epoch": 0.08071, + "grad_norm": 0.7828647369529286, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 8071 + }, + { + "epoch": 0.08072, + "grad_norm": 0.6962246302015118, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 8072 + }, + { + "epoch": 0.08073, + "grad_norm": 0.7338615119494287, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 8073 + }, + { + "epoch": 0.08074, + "grad_norm": 0.7372129834579164, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 8074 + }, + { + "epoch": 0.08075, + "grad_norm": 0.6637992164097991, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 8075 + }, + { + "epoch": 0.08076, + "grad_norm": 0.7364712485983946, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 8076 + }, + { + "epoch": 0.08077, + "grad_norm": 0.8373976570049751, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 8077 + }, + { + "epoch": 0.08078, + "grad_norm": 0.8081955608957938, + "learning_rate": 0.003, + "loss": 4.119, + "step": 8078 + }, + { + "epoch": 0.08079, + "grad_norm": 0.6241082470180894, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 8079 + }, + { + "epoch": 0.0808, + "grad_norm": 0.5925082920698637, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 8080 + }, + { + "epoch": 0.08081, + "grad_norm": 0.5771081693438284, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 8081 + }, + { + "epoch": 0.08082, + "grad_norm": 0.5644671641528122, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 8082 + }, + { + "epoch": 0.08083, + "grad_norm": 0.5574560207893343, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8083 + }, + { + "epoch": 0.08084, + "grad_norm": 0.5516245286767989, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 8084 + }, + { + "epoch": 0.08085, + "grad_norm": 0.4994799258132661, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 8085 + }, + { + "epoch": 0.08086, + "grad_norm": 0.5387898877477255, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 8086 + }, + { + "epoch": 0.08087, + "grad_norm": 0.5279461986880152, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 8087 + }, + { + "epoch": 0.08088, + "grad_norm": 0.6653142602263723, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 8088 + }, + { + "epoch": 0.08089, + "grad_norm": 0.7838979326031138, + "learning_rate": 0.003, + "loss": 4.063, + "step": 8089 + }, + { + "epoch": 0.0809, + "grad_norm": 0.7172870093397796, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 8090 + }, + { + "epoch": 0.08091, + "grad_norm": 0.6970224864376466, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 8091 + }, + { + "epoch": 0.08092, + "grad_norm": 0.8698816808081968, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 8092 + }, + { + "epoch": 0.08093, + "grad_norm": 0.8903936647822183, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 8093 + }, + { + "epoch": 0.08094, + "grad_norm": 0.8170859896883738, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 8094 + }, + { + "epoch": 0.08095, + "grad_norm": 0.630198435688837, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 8095 + }, + { + "epoch": 0.08096, + "grad_norm": 0.561670164851885, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 8096 + }, + { + "epoch": 0.08097, + "grad_norm": 0.6003643063916327, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 8097 + }, + { + "epoch": 0.08098, + "grad_norm": 0.5928995388219754, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 8098 + }, + { + "epoch": 0.08099, + "grad_norm": 0.613396859162041, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 8099 + }, + { + "epoch": 0.081, + "grad_norm": 0.5937607661582928, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8100 + }, + { + "epoch": 0.08101, + "grad_norm": 0.5633649505477989, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 8101 + }, + { + "epoch": 0.08102, + "grad_norm": 0.517609472371696, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 8102 + }, + { + "epoch": 0.08103, + "grad_norm": 0.5099084616246116, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 8103 + }, + { + "epoch": 0.08104, + "grad_norm": 0.5423305382438013, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 8104 + }, + { + "epoch": 0.08105, + "grad_norm": 0.5677547112279588, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8105 + }, + { + "epoch": 0.08106, + "grad_norm": 0.6327286014925814, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8106 + }, + { + "epoch": 0.08107, + "grad_norm": 0.5935405615580722, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 8107 + }, + { + "epoch": 0.08108, + "grad_norm": 0.5906936959536371, + "learning_rate": 0.003, + "loss": 4.041, + "step": 8108 + }, + { + "epoch": 0.08109, + "grad_norm": 0.7713130902488691, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 8109 + }, + { + "epoch": 0.0811, + "grad_norm": 1.089819423816047, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 8110 + }, + { + "epoch": 0.08111, + "grad_norm": 0.9827588739551303, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 8111 + }, + { + "epoch": 0.08112, + "grad_norm": 1.0089600330173312, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 8112 + }, + { + "epoch": 0.08113, + "grad_norm": 0.9171567562711044, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 8113 + }, + { + "epoch": 0.08114, + "grad_norm": 0.8631573233521649, + "learning_rate": 0.003, + "loss": 4.127, + "step": 8114 + }, + { + "epoch": 0.08115, + "grad_norm": 0.8166838272437511, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 8115 + }, + { + "epoch": 0.08116, + "grad_norm": 0.788874725717097, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 8116 + }, + { + "epoch": 0.08117, + "grad_norm": 0.7767336993314875, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 8117 + }, + { + "epoch": 0.08118, + "grad_norm": 0.9294129096540645, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 8118 + }, + { + "epoch": 0.08119, + "grad_norm": 0.8162169856495585, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 8119 + }, + { + "epoch": 0.0812, + "grad_norm": 0.8009999394525374, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 8120 + }, + { + "epoch": 0.08121, + "grad_norm": 0.8865514748225773, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 8121 + }, + { + "epoch": 0.08122, + "grad_norm": 0.9884968101525333, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 8122 + }, + { + "epoch": 0.08123, + "grad_norm": 1.0369283181463518, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 8123 + }, + { + "epoch": 0.08124, + "grad_norm": 0.7962033997430238, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 8124 + }, + { + "epoch": 0.08125, + "grad_norm": 0.6823614488148949, + "learning_rate": 0.003, + "loss": 4.1377, + "step": 8125 + }, + { + "epoch": 0.08126, + "grad_norm": 0.7327655636296029, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 8126 + }, + { + "epoch": 0.08127, + "grad_norm": 0.7378713020854091, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 8127 + }, + { + "epoch": 0.08128, + "grad_norm": 0.7764634911842385, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 8128 + }, + { + "epoch": 0.08129, + "grad_norm": 0.767500774593417, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8129 + }, + { + "epoch": 0.0813, + "grad_norm": 0.7754642716859383, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 8130 + }, + { + "epoch": 0.08131, + "grad_norm": 0.7938632639450202, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 8131 + }, + { + "epoch": 0.08132, + "grad_norm": 0.7890658387507653, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 8132 + }, + { + "epoch": 0.08133, + "grad_norm": 0.7935294120537186, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8133 + }, + { + "epoch": 0.08134, + "grad_norm": 0.8391989214547845, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 8134 + }, + { + "epoch": 0.08135, + "grad_norm": 0.9554035900858945, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 8135 + }, + { + "epoch": 0.08136, + "grad_norm": 0.9663343129111622, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 8136 + }, + { + "epoch": 0.08137, + "grad_norm": 0.8734638206822923, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 8137 + }, + { + "epoch": 0.08138, + "grad_norm": 0.9659073383764333, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8138 + }, + { + "epoch": 0.08139, + "grad_norm": 1.0174128187111928, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 8139 + }, + { + "epoch": 0.0814, + "grad_norm": 0.8902331333269284, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 8140 + }, + { + "epoch": 0.08141, + "grad_norm": 0.8349574760386372, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 8141 + }, + { + "epoch": 0.08142, + "grad_norm": 0.7575245098644054, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 8142 + }, + { + "epoch": 0.08143, + "grad_norm": 0.8507077034723728, + "learning_rate": 0.003, + "loss": 4.1328, + "step": 8143 + }, + { + "epoch": 0.08144, + "grad_norm": 0.9743221963844606, + "learning_rate": 0.003, + "loss": 4.107, + "step": 8144 + }, + { + "epoch": 0.08145, + "grad_norm": 0.9476119009807066, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 8145 + }, + { + "epoch": 0.08146, + "grad_norm": 0.9002626878014724, + "learning_rate": 0.003, + "loss": 4.1448, + "step": 8146 + }, + { + "epoch": 0.08147, + "grad_norm": 0.8588231090576398, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 8147 + }, + { + "epoch": 0.08148, + "grad_norm": 0.7514699828415461, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 8148 + }, + { + "epoch": 0.08149, + "grad_norm": 0.6673420124266218, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 8149 + }, + { + "epoch": 0.0815, + "grad_norm": 0.7038005384460582, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8150 + }, + { + "epoch": 0.08151, + "grad_norm": 0.7264508313079431, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 8151 + }, + { + "epoch": 0.08152, + "grad_norm": 0.8385234903173766, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 8152 + }, + { + "epoch": 0.08153, + "grad_norm": 0.8772929826726037, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 8153 + }, + { + "epoch": 0.08154, + "grad_norm": 0.773933052101485, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 8154 + }, + { + "epoch": 0.08155, + "grad_norm": 0.5418340291312099, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 8155 + }, + { + "epoch": 0.08156, + "grad_norm": 0.612247602405888, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 8156 + }, + { + "epoch": 0.08157, + "grad_norm": 0.6995306526237685, + "learning_rate": 0.003, + "loss": 4.054, + "step": 8157 + }, + { + "epoch": 0.08158, + "grad_norm": 0.6914288085424243, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 8158 + }, + { + "epoch": 0.08159, + "grad_norm": 0.5639754454368973, + "learning_rate": 0.003, + "loss": 4.126, + "step": 8159 + }, + { + "epoch": 0.0816, + "grad_norm": 0.5308643800912878, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 8160 + }, + { + "epoch": 0.08161, + "grad_norm": 0.4794329031430582, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 8161 + }, + { + "epoch": 0.08162, + "grad_norm": 0.3941630661211312, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8162 + }, + { + "epoch": 0.08163, + "grad_norm": 0.3500016460344925, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 8163 + }, + { + "epoch": 0.08164, + "grad_norm": 0.36562869444919177, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 8164 + }, + { + "epoch": 0.08165, + "grad_norm": 0.43629694271249264, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 8165 + }, + { + "epoch": 0.08166, + "grad_norm": 0.5708500396948257, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8166 + }, + { + "epoch": 0.08167, + "grad_norm": 0.7973454394619072, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 8167 + }, + { + "epoch": 0.08168, + "grad_norm": 1.0166589750002435, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 8168 + }, + { + "epoch": 0.08169, + "grad_norm": 1.0549264180537317, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 8169 + }, + { + "epoch": 0.0817, + "grad_norm": 0.71968050278163, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 8170 + }, + { + "epoch": 0.08171, + "grad_norm": 0.6543779608988335, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 8171 + }, + { + "epoch": 0.08172, + "grad_norm": 0.6293237774913742, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 8172 + }, + { + "epoch": 0.08173, + "grad_norm": 0.6399157682768067, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 8173 + }, + { + "epoch": 0.08174, + "grad_norm": 0.6938917181866926, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 8174 + }, + { + "epoch": 0.08175, + "grad_norm": 0.6568284294958178, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 8175 + }, + { + "epoch": 0.08176, + "grad_norm": 0.6514625676960912, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 8176 + }, + { + "epoch": 0.08177, + "grad_norm": 0.6436411792736433, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 8177 + }, + { + "epoch": 0.08178, + "grad_norm": 0.6566610790643876, + "learning_rate": 0.003, + "loss": 4.088, + "step": 8178 + }, + { + "epoch": 0.08179, + "grad_norm": 0.6775002449384784, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 8179 + }, + { + "epoch": 0.0818, + "grad_norm": 0.6047058885443021, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 8180 + }, + { + "epoch": 0.08181, + "grad_norm": 0.49255328216218053, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 8181 + }, + { + "epoch": 0.08182, + "grad_norm": 0.6513781988399094, + "learning_rate": 0.003, + "loss": 4.12, + "step": 8182 + }, + { + "epoch": 0.08183, + "grad_norm": 0.756600046387577, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 8183 + }, + { + "epoch": 0.08184, + "grad_norm": 0.8214276567397872, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 8184 + }, + { + "epoch": 0.08185, + "grad_norm": 0.8752383489148959, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 8185 + }, + { + "epoch": 0.08186, + "grad_norm": 0.8441519707828739, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 8186 + }, + { + "epoch": 0.08187, + "grad_norm": 0.7672475601697205, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 8187 + }, + { + "epoch": 0.08188, + "grad_norm": 0.7282380600276215, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 8188 + }, + { + "epoch": 0.08189, + "grad_norm": 0.6846705238578715, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 8189 + }, + { + "epoch": 0.0819, + "grad_norm": 0.7207597619767919, + "learning_rate": 0.003, + "loss": 4.092, + "step": 8190 + }, + { + "epoch": 0.08191, + "grad_norm": 0.7101247154479835, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 8191 + }, + { + "epoch": 0.08192, + "grad_norm": 0.7686485468774331, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 8192 + }, + { + "epoch": 0.08193, + "grad_norm": 0.7889428905614293, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 8193 + }, + { + "epoch": 0.08194, + "grad_norm": 0.9153699308382264, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 8194 + }, + { + "epoch": 0.08195, + "grad_norm": 0.8233344904776073, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8195 + }, + { + "epoch": 0.08196, + "grad_norm": 0.6057143745665352, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 8196 + }, + { + "epoch": 0.08197, + "grad_norm": 0.5644875985686773, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 8197 + }, + { + "epoch": 0.08198, + "grad_norm": 0.5894267537410771, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 8198 + }, + { + "epoch": 0.08199, + "grad_norm": 0.692674390552851, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 8199 + }, + { + "epoch": 0.082, + "grad_norm": 0.7242086618626092, + "learning_rate": 0.003, + "loss": 4.074, + "step": 8200 + }, + { + "epoch": 0.08201, + "grad_norm": 0.7495656742172861, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 8201 + }, + { + "epoch": 0.08202, + "grad_norm": 0.5902754450349533, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 8202 + }, + { + "epoch": 0.08203, + "grad_norm": 0.4939528228400696, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8203 + }, + { + "epoch": 0.08204, + "grad_norm": 0.5819717473152258, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 8204 + }, + { + "epoch": 0.08205, + "grad_norm": 0.6610578196934844, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 8205 + }, + { + "epoch": 0.08206, + "grad_norm": 0.6833022930778619, + "learning_rate": 0.003, + "loss": 4.12, + "step": 8206 + }, + { + "epoch": 0.08207, + "grad_norm": 0.7109226906588522, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 8207 + }, + { + "epoch": 0.08208, + "grad_norm": 0.6863440404860754, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8208 + }, + { + "epoch": 0.08209, + "grad_norm": 0.7395650743809461, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 8209 + }, + { + "epoch": 0.0821, + "grad_norm": 0.9883215726717448, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 8210 + }, + { + "epoch": 0.08211, + "grad_norm": 1.2385395849984975, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 8211 + }, + { + "epoch": 0.08212, + "grad_norm": 0.8918876274434122, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 8212 + }, + { + "epoch": 0.08213, + "grad_norm": 0.8581489019919268, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 8213 + }, + { + "epoch": 0.08214, + "grad_norm": 0.8197346780442005, + "learning_rate": 0.003, + "loss": 4.1313, + "step": 8214 + }, + { + "epoch": 0.08215, + "grad_norm": 0.762507621386095, + "learning_rate": 0.003, + "loss": 4.1312, + "step": 8215 + }, + { + "epoch": 0.08216, + "grad_norm": 0.7489221002140064, + "learning_rate": 0.003, + "loss": 4.105, + "step": 8216 + }, + { + "epoch": 0.08217, + "grad_norm": 0.9152826477695661, + "learning_rate": 0.003, + "loss": 4.1414, + "step": 8217 + }, + { + "epoch": 0.08218, + "grad_norm": 1.0382652565254082, + "learning_rate": 0.003, + "loss": 4.1102, + "step": 8218 + }, + { + "epoch": 0.08219, + "grad_norm": 1.026804807644373, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 8219 + }, + { + "epoch": 0.0822, + "grad_norm": 1.0424560881490594, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 8220 + }, + { + "epoch": 0.08221, + "grad_norm": 0.8714512313495743, + "learning_rate": 0.003, + "loss": 4.1334, + "step": 8221 + }, + { + "epoch": 0.08222, + "grad_norm": 0.7651631883270994, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 8222 + }, + { + "epoch": 0.08223, + "grad_norm": 0.729346414097472, + "learning_rate": 0.003, + "loss": 4.125, + "step": 8223 + }, + { + "epoch": 0.08224, + "grad_norm": 0.7979340889752132, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 8224 + }, + { + "epoch": 0.08225, + "grad_norm": 0.7645944759378901, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 8225 + }, + { + "epoch": 0.08226, + "grad_norm": 0.5448138887864806, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 8226 + }, + { + "epoch": 0.08227, + "grad_norm": 0.5856081470610704, + "learning_rate": 0.003, + "loss": 4.1312, + "step": 8227 + }, + { + "epoch": 0.08228, + "grad_norm": 0.6128538428938678, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 8228 + }, + { + "epoch": 0.08229, + "grad_norm": 0.5828543069405947, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 8229 + }, + { + "epoch": 0.0823, + "grad_norm": 0.5252226962559838, + "learning_rate": 0.003, + "loss": 4.1198, + "step": 8230 + }, + { + "epoch": 0.08231, + "grad_norm": 0.5557346682459137, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 8231 + }, + { + "epoch": 0.08232, + "grad_norm": 0.5650481326541311, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 8232 + }, + { + "epoch": 0.08233, + "grad_norm": 0.5634969006102941, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 8233 + }, + { + "epoch": 0.08234, + "grad_norm": 0.5450247074135766, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 8234 + }, + { + "epoch": 0.08235, + "grad_norm": 0.499938817117624, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 8235 + }, + { + "epoch": 0.08236, + "grad_norm": 0.5296381965012722, + "learning_rate": 0.003, + "loss": 4.076, + "step": 8236 + }, + { + "epoch": 0.08237, + "grad_norm": 0.547808654408303, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 8237 + }, + { + "epoch": 0.08238, + "grad_norm": 0.5329237503159692, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 8238 + }, + { + "epoch": 0.08239, + "grad_norm": 0.6012253849536762, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 8239 + }, + { + "epoch": 0.0824, + "grad_norm": 0.7137927518105237, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 8240 + }, + { + "epoch": 0.08241, + "grad_norm": 0.9032854526366563, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8241 + }, + { + "epoch": 0.08242, + "grad_norm": 1.2043691943864299, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 8242 + }, + { + "epoch": 0.08243, + "grad_norm": 0.8643493496322304, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 8243 + }, + { + "epoch": 0.08244, + "grad_norm": 0.8144027974165489, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 8244 + }, + { + "epoch": 0.08245, + "grad_norm": 0.7635515232637834, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 8245 + }, + { + "epoch": 0.08246, + "grad_norm": 0.7163154200389772, + "learning_rate": 0.003, + "loss": 4.095, + "step": 8246 + }, + { + "epoch": 0.08247, + "grad_norm": 0.748090338264524, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 8247 + }, + { + "epoch": 0.08248, + "grad_norm": 0.5601792675667456, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 8248 + }, + { + "epoch": 0.08249, + "grad_norm": 0.5660826911158605, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 8249 + }, + { + "epoch": 0.0825, + "grad_norm": 0.49777269961971965, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 8250 + }, + { + "epoch": 0.08251, + "grad_norm": 0.4749763269578897, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 8251 + }, + { + "epoch": 0.08252, + "grad_norm": 0.5367890821268732, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 8252 + }, + { + "epoch": 0.08253, + "grad_norm": 0.6209172018514464, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 8253 + }, + { + "epoch": 0.08254, + "grad_norm": 0.7460752011518125, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8254 + }, + { + "epoch": 0.08255, + "grad_norm": 0.8859483471117526, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 8255 + }, + { + "epoch": 0.08256, + "grad_norm": 0.984289350625384, + "learning_rate": 0.003, + "loss": 4.122, + "step": 8256 + }, + { + "epoch": 0.08257, + "grad_norm": 1.0835433642153152, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 8257 + }, + { + "epoch": 0.08258, + "grad_norm": 0.8632204878935943, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 8258 + }, + { + "epoch": 0.08259, + "grad_norm": 0.7275448679612547, + "learning_rate": 0.003, + "loss": 4.1329, + "step": 8259 + }, + { + "epoch": 0.0826, + "grad_norm": 0.6773298349818122, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 8260 + }, + { + "epoch": 0.08261, + "grad_norm": 0.7281743646317274, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 8261 + }, + { + "epoch": 0.08262, + "grad_norm": 0.9373357855734575, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 8262 + }, + { + "epoch": 0.08263, + "grad_norm": 0.9851510358265243, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 8263 + }, + { + "epoch": 0.08264, + "grad_norm": 0.9213918489568741, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 8264 + }, + { + "epoch": 0.08265, + "grad_norm": 0.8995466212053259, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 8265 + }, + { + "epoch": 0.08266, + "grad_norm": 0.8095431576329132, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 8266 + }, + { + "epoch": 0.08267, + "grad_norm": 0.8603884649749075, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 8267 + }, + { + "epoch": 0.08268, + "grad_norm": 0.8697551596264255, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 8268 + }, + { + "epoch": 0.08269, + "grad_norm": 0.8766282707684185, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 8269 + }, + { + "epoch": 0.0827, + "grad_norm": 0.7471962671773691, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 8270 + }, + { + "epoch": 0.08271, + "grad_norm": 0.6435084919846199, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 8271 + }, + { + "epoch": 0.08272, + "grad_norm": 0.5510784750295802, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 8272 + }, + { + "epoch": 0.08273, + "grad_norm": 0.5988908318224689, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 8273 + }, + { + "epoch": 0.08274, + "grad_norm": 0.6072551102960675, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 8274 + }, + { + "epoch": 0.08275, + "grad_norm": 0.6062453075526276, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8275 + }, + { + "epoch": 0.08276, + "grad_norm": 0.5278924020145801, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 8276 + }, + { + "epoch": 0.08277, + "grad_norm": 0.5216351289934781, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 8277 + }, + { + "epoch": 0.08278, + "grad_norm": 0.5235445369007313, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 8278 + }, + { + "epoch": 0.08279, + "grad_norm": 0.4302544923490237, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 8279 + }, + { + "epoch": 0.0828, + "grad_norm": 0.45314513906146026, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 8280 + }, + { + "epoch": 0.08281, + "grad_norm": 0.4947819664319072, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 8281 + }, + { + "epoch": 0.08282, + "grad_norm": 0.4700115344785109, + "learning_rate": 0.003, + "loss": 4.075, + "step": 8282 + }, + { + "epoch": 0.08283, + "grad_norm": 0.47458184523582214, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 8283 + }, + { + "epoch": 0.08284, + "grad_norm": 0.6217194123770353, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 8284 + }, + { + "epoch": 0.08285, + "grad_norm": 0.7679318114887002, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8285 + }, + { + "epoch": 0.08286, + "grad_norm": 1.0091565333262968, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 8286 + }, + { + "epoch": 0.08287, + "grad_norm": 1.0308871533238988, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8287 + }, + { + "epoch": 0.08288, + "grad_norm": 0.7715553741553134, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 8288 + }, + { + "epoch": 0.08289, + "grad_norm": 0.7246211823017321, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 8289 + }, + { + "epoch": 0.0829, + "grad_norm": 0.6693388476539852, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 8290 + }, + { + "epoch": 0.08291, + "grad_norm": 0.6792730649441251, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 8291 + }, + { + "epoch": 0.08292, + "grad_norm": 0.715764254394465, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 8292 + }, + { + "epoch": 0.08293, + "grad_norm": 0.8330895077359063, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 8293 + }, + { + "epoch": 0.08294, + "grad_norm": 0.8999946195152977, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 8294 + }, + { + "epoch": 0.08295, + "grad_norm": 0.9722144185539187, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 8295 + }, + { + "epoch": 0.08296, + "grad_norm": 0.8945126241906354, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8296 + }, + { + "epoch": 0.08297, + "grad_norm": 0.931976526589576, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 8297 + }, + { + "epoch": 0.08298, + "grad_norm": 1.0550600149143023, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 8298 + }, + { + "epoch": 0.08299, + "grad_norm": 1.0038590631217534, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 8299 + }, + { + "epoch": 0.083, + "grad_norm": 1.0483622801143657, + "learning_rate": 0.003, + "loss": 4.1615, + "step": 8300 + }, + { + "epoch": 0.08301, + "grad_norm": 0.8663552032951759, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 8301 + }, + { + "epoch": 0.08302, + "grad_norm": 0.9615360827126928, + "learning_rate": 0.003, + "loss": 4.1514, + "step": 8302 + }, + { + "epoch": 0.08303, + "grad_norm": 0.8375481767872405, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 8303 + }, + { + "epoch": 0.08304, + "grad_norm": 0.7453597337442939, + "learning_rate": 0.003, + "loss": 4.1436, + "step": 8304 + }, + { + "epoch": 0.08305, + "grad_norm": 0.7131331051227506, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 8305 + }, + { + "epoch": 0.08306, + "grad_norm": 0.7124866649665565, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 8306 + }, + { + "epoch": 0.08307, + "grad_norm": 0.8250868166173586, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 8307 + }, + { + "epoch": 0.08308, + "grad_norm": 0.9061398647327097, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 8308 + }, + { + "epoch": 0.08309, + "grad_norm": 1.0307311740161893, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 8309 + }, + { + "epoch": 0.0831, + "grad_norm": 0.992744547897445, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 8310 + }, + { + "epoch": 0.08311, + "grad_norm": 0.7488997564809391, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 8311 + }, + { + "epoch": 0.08312, + "grad_norm": 0.7053261274926852, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 8312 + }, + { + "epoch": 0.08313, + "grad_norm": 0.6593948632063114, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8313 + }, + { + "epoch": 0.08314, + "grad_norm": 0.6995852846034631, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 8314 + }, + { + "epoch": 0.08315, + "grad_norm": 0.7002924867496525, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 8315 + }, + { + "epoch": 0.08316, + "grad_norm": 0.754368216614145, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 8316 + }, + { + "epoch": 0.08317, + "grad_norm": 0.7750729970073041, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8317 + }, + { + "epoch": 0.08318, + "grad_norm": 0.6364553809945898, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 8318 + }, + { + "epoch": 0.08319, + "grad_norm": 0.5773708268407741, + "learning_rate": 0.003, + "loss": 4.081, + "step": 8319 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6001536418360034, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 8320 + }, + { + "epoch": 0.08321, + "grad_norm": 0.5089768075103857, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 8321 + }, + { + "epoch": 0.08322, + "grad_norm": 0.518428766042433, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8322 + }, + { + "epoch": 0.08323, + "grad_norm": 0.5173998718635492, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 8323 + }, + { + "epoch": 0.08324, + "grad_norm": 0.4750014989168887, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 8324 + }, + { + "epoch": 0.08325, + "grad_norm": 0.5855181107353905, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 8325 + }, + { + "epoch": 0.08326, + "grad_norm": 0.7628006782684289, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 8326 + }, + { + "epoch": 0.08327, + "grad_norm": 1.0618146389847019, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8327 + }, + { + "epoch": 0.08328, + "grad_norm": 0.9388504158158667, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 8328 + }, + { + "epoch": 0.08329, + "grad_norm": 0.6800766818090532, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 8329 + }, + { + "epoch": 0.0833, + "grad_norm": 0.5773008995872342, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 8330 + }, + { + "epoch": 0.08331, + "grad_norm": 0.5748792212675666, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 8331 + }, + { + "epoch": 0.08332, + "grad_norm": 0.5541285891852565, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 8332 + }, + { + "epoch": 0.08333, + "grad_norm": 0.5416560426836903, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 8333 + }, + { + "epoch": 0.08334, + "grad_norm": 0.5432886614396639, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 8334 + }, + { + "epoch": 0.08335, + "grad_norm": 0.6393368606314609, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 8335 + }, + { + "epoch": 0.08336, + "grad_norm": 0.670088341618419, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 8336 + }, + { + "epoch": 0.08337, + "grad_norm": 0.759669728240697, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 8337 + }, + { + "epoch": 0.08338, + "grad_norm": 0.7670617917071586, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 8338 + }, + { + "epoch": 0.08339, + "grad_norm": 0.7599957118398699, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8339 + }, + { + "epoch": 0.0834, + "grad_norm": 0.8032482037405414, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 8340 + }, + { + "epoch": 0.08341, + "grad_norm": 0.7742413499026626, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 8341 + }, + { + "epoch": 0.08342, + "grad_norm": 0.7647103140349684, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 8342 + }, + { + "epoch": 0.08343, + "grad_norm": 0.6857455040634841, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 8343 + }, + { + "epoch": 0.08344, + "grad_norm": 0.6285429482356256, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 8344 + }, + { + "epoch": 0.08345, + "grad_norm": 0.7221980546699931, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 8345 + }, + { + "epoch": 0.08346, + "grad_norm": 0.7143054001868141, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 8346 + }, + { + "epoch": 0.08347, + "grad_norm": 0.6385179914801554, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 8347 + }, + { + "epoch": 0.08348, + "grad_norm": 0.6990845577714458, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 8348 + }, + { + "epoch": 0.08349, + "grad_norm": 0.6913429696640464, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 8349 + }, + { + "epoch": 0.0835, + "grad_norm": 0.7192577771703562, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 8350 + }, + { + "epoch": 0.08351, + "grad_norm": 0.7760395598930231, + "learning_rate": 0.003, + "loss": 4.116, + "step": 8351 + }, + { + "epoch": 0.08352, + "grad_norm": 0.7993190167350477, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 8352 + }, + { + "epoch": 0.08353, + "grad_norm": 0.7398688625286849, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 8353 + }, + { + "epoch": 0.08354, + "grad_norm": 0.6834075220306864, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 8354 + }, + { + "epoch": 0.08355, + "grad_norm": 0.8175214131818854, + "learning_rate": 0.003, + "loss": 4.08, + "step": 8355 + }, + { + "epoch": 0.08356, + "grad_norm": 0.9414888867819124, + "learning_rate": 0.003, + "loss": 4.112, + "step": 8356 + }, + { + "epoch": 0.08357, + "grad_norm": 0.960843920438772, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 8357 + }, + { + "epoch": 0.08358, + "grad_norm": 0.9609403468914611, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 8358 + }, + { + "epoch": 0.08359, + "grad_norm": 0.9295346213960959, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8359 + }, + { + "epoch": 0.0836, + "grad_norm": 0.9085274043267699, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 8360 + }, + { + "epoch": 0.08361, + "grad_norm": 0.8071363639869016, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8361 + }, + { + "epoch": 0.08362, + "grad_norm": 0.9382775907297787, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 8362 + }, + { + "epoch": 0.08363, + "grad_norm": 0.8709824514663582, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 8363 + }, + { + "epoch": 0.08364, + "grad_norm": 0.675740458123866, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 8364 + }, + { + "epoch": 0.08365, + "grad_norm": 0.6496018714951994, + "learning_rate": 0.003, + "loss": 4.112, + "step": 8365 + }, + { + "epoch": 0.08366, + "grad_norm": 0.6610790017908971, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 8366 + }, + { + "epoch": 0.08367, + "grad_norm": 0.6766179312441892, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 8367 + }, + { + "epoch": 0.08368, + "grad_norm": 0.6048963944008922, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 8368 + }, + { + "epoch": 0.08369, + "grad_norm": 0.5577980801642201, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 8369 + }, + { + "epoch": 0.0837, + "grad_norm": 0.5452065922923452, + "learning_rate": 0.003, + "loss": 4.093, + "step": 8370 + }, + { + "epoch": 0.08371, + "grad_norm": 0.5328227419403814, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 8371 + }, + { + "epoch": 0.08372, + "grad_norm": 0.5572436569854103, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 8372 + }, + { + "epoch": 0.08373, + "grad_norm": 0.595024680031319, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 8373 + }, + { + "epoch": 0.08374, + "grad_norm": 0.6171013061042033, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 8374 + }, + { + "epoch": 0.08375, + "grad_norm": 0.6632173727246423, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 8375 + }, + { + "epoch": 0.08376, + "grad_norm": 0.7586043969017534, + "learning_rate": 0.003, + "loss": 4.084, + "step": 8376 + }, + { + "epoch": 0.08377, + "grad_norm": 0.8201258736560947, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 8377 + }, + { + "epoch": 0.08378, + "grad_norm": 0.7660059240272924, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 8378 + }, + { + "epoch": 0.08379, + "grad_norm": 0.6820454577432933, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 8379 + }, + { + "epoch": 0.0838, + "grad_norm": 0.6391725474755902, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 8380 + }, + { + "epoch": 0.08381, + "grad_norm": 0.626073359728838, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 8381 + }, + { + "epoch": 0.08382, + "grad_norm": 0.6537353438521994, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 8382 + }, + { + "epoch": 0.08383, + "grad_norm": 0.5286621728800187, + "learning_rate": 0.003, + "loss": 4.066, + "step": 8383 + }, + { + "epoch": 0.08384, + "grad_norm": 0.49570196885919826, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 8384 + }, + { + "epoch": 0.08385, + "grad_norm": 0.4720817397929105, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 8385 + }, + { + "epoch": 0.08386, + "grad_norm": 0.4116758591839013, + "learning_rate": 0.003, + "loss": 4.075, + "step": 8386 + }, + { + "epoch": 0.08387, + "grad_norm": 0.45306876446030137, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 8387 + }, + { + "epoch": 0.08388, + "grad_norm": 0.6179847808239425, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 8388 + }, + { + "epoch": 0.08389, + "grad_norm": 0.9294947853793362, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 8389 + }, + { + "epoch": 0.0839, + "grad_norm": 1.2031306786524287, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 8390 + }, + { + "epoch": 0.08391, + "grad_norm": 0.8046262538632072, + "learning_rate": 0.003, + "loss": 4.109, + "step": 8391 + }, + { + "epoch": 0.08392, + "grad_norm": 0.882493419185855, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 8392 + }, + { + "epoch": 0.08393, + "grad_norm": 0.8613668633518297, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 8393 + }, + { + "epoch": 0.08394, + "grad_norm": 0.8844121298915572, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 8394 + }, + { + "epoch": 0.08395, + "grad_norm": 0.9156972707334563, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8395 + }, + { + "epoch": 0.08396, + "grad_norm": 0.8431255266855052, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 8396 + }, + { + "epoch": 0.08397, + "grad_norm": 0.8830670687070312, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8397 + }, + { + "epoch": 0.08398, + "grad_norm": 0.9314448769669744, + "learning_rate": 0.003, + "loss": 4.1178, + "step": 8398 + }, + { + "epoch": 0.08399, + "grad_norm": 1.0449422905278423, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 8399 + }, + { + "epoch": 0.084, + "grad_norm": 1.0703912427847535, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 8400 + }, + { + "epoch": 0.08401, + "grad_norm": 0.99544544738125, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 8401 + }, + { + "epoch": 0.08402, + "grad_norm": 0.9137811882497271, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 8402 + }, + { + "epoch": 0.08403, + "grad_norm": 0.8345353163377343, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 8403 + }, + { + "epoch": 0.08404, + "grad_norm": 0.7423619551465458, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 8404 + }, + { + "epoch": 0.08405, + "grad_norm": 0.6917338055216923, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 8405 + }, + { + "epoch": 0.08406, + "grad_norm": 0.6541321327571565, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 8406 + }, + { + "epoch": 0.08407, + "grad_norm": 0.6748689162799533, + "learning_rate": 0.003, + "loss": 4.153, + "step": 8407 + }, + { + "epoch": 0.08408, + "grad_norm": 0.6636547398677606, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 8408 + }, + { + "epoch": 0.08409, + "grad_norm": 0.6820702144071761, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 8409 + }, + { + "epoch": 0.0841, + "grad_norm": 0.6703929070086052, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 8410 + }, + { + "epoch": 0.08411, + "grad_norm": 0.6537163754004126, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 8411 + }, + { + "epoch": 0.08412, + "grad_norm": 0.6180851692722509, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 8412 + }, + { + "epoch": 0.08413, + "grad_norm": 0.5613447966463775, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 8413 + }, + { + "epoch": 0.08414, + "grad_norm": 0.5426935541427719, + "learning_rate": 0.003, + "loss": 4.109, + "step": 8414 + }, + { + "epoch": 0.08415, + "grad_norm": 0.5455362132623575, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 8415 + }, + { + "epoch": 0.08416, + "grad_norm": 0.5671062554035091, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 8416 + }, + { + "epoch": 0.08417, + "grad_norm": 0.6929581320753861, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 8417 + }, + { + "epoch": 0.08418, + "grad_norm": 0.7632658007580719, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 8418 + }, + { + "epoch": 0.08419, + "grad_norm": 0.8207298972699563, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 8419 + }, + { + "epoch": 0.0842, + "grad_norm": 0.8838166900993079, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 8420 + }, + { + "epoch": 0.08421, + "grad_norm": 0.8494239057110717, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 8421 + }, + { + "epoch": 0.08422, + "grad_norm": 0.7486051228897643, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 8422 + }, + { + "epoch": 0.08423, + "grad_norm": 0.6681036680494669, + "learning_rate": 0.003, + "loss": 4.092, + "step": 8423 + }, + { + "epoch": 0.08424, + "grad_norm": 0.6231615422695527, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 8424 + }, + { + "epoch": 0.08425, + "grad_norm": 0.5973371615272063, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 8425 + }, + { + "epoch": 0.08426, + "grad_norm": 0.590081914250373, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8426 + }, + { + "epoch": 0.08427, + "grad_norm": 0.7124778189115221, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 8427 + }, + { + "epoch": 0.08428, + "grad_norm": 0.7887937264015177, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 8428 + }, + { + "epoch": 0.08429, + "grad_norm": 0.7678709133144459, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 8429 + }, + { + "epoch": 0.0843, + "grad_norm": 0.6332332135809398, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 8430 + }, + { + "epoch": 0.08431, + "grad_norm": 0.5107659296097526, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 8431 + }, + { + "epoch": 0.08432, + "grad_norm": 0.48858809488607446, + "learning_rate": 0.003, + "loss": 4.057, + "step": 8432 + }, + { + "epoch": 0.08433, + "grad_norm": 0.5110654376932778, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 8433 + }, + { + "epoch": 0.08434, + "grad_norm": 0.534360910387737, + "learning_rate": 0.003, + "loss": 4.048, + "step": 8434 + }, + { + "epoch": 0.08435, + "grad_norm": 0.5820631960297242, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 8435 + }, + { + "epoch": 0.08436, + "grad_norm": 0.661389118471186, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 8436 + }, + { + "epoch": 0.08437, + "grad_norm": 0.7111949328987919, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 8437 + }, + { + "epoch": 0.08438, + "grad_norm": 0.7522347549752256, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 8438 + }, + { + "epoch": 0.08439, + "grad_norm": 0.9493832565098858, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 8439 + }, + { + "epoch": 0.0844, + "grad_norm": 1.0648764993147652, + "learning_rate": 0.003, + "loss": 4.1258, + "step": 8440 + }, + { + "epoch": 0.08441, + "grad_norm": 0.8414607606679769, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8441 + }, + { + "epoch": 0.08442, + "grad_norm": 0.7257158697574967, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 8442 + }, + { + "epoch": 0.08443, + "grad_norm": 0.6399687963227431, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 8443 + }, + { + "epoch": 0.08444, + "grad_norm": 0.6122601189245481, + "learning_rate": 0.003, + "loss": 4.126, + "step": 8444 + }, + { + "epoch": 0.08445, + "grad_norm": 0.6421561307708404, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 8445 + }, + { + "epoch": 0.08446, + "grad_norm": 0.7720968877758178, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 8446 + }, + { + "epoch": 0.08447, + "grad_norm": 0.9969368300751122, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 8447 + }, + { + "epoch": 0.08448, + "grad_norm": 1.040230047556125, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 8448 + }, + { + "epoch": 0.08449, + "grad_norm": 0.9375823755529817, + "learning_rate": 0.003, + "loss": 4.1266, + "step": 8449 + }, + { + "epoch": 0.0845, + "grad_norm": 0.8771502461318594, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 8450 + }, + { + "epoch": 0.08451, + "grad_norm": 0.7739620306437573, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 8451 + }, + { + "epoch": 0.08452, + "grad_norm": 0.673756075602562, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 8452 + }, + { + "epoch": 0.08453, + "grad_norm": 0.6543280916386777, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 8453 + }, + { + "epoch": 0.08454, + "grad_norm": 0.6889773663662369, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 8454 + }, + { + "epoch": 0.08455, + "grad_norm": 0.7838053719288101, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8455 + }, + { + "epoch": 0.08456, + "grad_norm": 0.8119139733359273, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 8456 + }, + { + "epoch": 0.08457, + "grad_norm": 0.6837577523016988, + "learning_rate": 0.003, + "loss": 4.1321, + "step": 8457 + }, + { + "epoch": 0.08458, + "grad_norm": 0.6923132881641682, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 8458 + }, + { + "epoch": 0.08459, + "grad_norm": 0.7691218378497301, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 8459 + }, + { + "epoch": 0.0846, + "grad_norm": 0.7521614723982595, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 8460 + }, + { + "epoch": 0.08461, + "grad_norm": 0.7593018376927517, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 8461 + }, + { + "epoch": 0.08462, + "grad_norm": 0.8660621840990166, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 8462 + }, + { + "epoch": 0.08463, + "grad_norm": 0.832579407041644, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 8463 + }, + { + "epoch": 0.08464, + "grad_norm": 0.8496293669573718, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 8464 + }, + { + "epoch": 0.08465, + "grad_norm": 0.9447869224057244, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 8465 + }, + { + "epoch": 0.08466, + "grad_norm": 0.9693005746173255, + "learning_rate": 0.003, + "loss": 4.129, + "step": 8466 + }, + { + "epoch": 0.08467, + "grad_norm": 0.6866322634836225, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 8467 + }, + { + "epoch": 0.08468, + "grad_norm": 0.7903651480232442, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 8468 + }, + { + "epoch": 0.08469, + "grad_norm": 0.7645618738122506, + "learning_rate": 0.003, + "loss": 4.1166, + "step": 8469 + }, + { + "epoch": 0.0847, + "grad_norm": 0.7413695835335838, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 8470 + }, + { + "epoch": 0.08471, + "grad_norm": 0.7231784570238742, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 8471 + }, + { + "epoch": 0.08472, + "grad_norm": 0.931451402440091, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 8472 + }, + { + "epoch": 0.08473, + "grad_norm": 1.1881300365948153, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8473 + }, + { + "epoch": 0.08474, + "grad_norm": 0.7884945285700253, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 8474 + }, + { + "epoch": 0.08475, + "grad_norm": 0.6466505880641876, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 8475 + }, + { + "epoch": 0.08476, + "grad_norm": 0.6843402752034309, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 8476 + }, + { + "epoch": 0.08477, + "grad_norm": 0.6820809860569809, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 8477 + }, + { + "epoch": 0.08478, + "grad_norm": 0.6640365539259963, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 8478 + }, + { + "epoch": 0.08479, + "grad_norm": 0.6014643687487943, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 8479 + }, + { + "epoch": 0.0848, + "grad_norm": 0.6525358299661146, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 8480 + }, + { + "epoch": 0.08481, + "grad_norm": 0.706918999582302, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 8481 + }, + { + "epoch": 0.08482, + "grad_norm": 0.7468135222494826, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 8482 + }, + { + "epoch": 0.08483, + "grad_norm": 0.7732979758093033, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 8483 + }, + { + "epoch": 0.08484, + "grad_norm": 0.7492155564600703, + "learning_rate": 0.003, + "loss": 4.074, + "step": 8484 + }, + { + "epoch": 0.08485, + "grad_norm": 0.6786121897383414, + "learning_rate": 0.003, + "loss": 4.1, + "step": 8485 + }, + { + "epoch": 0.08486, + "grad_norm": 0.6273731355412779, + "learning_rate": 0.003, + "loss": 4.12, + "step": 8486 + }, + { + "epoch": 0.08487, + "grad_norm": 0.6720801463551171, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 8487 + }, + { + "epoch": 0.08488, + "grad_norm": 0.6335377972042376, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 8488 + }, + { + "epoch": 0.08489, + "grad_norm": 0.6719260396651473, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 8489 + }, + { + "epoch": 0.0849, + "grad_norm": 0.7312131012473647, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8490 + }, + { + "epoch": 0.08491, + "grad_norm": 0.8249047120148688, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 8491 + }, + { + "epoch": 0.08492, + "grad_norm": 1.0407015940181197, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 8492 + }, + { + "epoch": 0.08493, + "grad_norm": 1.0867956366330738, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 8493 + }, + { + "epoch": 0.08494, + "grad_norm": 0.915620699057293, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 8494 + }, + { + "epoch": 0.08495, + "grad_norm": 0.89156893468348, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 8495 + }, + { + "epoch": 0.08496, + "grad_norm": 0.8517601087707647, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 8496 + }, + { + "epoch": 0.08497, + "grad_norm": 0.7993759869320393, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 8497 + }, + { + "epoch": 0.08498, + "grad_norm": 0.7017883148327033, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 8498 + }, + { + "epoch": 0.08499, + "grad_norm": 0.6236773075397719, + "learning_rate": 0.003, + "loss": 4.1259, + "step": 8499 + }, + { + "epoch": 0.085, + "grad_norm": 0.6286035320618882, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 8500 + }, + { + "epoch": 0.08501, + "grad_norm": 0.5212820111963481, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 8501 + }, + { + "epoch": 0.08502, + "grad_norm": 0.5487558855281737, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 8502 + }, + { + "epoch": 0.08503, + "grad_norm": 0.5707569684852064, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 8503 + }, + { + "epoch": 0.08504, + "grad_norm": 0.6687161931414275, + "learning_rate": 0.003, + "loss": 4.1237, + "step": 8504 + }, + { + "epoch": 0.08505, + "grad_norm": 0.6723026556005183, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 8505 + }, + { + "epoch": 0.08506, + "grad_norm": 0.6565086694246933, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 8506 + }, + { + "epoch": 0.08507, + "grad_norm": 0.6310941478662354, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 8507 + }, + { + "epoch": 0.08508, + "grad_norm": 0.5824407794995421, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8508 + }, + { + "epoch": 0.08509, + "grad_norm": 0.570213160176025, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 8509 + }, + { + "epoch": 0.0851, + "grad_norm": 0.545653653993305, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 8510 + }, + { + "epoch": 0.08511, + "grad_norm": 0.6166091785522915, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 8511 + }, + { + "epoch": 0.08512, + "grad_norm": 0.7266811939494829, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 8512 + }, + { + "epoch": 0.08513, + "grad_norm": 1.0119925425784506, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 8513 + }, + { + "epoch": 0.08514, + "grad_norm": 1.0618994395885715, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 8514 + }, + { + "epoch": 0.08515, + "grad_norm": 0.8410180154663229, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8515 + }, + { + "epoch": 0.08516, + "grad_norm": 0.7647905005211172, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 8516 + }, + { + "epoch": 0.08517, + "grad_norm": 0.7839452502836995, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 8517 + }, + { + "epoch": 0.08518, + "grad_norm": 0.8837758490786244, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 8518 + }, + { + "epoch": 0.08519, + "grad_norm": 1.0026623503391587, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 8519 + }, + { + "epoch": 0.0852, + "grad_norm": 0.8804069304034173, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 8520 + }, + { + "epoch": 0.08521, + "grad_norm": 0.7391980251114973, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 8521 + }, + { + "epoch": 0.08522, + "grad_norm": 0.6582209926239634, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 8522 + }, + { + "epoch": 0.08523, + "grad_norm": 0.686774998714554, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 8523 + }, + { + "epoch": 0.08524, + "grad_norm": 0.6276671588615992, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 8524 + }, + { + "epoch": 0.08525, + "grad_norm": 0.6285944259527609, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 8525 + }, + { + "epoch": 0.08526, + "grad_norm": 0.6730183358064831, + "learning_rate": 0.003, + "loss": 4.088, + "step": 8526 + }, + { + "epoch": 0.08527, + "grad_norm": 0.7562963445445285, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8527 + }, + { + "epoch": 0.08528, + "grad_norm": 0.7728429342633116, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 8528 + }, + { + "epoch": 0.08529, + "grad_norm": 0.765937905688893, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 8529 + }, + { + "epoch": 0.0853, + "grad_norm": 0.6845167962734888, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 8530 + }, + { + "epoch": 0.08531, + "grad_norm": 0.5726987582988281, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 8531 + }, + { + "epoch": 0.08532, + "grad_norm": 0.5597095167661914, + "learning_rate": 0.003, + "loss": 4.068, + "step": 8532 + }, + { + "epoch": 0.08533, + "grad_norm": 0.5041472063185792, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 8533 + }, + { + "epoch": 0.08534, + "grad_norm": 0.45851677887407977, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 8534 + }, + { + "epoch": 0.08535, + "grad_norm": 0.42358429319858426, + "learning_rate": 0.003, + "loss": 4.073, + "step": 8535 + }, + { + "epoch": 0.08536, + "grad_norm": 0.424584021481874, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8536 + }, + { + "epoch": 0.08537, + "grad_norm": 0.5509844324435204, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 8537 + }, + { + "epoch": 0.08538, + "grad_norm": 0.7117815534754696, + "learning_rate": 0.003, + "loss": 4.066, + "step": 8538 + }, + { + "epoch": 0.08539, + "grad_norm": 0.8518554101874906, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 8539 + }, + { + "epoch": 0.0854, + "grad_norm": 0.9392658629366815, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 8540 + }, + { + "epoch": 0.08541, + "grad_norm": 0.9325601448881679, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 8541 + }, + { + "epoch": 0.08542, + "grad_norm": 0.8887155749787176, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 8542 + }, + { + "epoch": 0.08543, + "grad_norm": 0.9460176301826329, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 8543 + }, + { + "epoch": 0.08544, + "grad_norm": 0.8622072400696076, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 8544 + }, + { + "epoch": 0.08545, + "grad_norm": 0.8420027078116906, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 8545 + }, + { + "epoch": 0.08546, + "grad_norm": 0.7341692131081741, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 8546 + }, + { + "epoch": 0.08547, + "grad_norm": 0.7543016815952612, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 8547 + }, + { + "epoch": 0.08548, + "grad_norm": 0.7607341436346121, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 8548 + }, + { + "epoch": 0.08549, + "grad_norm": 0.7660211842461696, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 8549 + }, + { + "epoch": 0.0855, + "grad_norm": 0.8328996624474356, + "learning_rate": 0.003, + "loss": 4.127, + "step": 8550 + }, + { + "epoch": 0.08551, + "grad_norm": 0.9887027893940781, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 8551 + }, + { + "epoch": 0.08552, + "grad_norm": 0.9138699679462117, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 8552 + }, + { + "epoch": 0.08553, + "grad_norm": 0.88796138098534, + "learning_rate": 0.003, + "loss": 4.1251, + "step": 8553 + }, + { + "epoch": 0.08554, + "grad_norm": 0.7864887844624548, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 8554 + }, + { + "epoch": 0.08555, + "grad_norm": 0.8150705947614163, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 8555 + }, + { + "epoch": 0.08556, + "grad_norm": 0.8837241002629734, + "learning_rate": 0.003, + "loss": 4.1262, + "step": 8556 + }, + { + "epoch": 0.08557, + "grad_norm": 0.9289452003464579, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 8557 + }, + { + "epoch": 0.08558, + "grad_norm": 0.9018548936220813, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 8558 + }, + { + "epoch": 0.08559, + "grad_norm": 0.8317138215654469, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 8559 + }, + { + "epoch": 0.0856, + "grad_norm": 0.7222621649910216, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 8560 + }, + { + "epoch": 0.08561, + "grad_norm": 0.6904128334298788, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 8561 + }, + { + "epoch": 0.08562, + "grad_norm": 0.6940485635066651, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 8562 + }, + { + "epoch": 0.08563, + "grad_norm": 0.7325452827552434, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 8563 + }, + { + "epoch": 0.08564, + "grad_norm": 0.8123368725671672, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 8564 + }, + { + "epoch": 0.08565, + "grad_norm": 0.9408503931561969, + "learning_rate": 0.003, + "loss": 4.096, + "step": 8565 + }, + { + "epoch": 0.08566, + "grad_norm": 1.0616374748054886, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 8566 + }, + { + "epoch": 0.08567, + "grad_norm": 1.047024424521824, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 8567 + }, + { + "epoch": 0.08568, + "grad_norm": 0.8468698345129514, + "learning_rate": 0.003, + "loss": 4.1356, + "step": 8568 + }, + { + "epoch": 0.08569, + "grad_norm": 0.7944472675814765, + "learning_rate": 0.003, + "loss": 4.1382, + "step": 8569 + }, + { + "epoch": 0.0857, + "grad_norm": 0.8045393385309881, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 8570 + }, + { + "epoch": 0.08571, + "grad_norm": 0.9098867331516741, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 8571 + }, + { + "epoch": 0.08572, + "grad_norm": 0.9195544563171938, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 8572 + }, + { + "epoch": 0.08573, + "grad_norm": 0.8423884949796944, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 8573 + }, + { + "epoch": 0.08574, + "grad_norm": 0.8214333018685611, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 8574 + }, + { + "epoch": 0.08575, + "grad_norm": 0.6350812617594055, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 8575 + }, + { + "epoch": 0.08576, + "grad_norm": 0.5568431689715675, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 8576 + }, + { + "epoch": 0.08577, + "grad_norm": 0.5632148561347211, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 8577 + }, + { + "epoch": 0.08578, + "grad_norm": 0.5940159555341951, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 8578 + }, + { + "epoch": 0.08579, + "grad_norm": 0.5819842281049936, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 8579 + }, + { + "epoch": 0.0858, + "grad_norm": 0.6099409069272952, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 8580 + }, + { + "epoch": 0.08581, + "grad_norm": 0.6188694008443366, + "learning_rate": 0.003, + "loss": 4.081, + "step": 8581 + }, + { + "epoch": 0.08582, + "grad_norm": 0.59379978193371, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8582 + }, + { + "epoch": 0.08583, + "grad_norm": 0.5744750135944172, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 8583 + }, + { + "epoch": 0.08584, + "grad_norm": 0.6722444371986878, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 8584 + }, + { + "epoch": 0.08585, + "grad_norm": 0.7411902899854854, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 8585 + }, + { + "epoch": 0.08586, + "grad_norm": 0.7792170934870236, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 8586 + }, + { + "epoch": 0.08587, + "grad_norm": 0.6853345642159595, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 8587 + }, + { + "epoch": 0.08588, + "grad_norm": 0.6170369949749352, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 8588 + }, + { + "epoch": 0.08589, + "grad_norm": 0.49708112672361066, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 8589 + }, + { + "epoch": 0.0859, + "grad_norm": 0.5152160677520728, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 8590 + }, + { + "epoch": 0.08591, + "grad_norm": 0.49741486517713757, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 8591 + }, + { + "epoch": 0.08592, + "grad_norm": 0.5910307086102645, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 8592 + }, + { + "epoch": 0.08593, + "grad_norm": 0.6649076276760316, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 8593 + }, + { + "epoch": 0.08594, + "grad_norm": 0.6902152865247433, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 8594 + }, + { + "epoch": 0.08595, + "grad_norm": 0.7200015025904549, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 8595 + }, + { + "epoch": 0.08596, + "grad_norm": 0.8369502085711913, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 8596 + }, + { + "epoch": 0.08597, + "grad_norm": 0.9081695796712485, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 8597 + }, + { + "epoch": 0.08598, + "grad_norm": 0.8574484765894195, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 8598 + }, + { + "epoch": 0.08599, + "grad_norm": 0.7141093112530459, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 8599 + }, + { + "epoch": 0.086, + "grad_norm": 0.7618404036518704, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 8600 + }, + { + "epoch": 0.08601, + "grad_norm": 0.7471246187217334, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 8601 + }, + { + "epoch": 0.08602, + "grad_norm": 0.7547952980026077, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 8602 + }, + { + "epoch": 0.08603, + "grad_norm": 0.7629507458741042, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 8603 + }, + { + "epoch": 0.08604, + "grad_norm": 0.8034444554972223, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 8604 + }, + { + "epoch": 0.08605, + "grad_norm": 0.9611109929996176, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 8605 + }, + { + "epoch": 0.08606, + "grad_norm": 1.1203243516217172, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8606 + }, + { + "epoch": 0.08607, + "grad_norm": 0.7916021324508382, + "learning_rate": 0.003, + "loss": 4.127, + "step": 8607 + }, + { + "epoch": 0.08608, + "grad_norm": 0.5801180546262028, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 8608 + }, + { + "epoch": 0.08609, + "grad_norm": 0.5983815121428754, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 8609 + }, + { + "epoch": 0.0861, + "grad_norm": 0.6226924849818875, + "learning_rate": 0.003, + "loss": 4.111, + "step": 8610 + }, + { + "epoch": 0.08611, + "grad_norm": 0.5973475758608405, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8611 + }, + { + "epoch": 0.08612, + "grad_norm": 0.6361887564027072, + "learning_rate": 0.003, + "loss": 4.1275, + "step": 8612 + }, + { + "epoch": 0.08613, + "grad_norm": 0.6913314348371322, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 8613 + }, + { + "epoch": 0.08614, + "grad_norm": 0.6911565582790656, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 8614 + }, + { + "epoch": 0.08615, + "grad_norm": 0.6480201878105682, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 8615 + }, + { + "epoch": 0.08616, + "grad_norm": 0.621140757257243, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 8616 + }, + { + "epoch": 0.08617, + "grad_norm": 0.5814187395920917, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 8617 + }, + { + "epoch": 0.08618, + "grad_norm": 0.5949195662749583, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 8618 + }, + { + "epoch": 0.08619, + "grad_norm": 0.5474225046144992, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 8619 + }, + { + "epoch": 0.0862, + "grad_norm": 0.5884294701344107, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 8620 + }, + { + "epoch": 0.08621, + "grad_norm": 0.5414477831138819, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 8621 + }, + { + "epoch": 0.08622, + "grad_norm": 0.5853379182460213, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 8622 + }, + { + "epoch": 0.08623, + "grad_norm": 0.6748785821043584, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 8623 + }, + { + "epoch": 0.08624, + "grad_norm": 0.8007913851325251, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 8624 + }, + { + "epoch": 0.08625, + "grad_norm": 0.9532221305960991, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 8625 + }, + { + "epoch": 0.08626, + "grad_norm": 0.9383143976468733, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 8626 + }, + { + "epoch": 0.08627, + "grad_norm": 0.8045743378009347, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 8627 + }, + { + "epoch": 0.08628, + "grad_norm": 0.7402532485299439, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 8628 + }, + { + "epoch": 0.08629, + "grad_norm": 0.7187195965365745, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 8629 + }, + { + "epoch": 0.0863, + "grad_norm": 0.7749136270046196, + "learning_rate": 0.003, + "loss": 4.086, + "step": 8630 + }, + { + "epoch": 0.08631, + "grad_norm": 0.7803939926856307, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 8631 + }, + { + "epoch": 0.08632, + "grad_norm": 0.812820737542032, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 8632 + }, + { + "epoch": 0.08633, + "grad_norm": 0.9418729107920155, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 8633 + }, + { + "epoch": 0.08634, + "grad_norm": 1.0595248522005765, + "learning_rate": 0.003, + "loss": 4.1227, + "step": 8634 + }, + { + "epoch": 0.08635, + "grad_norm": 0.9627374328257062, + "learning_rate": 0.003, + "loss": 4.112, + "step": 8635 + }, + { + "epoch": 0.08636, + "grad_norm": 0.9731387011493425, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 8636 + }, + { + "epoch": 0.08637, + "grad_norm": 1.0615767557197648, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 8637 + }, + { + "epoch": 0.08638, + "grad_norm": 0.8969163900961645, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 8638 + }, + { + "epoch": 0.08639, + "grad_norm": 0.7836065427193415, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 8639 + }, + { + "epoch": 0.0864, + "grad_norm": 0.7397344262293751, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 8640 + }, + { + "epoch": 0.08641, + "grad_norm": 0.7324947749888412, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 8641 + }, + { + "epoch": 0.08642, + "grad_norm": 0.7288716146484082, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 8642 + }, + { + "epoch": 0.08643, + "grad_norm": 0.7339841907154421, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 8643 + }, + { + "epoch": 0.08644, + "grad_norm": 0.8107938826013737, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 8644 + }, + { + "epoch": 0.08645, + "grad_norm": 0.9556271679368206, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 8645 + }, + { + "epoch": 0.08646, + "grad_norm": 0.9674772661574546, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 8646 + }, + { + "epoch": 0.08647, + "grad_norm": 0.9453669207011212, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 8647 + }, + { + "epoch": 0.08648, + "grad_norm": 0.7876425030935472, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 8648 + }, + { + "epoch": 0.08649, + "grad_norm": 0.6724723608937945, + "learning_rate": 0.003, + "loss": 4.1303, + "step": 8649 + }, + { + "epoch": 0.0865, + "grad_norm": 0.7454146637283006, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8650 + }, + { + "epoch": 0.08651, + "grad_norm": 0.8517115223988251, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 8651 + }, + { + "epoch": 0.08652, + "grad_norm": 0.8860899877841061, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 8652 + }, + { + "epoch": 0.08653, + "grad_norm": 0.6986085802414974, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 8653 + }, + { + "epoch": 0.08654, + "grad_norm": 0.6383445520115142, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 8654 + }, + { + "epoch": 0.08655, + "grad_norm": 0.6126837697223702, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 8655 + }, + { + "epoch": 0.08656, + "grad_norm": 0.6303132972548725, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 8656 + }, + { + "epoch": 0.08657, + "grad_norm": 0.7051801520736573, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 8657 + }, + { + "epoch": 0.08658, + "grad_norm": 0.7534311537900272, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 8658 + }, + { + "epoch": 0.08659, + "grad_norm": 0.6915534733946069, + "learning_rate": 0.003, + "loss": 4.095, + "step": 8659 + }, + { + "epoch": 0.0866, + "grad_norm": 0.6192069664136859, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 8660 + }, + { + "epoch": 0.08661, + "grad_norm": 0.5572612681652647, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 8661 + }, + { + "epoch": 0.08662, + "grad_norm": 0.518904918411408, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 8662 + }, + { + "epoch": 0.08663, + "grad_norm": 0.45643753659454445, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8663 + }, + { + "epoch": 0.08664, + "grad_norm": 0.40977161141131885, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8664 + }, + { + "epoch": 0.08665, + "grad_norm": 0.3878668107895774, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 8665 + }, + { + "epoch": 0.08666, + "grad_norm": 0.413430960164737, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 8666 + }, + { + "epoch": 0.08667, + "grad_norm": 0.49272907946966926, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 8667 + }, + { + "epoch": 0.08668, + "grad_norm": 0.5766422713262668, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 8668 + }, + { + "epoch": 0.08669, + "grad_norm": 0.7090374371716733, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 8669 + }, + { + "epoch": 0.0867, + "grad_norm": 0.8903287925695574, + "learning_rate": 0.003, + "loss": 4.084, + "step": 8670 + }, + { + "epoch": 0.08671, + "grad_norm": 1.0465570588393474, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 8671 + }, + { + "epoch": 0.08672, + "grad_norm": 0.9634733776041171, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 8672 + }, + { + "epoch": 0.08673, + "grad_norm": 0.9660580246254519, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 8673 + }, + { + "epoch": 0.08674, + "grad_norm": 0.904283629793719, + "learning_rate": 0.003, + "loss": 4.1153, + "step": 8674 + }, + { + "epoch": 0.08675, + "grad_norm": 0.8438104142985071, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 8675 + }, + { + "epoch": 0.08676, + "grad_norm": 0.7857620375537205, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8676 + }, + { + "epoch": 0.08677, + "grad_norm": 0.7969369610845204, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 8677 + }, + { + "epoch": 0.08678, + "grad_norm": 0.7396002144781133, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 8678 + }, + { + "epoch": 0.08679, + "grad_norm": 0.6673116262175337, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 8679 + }, + { + "epoch": 0.0868, + "grad_norm": 0.7555945744269758, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 8680 + }, + { + "epoch": 0.08681, + "grad_norm": 0.8839352666831195, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 8681 + }, + { + "epoch": 0.08682, + "grad_norm": 0.9051922142113155, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 8682 + }, + { + "epoch": 0.08683, + "grad_norm": 0.846027323488606, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 8683 + }, + { + "epoch": 0.08684, + "grad_norm": 0.7192665361560291, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8684 + }, + { + "epoch": 0.08685, + "grad_norm": 0.643632971790141, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 8685 + }, + { + "epoch": 0.08686, + "grad_norm": 0.6466143794382575, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 8686 + }, + { + "epoch": 0.08687, + "grad_norm": 0.7039176614373789, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8687 + }, + { + "epoch": 0.08688, + "grad_norm": 0.7899249787925823, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 8688 + }, + { + "epoch": 0.08689, + "grad_norm": 0.8216415805263771, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 8689 + }, + { + "epoch": 0.0869, + "grad_norm": 0.7040479359385806, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 8690 + }, + { + "epoch": 0.08691, + "grad_norm": 0.6418926125651642, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 8691 + }, + { + "epoch": 0.08692, + "grad_norm": 0.7077297817088414, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8692 + }, + { + "epoch": 0.08693, + "grad_norm": 0.7606697220731845, + "learning_rate": 0.003, + "loss": 4.076, + "step": 8693 + }, + { + "epoch": 0.08694, + "grad_norm": 0.7152450767122566, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 8694 + }, + { + "epoch": 0.08695, + "grad_norm": 0.6231380065971586, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 8695 + }, + { + "epoch": 0.08696, + "grad_norm": 0.5828288246085719, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 8696 + }, + { + "epoch": 0.08697, + "grad_norm": 0.5953032303327713, + "learning_rate": 0.003, + "loss": 4.105, + "step": 8697 + }, + { + "epoch": 0.08698, + "grad_norm": 0.6083787084008646, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 8698 + }, + { + "epoch": 0.08699, + "grad_norm": 0.7378994678558721, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 8699 + }, + { + "epoch": 0.087, + "grad_norm": 0.9161671171702003, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 8700 + }, + { + "epoch": 0.08701, + "grad_norm": 1.1377552843109822, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 8701 + }, + { + "epoch": 0.08702, + "grad_norm": 0.8079302042495463, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 8702 + }, + { + "epoch": 0.08703, + "grad_norm": 0.608371143181045, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 8703 + }, + { + "epoch": 0.08704, + "grad_norm": 0.6281858127290479, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 8704 + }, + { + "epoch": 0.08705, + "grad_norm": 0.6793561925183917, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 8705 + }, + { + "epoch": 0.08706, + "grad_norm": 0.7106800046852197, + "learning_rate": 0.003, + "loss": 4.085, + "step": 8706 + }, + { + "epoch": 0.08707, + "grad_norm": 0.7141650332904569, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 8707 + }, + { + "epoch": 0.08708, + "grad_norm": 0.8271404025231711, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 8708 + }, + { + "epoch": 0.08709, + "grad_norm": 1.0474926164726928, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 8709 + }, + { + "epoch": 0.0871, + "grad_norm": 0.9907087620885723, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 8710 + }, + { + "epoch": 0.08711, + "grad_norm": 1.0656047756713767, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 8711 + }, + { + "epoch": 0.08712, + "grad_norm": 0.881294661258653, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 8712 + }, + { + "epoch": 0.08713, + "grad_norm": 0.9263143666012618, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 8713 + }, + { + "epoch": 0.08714, + "grad_norm": 0.9652223357853811, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 8714 + }, + { + "epoch": 0.08715, + "grad_norm": 1.1556407233323303, + "learning_rate": 0.003, + "loss": 4.1354, + "step": 8715 + }, + { + "epoch": 0.08716, + "grad_norm": 0.8600125406687056, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 8716 + }, + { + "epoch": 0.08717, + "grad_norm": 0.7952102732113174, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 8717 + }, + { + "epoch": 0.08718, + "grad_norm": 0.709911569034494, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 8718 + }, + { + "epoch": 0.08719, + "grad_norm": 0.6127044794162302, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 8719 + }, + { + "epoch": 0.0872, + "grad_norm": 0.5418597465972178, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 8720 + }, + { + "epoch": 0.08721, + "grad_norm": 0.545384052999374, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 8721 + }, + { + "epoch": 0.08722, + "grad_norm": 0.5854972859534596, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 8722 + }, + { + "epoch": 0.08723, + "grad_norm": 0.599752513559582, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 8723 + }, + { + "epoch": 0.08724, + "grad_norm": 0.5979573041484251, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 8724 + }, + { + "epoch": 0.08725, + "grad_norm": 0.6180900508607188, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 8725 + }, + { + "epoch": 0.08726, + "grad_norm": 0.6336982537847569, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 8726 + }, + { + "epoch": 0.08727, + "grad_norm": 0.7620135479461384, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 8727 + }, + { + "epoch": 0.08728, + "grad_norm": 0.8150934564907844, + "learning_rate": 0.003, + "loss": 4.063, + "step": 8728 + }, + { + "epoch": 0.08729, + "grad_norm": 0.6748250974417187, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 8729 + }, + { + "epoch": 0.0873, + "grad_norm": 0.5106549285627258, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 8730 + }, + { + "epoch": 0.08731, + "grad_norm": 0.616467259066863, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 8731 + }, + { + "epoch": 0.08732, + "grad_norm": 0.6409301951961746, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 8732 + }, + { + "epoch": 0.08733, + "grad_norm": 0.6531284409173848, + "learning_rate": 0.003, + "loss": 4.075, + "step": 8733 + }, + { + "epoch": 0.08734, + "grad_norm": 0.6909519673379502, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 8734 + }, + { + "epoch": 0.08735, + "grad_norm": 0.6949538564230802, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 8735 + }, + { + "epoch": 0.08736, + "grad_norm": 0.6353533341447787, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 8736 + }, + { + "epoch": 0.08737, + "grad_norm": 0.5766041576549661, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 8737 + }, + { + "epoch": 0.08738, + "grad_norm": 0.5636229267831031, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 8738 + }, + { + "epoch": 0.08739, + "grad_norm": 0.5714257910724869, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 8739 + }, + { + "epoch": 0.0874, + "grad_norm": 0.6664233704269024, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 8740 + }, + { + "epoch": 0.08741, + "grad_norm": 0.8499302285997681, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 8741 + }, + { + "epoch": 0.08742, + "grad_norm": 1.1223141440691569, + "learning_rate": 0.003, + "loss": 4.087, + "step": 8742 + }, + { + "epoch": 0.08743, + "grad_norm": 0.9050529775378228, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 8743 + }, + { + "epoch": 0.08744, + "grad_norm": 0.7951156037626225, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 8744 + }, + { + "epoch": 0.08745, + "grad_norm": 0.9924672009502984, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 8745 + }, + { + "epoch": 0.08746, + "grad_norm": 0.971675314061291, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 8746 + }, + { + "epoch": 0.08747, + "grad_norm": 0.9032139097191704, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 8747 + }, + { + "epoch": 0.08748, + "grad_norm": 0.798921343364292, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 8748 + }, + { + "epoch": 0.08749, + "grad_norm": 0.7958369681342767, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 8749 + }, + { + "epoch": 0.0875, + "grad_norm": 0.7880652528528818, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 8750 + }, + { + "epoch": 0.08751, + "grad_norm": 0.7117714704920254, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8751 + }, + { + "epoch": 0.08752, + "grad_norm": 0.6564224414293092, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 8752 + }, + { + "epoch": 0.08753, + "grad_norm": 0.6251613210169059, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 8753 + }, + { + "epoch": 0.08754, + "grad_norm": 0.6545076658857255, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 8754 + }, + { + "epoch": 0.08755, + "grad_norm": 0.6299656295806964, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 8755 + }, + { + "epoch": 0.08756, + "grad_norm": 0.6937544705015782, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 8756 + }, + { + "epoch": 0.08757, + "grad_norm": 0.7613436036720337, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 8757 + }, + { + "epoch": 0.08758, + "grad_norm": 0.7935810078864296, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 8758 + }, + { + "epoch": 0.08759, + "grad_norm": 0.9218245052230097, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 8759 + }, + { + "epoch": 0.0876, + "grad_norm": 1.1133184341136968, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8760 + }, + { + "epoch": 0.08761, + "grad_norm": 0.8471729174022539, + "learning_rate": 0.003, + "loss": 4.1307, + "step": 8761 + }, + { + "epoch": 0.08762, + "grad_norm": 0.6885885491623273, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 8762 + }, + { + "epoch": 0.08763, + "grad_norm": 0.6900728124677, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 8763 + }, + { + "epoch": 0.08764, + "grad_norm": 0.7448709364689655, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 8764 + }, + { + "epoch": 0.08765, + "grad_norm": 0.7003442871930288, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 8765 + }, + { + "epoch": 0.08766, + "grad_norm": 0.695189743891509, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 8766 + }, + { + "epoch": 0.08767, + "grad_norm": 0.6561447154517275, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 8767 + }, + { + "epoch": 0.08768, + "grad_norm": 0.6134378072581433, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 8768 + }, + { + "epoch": 0.08769, + "grad_norm": 0.6152250798453555, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 8769 + }, + { + "epoch": 0.0877, + "grad_norm": 0.6450512433536177, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 8770 + }, + { + "epoch": 0.08771, + "grad_norm": 0.7339302655862884, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 8771 + }, + { + "epoch": 0.08772, + "grad_norm": 0.8938660355068361, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 8772 + }, + { + "epoch": 0.08773, + "grad_norm": 0.918539366218562, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 8773 + }, + { + "epoch": 0.08774, + "grad_norm": 0.8161914119839463, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 8774 + }, + { + "epoch": 0.08775, + "grad_norm": 0.7485358173137795, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 8775 + }, + { + "epoch": 0.08776, + "grad_norm": 0.6501840423824952, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 8776 + }, + { + "epoch": 0.08777, + "grad_norm": 0.6227297966691554, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8777 + }, + { + "epoch": 0.08778, + "grad_norm": 0.6147681929920195, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 8778 + }, + { + "epoch": 0.08779, + "grad_norm": 0.6711211259228609, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 8779 + }, + { + "epoch": 0.0878, + "grad_norm": 0.6873812876268875, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 8780 + }, + { + "epoch": 0.08781, + "grad_norm": 0.7894739230084857, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 8781 + }, + { + "epoch": 0.08782, + "grad_norm": 0.9319206036015761, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 8782 + }, + { + "epoch": 0.08783, + "grad_norm": 0.785166946779399, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 8783 + }, + { + "epoch": 0.08784, + "grad_norm": 0.6883174398483335, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8784 + }, + { + "epoch": 0.08785, + "grad_norm": 0.7618236271622089, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 8785 + }, + { + "epoch": 0.08786, + "grad_norm": 0.7882058303319279, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 8786 + }, + { + "epoch": 0.08787, + "grad_norm": 0.8009435303009658, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 8787 + }, + { + "epoch": 0.08788, + "grad_norm": 0.8043157032401507, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 8788 + }, + { + "epoch": 0.08789, + "grad_norm": 0.8053019098728231, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 8789 + }, + { + "epoch": 0.0879, + "grad_norm": 0.7507680332954385, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 8790 + }, + { + "epoch": 0.08791, + "grad_norm": 0.742881908635139, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 8791 + }, + { + "epoch": 0.08792, + "grad_norm": 0.6619963633805601, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 8792 + }, + { + "epoch": 0.08793, + "grad_norm": 0.6836794010275485, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 8793 + }, + { + "epoch": 0.08794, + "grad_norm": 0.8299110382089389, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 8794 + }, + { + "epoch": 0.08795, + "grad_norm": 1.0468573387811748, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 8795 + }, + { + "epoch": 0.08796, + "grad_norm": 1.0109579043602372, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 8796 + }, + { + "epoch": 0.08797, + "grad_norm": 0.9354583933034025, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 8797 + }, + { + "epoch": 0.08798, + "grad_norm": 0.8801038141490901, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 8798 + }, + { + "epoch": 0.08799, + "grad_norm": 0.8373596391595058, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 8799 + }, + { + "epoch": 0.088, + "grad_norm": 0.8236058297216525, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 8800 + }, + { + "epoch": 0.08801, + "grad_norm": 0.9171639160570122, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 8801 + }, + { + "epoch": 0.08802, + "grad_norm": 0.950129352800799, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 8802 + }, + { + "epoch": 0.08803, + "grad_norm": 0.97417915794881, + "learning_rate": 0.003, + "loss": 4.087, + "step": 8803 + }, + { + "epoch": 0.08804, + "grad_norm": 0.9961244608205473, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 8804 + }, + { + "epoch": 0.08805, + "grad_norm": 0.8887568015028089, + "learning_rate": 0.003, + "loss": 4.085, + "step": 8805 + }, + { + "epoch": 0.08806, + "grad_norm": 0.8507811413672998, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 8806 + }, + { + "epoch": 0.08807, + "grad_norm": 0.7799651810021191, + "learning_rate": 0.003, + "loss": 4.114, + "step": 8807 + }, + { + "epoch": 0.08808, + "grad_norm": 0.6310859184139921, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 8808 + }, + { + "epoch": 0.08809, + "grad_norm": 0.5415343793054738, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 8809 + }, + { + "epoch": 0.0881, + "grad_norm": 0.5417480126046187, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 8810 + }, + { + "epoch": 0.08811, + "grad_norm": 0.5437548342900228, + "learning_rate": 0.003, + "loss": 4.091, + "step": 8811 + }, + { + "epoch": 0.08812, + "grad_norm": 0.5065008549086789, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 8812 + }, + { + "epoch": 0.08813, + "grad_norm": 0.44906733576252533, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 8813 + }, + { + "epoch": 0.08814, + "grad_norm": 0.4572901434044213, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 8814 + }, + { + "epoch": 0.08815, + "grad_norm": 0.39740682905597274, + "learning_rate": 0.003, + "loss": 4.096, + "step": 8815 + }, + { + "epoch": 0.08816, + "grad_norm": 0.41394278364223486, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 8816 + }, + { + "epoch": 0.08817, + "grad_norm": 0.40780800547558976, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 8817 + }, + { + "epoch": 0.08818, + "grad_norm": 0.45313876957887234, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 8818 + }, + { + "epoch": 0.08819, + "grad_norm": 0.47289697821709104, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 8819 + }, + { + "epoch": 0.0882, + "grad_norm": 0.5347028769974915, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 8820 + }, + { + "epoch": 0.08821, + "grad_norm": 0.6073667369132447, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 8821 + }, + { + "epoch": 0.08822, + "grad_norm": 0.631563807802477, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 8822 + }, + { + "epoch": 0.08823, + "grad_norm": 0.8219598063684808, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 8823 + }, + { + "epoch": 0.08824, + "grad_norm": 1.1113151394557685, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 8824 + }, + { + "epoch": 0.08825, + "grad_norm": 0.9855650333902289, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 8825 + }, + { + "epoch": 0.08826, + "grad_norm": 0.9778239420914661, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 8826 + }, + { + "epoch": 0.08827, + "grad_norm": 0.8179910297155395, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 8827 + }, + { + "epoch": 0.08828, + "grad_norm": 0.7966604826205193, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 8828 + }, + { + "epoch": 0.08829, + "grad_norm": 0.7977026742349369, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 8829 + }, + { + "epoch": 0.0883, + "grad_norm": 0.792129459764214, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 8830 + }, + { + "epoch": 0.08831, + "grad_norm": 0.9326888184375913, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 8831 + }, + { + "epoch": 0.08832, + "grad_norm": 0.866391958750175, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 8832 + }, + { + "epoch": 0.08833, + "grad_norm": 0.8717561470437578, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 8833 + }, + { + "epoch": 0.08834, + "grad_norm": 0.9086463815238154, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 8834 + }, + { + "epoch": 0.08835, + "grad_norm": 0.9982211117034512, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 8835 + }, + { + "epoch": 0.08836, + "grad_norm": 1.0116778830665036, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 8836 + }, + { + "epoch": 0.08837, + "grad_norm": 0.9047723332769526, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 8837 + }, + { + "epoch": 0.08838, + "grad_norm": 0.7218841347767363, + "learning_rate": 0.003, + "loss": 4.105, + "step": 8838 + }, + { + "epoch": 0.08839, + "grad_norm": 0.6219028863083907, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 8839 + }, + { + "epoch": 0.0884, + "grad_norm": 0.6370526447568182, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 8840 + }, + { + "epoch": 0.08841, + "grad_norm": 0.6865207786249934, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 8841 + }, + { + "epoch": 0.08842, + "grad_norm": 0.8079884245973484, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 8842 + }, + { + "epoch": 0.08843, + "grad_norm": 0.7716447791785704, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 8843 + }, + { + "epoch": 0.08844, + "grad_norm": 0.6552595716080165, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 8844 + }, + { + "epoch": 0.08845, + "grad_norm": 0.6411659090784259, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 8845 + }, + { + "epoch": 0.08846, + "grad_norm": 0.6345516705136226, + "learning_rate": 0.003, + "loss": 4.1264, + "step": 8846 + }, + { + "epoch": 0.08847, + "grad_norm": 0.6208423879645917, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 8847 + }, + { + "epoch": 0.08848, + "grad_norm": 0.6896343586488453, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 8848 + }, + { + "epoch": 0.08849, + "grad_norm": 0.6711212989454174, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 8849 + }, + { + "epoch": 0.0885, + "grad_norm": 0.787384336504839, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 8850 + }, + { + "epoch": 0.08851, + "grad_norm": 0.9296696469370899, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8851 + }, + { + "epoch": 0.08852, + "grad_norm": 0.9491351647008734, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 8852 + }, + { + "epoch": 0.08853, + "grad_norm": 0.8823989114621453, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 8853 + }, + { + "epoch": 0.08854, + "grad_norm": 0.7356047162096173, + "learning_rate": 0.003, + "loss": 4.1288, + "step": 8854 + }, + { + "epoch": 0.08855, + "grad_norm": 0.6817590094654521, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 8855 + }, + { + "epoch": 0.08856, + "grad_norm": 0.6432697273240098, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 8856 + }, + { + "epoch": 0.08857, + "grad_norm": 0.5912854441908766, + "learning_rate": 0.003, + "loss": 4.097, + "step": 8857 + }, + { + "epoch": 0.08858, + "grad_norm": 0.6032547637065733, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 8858 + }, + { + "epoch": 0.08859, + "grad_norm": 0.5317724895750301, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 8859 + }, + { + "epoch": 0.0886, + "grad_norm": 0.490974624643615, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 8860 + }, + { + "epoch": 0.08861, + "grad_norm": 0.5134567395667045, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 8861 + }, + { + "epoch": 0.08862, + "grad_norm": 0.563666635376977, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 8862 + }, + { + "epoch": 0.08863, + "grad_norm": 0.7032312250587909, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 8863 + }, + { + "epoch": 0.08864, + "grad_norm": 0.8431923998213133, + "learning_rate": 0.003, + "loss": 4.103, + "step": 8864 + }, + { + "epoch": 0.08865, + "grad_norm": 0.9517906387901386, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 8865 + }, + { + "epoch": 0.08866, + "grad_norm": 0.894832461876128, + "learning_rate": 0.003, + "loss": 4.073, + "step": 8866 + }, + { + "epoch": 0.08867, + "grad_norm": 0.7886402628779215, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 8867 + }, + { + "epoch": 0.08868, + "grad_norm": 0.7427477147931492, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8868 + }, + { + "epoch": 0.08869, + "grad_norm": 0.7801814362733093, + "learning_rate": 0.003, + "loss": 4.14, + "step": 8869 + }, + { + "epoch": 0.0887, + "grad_norm": 0.9378074963893244, + "learning_rate": 0.003, + "loss": 4.079, + "step": 8870 + }, + { + "epoch": 0.08871, + "grad_norm": 1.0785126885762188, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 8871 + }, + { + "epoch": 0.08872, + "grad_norm": 0.9686509189429044, + "learning_rate": 0.003, + "loss": 4.1374, + "step": 8872 + }, + { + "epoch": 0.08873, + "grad_norm": 0.9359552632213535, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 8873 + }, + { + "epoch": 0.08874, + "grad_norm": 0.801952593241515, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 8874 + }, + { + "epoch": 0.08875, + "grad_norm": 0.7300737081360735, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 8875 + }, + { + "epoch": 0.08876, + "grad_norm": 0.6845784358472665, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 8876 + }, + { + "epoch": 0.08877, + "grad_norm": 0.5992699690439727, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 8877 + }, + { + "epoch": 0.08878, + "grad_norm": 0.6987685567227716, + "learning_rate": 0.003, + "loss": 4.096, + "step": 8878 + }, + { + "epoch": 0.08879, + "grad_norm": 0.755918105586785, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 8879 + }, + { + "epoch": 0.0888, + "grad_norm": 0.804896715950834, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 8880 + }, + { + "epoch": 0.08881, + "grad_norm": 0.8597307396006292, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 8881 + }, + { + "epoch": 0.08882, + "grad_norm": 0.890705623678484, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 8882 + }, + { + "epoch": 0.08883, + "grad_norm": 0.87665671220314, + "learning_rate": 0.003, + "loss": 4.1507, + "step": 8883 + }, + { + "epoch": 0.08884, + "grad_norm": 0.9160504734206271, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 8884 + }, + { + "epoch": 0.08885, + "grad_norm": 0.9915450251876738, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 8885 + }, + { + "epoch": 0.08886, + "grad_norm": 0.9459675534923236, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 8886 + }, + { + "epoch": 0.08887, + "grad_norm": 0.8822638917509666, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 8887 + }, + { + "epoch": 0.08888, + "grad_norm": 0.9182248673162067, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 8888 + }, + { + "epoch": 0.08889, + "grad_norm": 0.8960795751997341, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 8889 + }, + { + "epoch": 0.0889, + "grad_norm": 0.7029348940189843, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 8890 + }, + { + "epoch": 0.08891, + "grad_norm": 0.5917270992086658, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 8891 + }, + { + "epoch": 0.08892, + "grad_norm": 0.5729686662161869, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 8892 + }, + { + "epoch": 0.08893, + "grad_norm": 0.4933602379272417, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 8893 + }, + { + "epoch": 0.08894, + "grad_norm": 0.4816956228568598, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 8894 + }, + { + "epoch": 0.08895, + "grad_norm": 0.5401089965190982, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 8895 + }, + { + "epoch": 0.08896, + "grad_norm": 0.6834674353769287, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 8896 + }, + { + "epoch": 0.08897, + "grad_norm": 0.9000006866364382, + "learning_rate": 0.003, + "loss": 4.082, + "step": 8897 + }, + { + "epoch": 0.08898, + "grad_norm": 1.0128344751677745, + "learning_rate": 0.003, + "loss": 4.108, + "step": 8898 + }, + { + "epoch": 0.08899, + "grad_norm": 0.8361225818664343, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 8899 + }, + { + "epoch": 0.089, + "grad_norm": 0.7604422714843287, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 8900 + }, + { + "epoch": 0.08901, + "grad_norm": 0.7721795341276967, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 8901 + }, + { + "epoch": 0.08902, + "grad_norm": 0.900896957082942, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 8902 + }, + { + "epoch": 0.08903, + "grad_norm": 0.8920547801921579, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 8903 + }, + { + "epoch": 0.08904, + "grad_norm": 0.8420148885506412, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 8904 + }, + { + "epoch": 0.08905, + "grad_norm": 0.8570250538255739, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 8905 + }, + { + "epoch": 0.08906, + "grad_norm": 0.9205488642424341, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 8906 + }, + { + "epoch": 0.08907, + "grad_norm": 1.0405334047811572, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 8907 + }, + { + "epoch": 0.08908, + "grad_norm": 0.9468509956798719, + "learning_rate": 0.003, + "loss": 4.1323, + "step": 8908 + }, + { + "epoch": 0.08909, + "grad_norm": 0.6926689852243239, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 8909 + }, + { + "epoch": 0.0891, + "grad_norm": 0.673009652964538, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 8910 + }, + { + "epoch": 0.08911, + "grad_norm": 0.6764278448657347, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 8911 + }, + { + "epoch": 0.08912, + "grad_norm": 0.713765564163065, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 8912 + }, + { + "epoch": 0.08913, + "grad_norm": 0.6427371366529472, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 8913 + }, + { + "epoch": 0.08914, + "grad_norm": 0.581053838640892, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 8914 + }, + { + "epoch": 0.08915, + "grad_norm": 0.6154512813452391, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 8915 + }, + { + "epoch": 0.08916, + "grad_norm": 0.5811153917294403, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 8916 + }, + { + "epoch": 0.08917, + "grad_norm": 0.5826469378007894, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 8917 + }, + { + "epoch": 0.08918, + "grad_norm": 0.659868224790148, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 8918 + }, + { + "epoch": 0.08919, + "grad_norm": 0.7475809786764346, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 8919 + }, + { + "epoch": 0.0892, + "grad_norm": 0.8085506843447738, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 8920 + }, + { + "epoch": 0.08921, + "grad_norm": 0.8851417211555018, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 8921 + }, + { + "epoch": 0.08922, + "grad_norm": 0.8405802565396343, + "learning_rate": 0.003, + "loss": 4.11, + "step": 8922 + }, + { + "epoch": 0.08923, + "grad_norm": 0.7493224650962431, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 8923 + }, + { + "epoch": 0.08924, + "grad_norm": 0.6524585129634565, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 8924 + }, + { + "epoch": 0.08925, + "grad_norm": 0.6041119003990524, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 8925 + }, + { + "epoch": 0.08926, + "grad_norm": 0.6170325812343832, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 8926 + }, + { + "epoch": 0.08927, + "grad_norm": 0.5943880002209878, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 8927 + }, + { + "epoch": 0.08928, + "grad_norm": 0.6087426636265381, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 8928 + }, + { + "epoch": 0.08929, + "grad_norm": 0.6534682552946309, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 8929 + }, + { + "epoch": 0.0893, + "grad_norm": 0.7123983106901958, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 8930 + }, + { + "epoch": 0.08931, + "grad_norm": 0.6560939231760763, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 8931 + }, + { + "epoch": 0.08932, + "grad_norm": 0.5027522782217787, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 8932 + }, + { + "epoch": 0.08933, + "grad_norm": 0.4365213401530024, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 8933 + }, + { + "epoch": 0.08934, + "grad_norm": 0.5000766612819153, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 8934 + }, + { + "epoch": 0.08935, + "grad_norm": 0.5952483131640884, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 8935 + }, + { + "epoch": 0.08936, + "grad_norm": 0.6156724481814724, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 8936 + }, + { + "epoch": 0.08937, + "grad_norm": 0.5931999695378637, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 8937 + }, + { + "epoch": 0.08938, + "grad_norm": 0.6894161685221871, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 8938 + }, + { + "epoch": 0.08939, + "grad_norm": 0.7627353264346703, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 8939 + }, + { + "epoch": 0.0894, + "grad_norm": 0.7356138960546812, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 8940 + }, + { + "epoch": 0.08941, + "grad_norm": 0.6420508209011688, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 8941 + }, + { + "epoch": 0.08942, + "grad_norm": 0.6518833670204465, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 8942 + }, + { + "epoch": 0.08943, + "grad_norm": 0.6146219772409397, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 8943 + }, + { + "epoch": 0.08944, + "grad_norm": 0.6396512737329096, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 8944 + }, + { + "epoch": 0.08945, + "grad_norm": 0.6374698041220779, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 8945 + }, + { + "epoch": 0.08946, + "grad_norm": 0.7068156644089753, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 8946 + }, + { + "epoch": 0.08947, + "grad_norm": 0.845317356111474, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 8947 + }, + { + "epoch": 0.08948, + "grad_norm": 1.0368306661318563, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 8948 + }, + { + "epoch": 0.08949, + "grad_norm": 1.198555666195333, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 8949 + }, + { + "epoch": 0.0895, + "grad_norm": 0.5996385514208301, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 8950 + }, + { + "epoch": 0.08951, + "grad_norm": 0.7108449111037725, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 8951 + }, + { + "epoch": 0.08952, + "grad_norm": 0.855830753478774, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 8952 + }, + { + "epoch": 0.08953, + "grad_norm": 0.9206337595900704, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 8953 + }, + { + "epoch": 0.08954, + "grad_norm": 0.812196407142358, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 8954 + }, + { + "epoch": 0.08955, + "grad_norm": 0.7982919359283707, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 8955 + }, + { + "epoch": 0.08956, + "grad_norm": 0.8635629919504556, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 8956 + }, + { + "epoch": 0.08957, + "grad_norm": 0.9322894837024904, + "learning_rate": 0.003, + "loss": 4.118, + "step": 8957 + }, + { + "epoch": 0.08958, + "grad_norm": 0.9187549703391346, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 8958 + }, + { + "epoch": 0.08959, + "grad_norm": 0.9062695515862552, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 8959 + }, + { + "epoch": 0.0896, + "grad_norm": 0.8138333486268774, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 8960 + }, + { + "epoch": 0.08961, + "grad_norm": 0.8118676867241735, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 8961 + }, + { + "epoch": 0.08962, + "grad_norm": 0.915869089395175, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 8962 + }, + { + "epoch": 0.08963, + "grad_norm": 1.105783806141911, + "learning_rate": 0.003, + "loss": 4.1452, + "step": 8963 + }, + { + "epoch": 0.08964, + "grad_norm": 0.9871252968891374, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 8964 + }, + { + "epoch": 0.08965, + "grad_norm": 0.9621593755289617, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 8965 + }, + { + "epoch": 0.08966, + "grad_norm": 0.8845068329445493, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 8966 + }, + { + "epoch": 0.08967, + "grad_norm": 0.9262452245654436, + "learning_rate": 0.003, + "loss": 4.1443, + "step": 8967 + }, + { + "epoch": 0.08968, + "grad_norm": 1.0690148845058771, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 8968 + }, + { + "epoch": 0.08969, + "grad_norm": 0.9874560303810536, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 8969 + }, + { + "epoch": 0.0897, + "grad_norm": 1.1933287056871495, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 8970 + }, + { + "epoch": 0.08971, + "grad_norm": 0.83496478128569, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 8971 + }, + { + "epoch": 0.08972, + "grad_norm": 0.8851270091462285, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8972 + }, + { + "epoch": 0.08973, + "grad_norm": 0.7361881327430853, + "learning_rate": 0.003, + "loss": 4.139, + "step": 8973 + }, + { + "epoch": 0.08974, + "grad_norm": 0.6947872113441466, + "learning_rate": 0.003, + "loss": 4.1308, + "step": 8974 + }, + { + "epoch": 0.08975, + "grad_norm": 0.6448548962054766, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 8975 + }, + { + "epoch": 0.08976, + "grad_norm": 0.6115477618993307, + "learning_rate": 0.003, + "loss": 4.12, + "step": 8976 + }, + { + "epoch": 0.08977, + "grad_norm": 0.6406580925867694, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 8977 + }, + { + "epoch": 0.08978, + "grad_norm": 0.5844796689209106, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 8978 + }, + { + "epoch": 0.08979, + "grad_norm": 0.4899670091468122, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 8979 + }, + { + "epoch": 0.0898, + "grad_norm": 0.45907274050387364, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 8980 + }, + { + "epoch": 0.08981, + "grad_norm": 0.4787661572615123, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 8981 + }, + { + "epoch": 0.08982, + "grad_norm": 0.5314004944865418, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 8982 + }, + { + "epoch": 0.08983, + "grad_norm": 0.5367270316206084, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 8983 + }, + { + "epoch": 0.08984, + "grad_norm": 0.6099160821198948, + "learning_rate": 0.003, + "loss": 4.111, + "step": 8984 + }, + { + "epoch": 0.08985, + "grad_norm": 0.6151453422803892, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 8985 + }, + { + "epoch": 0.08986, + "grad_norm": 0.6431362909917693, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 8986 + }, + { + "epoch": 0.08987, + "grad_norm": 0.6245835812002626, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 8987 + }, + { + "epoch": 0.08988, + "grad_norm": 0.65684747848799, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 8988 + }, + { + "epoch": 0.08989, + "grad_norm": 0.7624170463908312, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 8989 + }, + { + "epoch": 0.0899, + "grad_norm": 0.7803016187618669, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 8990 + }, + { + "epoch": 0.08991, + "grad_norm": 0.6199387610169692, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 8991 + }, + { + "epoch": 0.08992, + "grad_norm": 0.6328852031482564, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 8992 + }, + { + "epoch": 0.08993, + "grad_norm": 0.6444379781150428, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 8993 + }, + { + "epoch": 0.08994, + "grad_norm": 0.7004376193465449, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 8994 + }, + { + "epoch": 0.08995, + "grad_norm": 0.7021581496420412, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 8995 + }, + { + "epoch": 0.08996, + "grad_norm": 0.735896084022374, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 8996 + }, + { + "epoch": 0.08997, + "grad_norm": 0.7654208285833823, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 8997 + }, + { + "epoch": 0.08998, + "grad_norm": 0.8110039047646157, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 8998 + }, + { + "epoch": 0.08999, + "grad_norm": 0.9354160143582295, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 8999 + }, + { + "epoch": 0.09, + "grad_norm": 0.9976774961172428, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 9000 + }, + { + "epoch": 0.09001, + "grad_norm": 0.9736972254727577, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 9001 + }, + { + "epoch": 0.09002, + "grad_norm": 0.9273588401300575, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 9002 + }, + { + "epoch": 0.09003, + "grad_norm": 0.9369574420893962, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 9003 + }, + { + "epoch": 0.09004, + "grad_norm": 0.8974590725426277, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 9004 + }, + { + "epoch": 0.09005, + "grad_norm": 0.9593522302258589, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 9005 + }, + { + "epoch": 0.09006, + "grad_norm": 1.148323754626388, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 9006 + }, + { + "epoch": 0.09007, + "grad_norm": 0.7314690534751829, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 9007 + }, + { + "epoch": 0.09008, + "grad_norm": 0.7435332389856724, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 9008 + }, + { + "epoch": 0.09009, + "grad_norm": 0.7078932397563384, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 9009 + }, + { + "epoch": 0.0901, + "grad_norm": 0.7615661108195854, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 9010 + }, + { + "epoch": 0.09011, + "grad_norm": 0.6793695558724677, + "learning_rate": 0.003, + "loss": 4.093, + "step": 9011 + }, + { + "epoch": 0.09012, + "grad_norm": 0.5832604883899358, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 9012 + }, + { + "epoch": 0.09013, + "grad_norm": 0.6009269936700211, + "learning_rate": 0.003, + "loss": 4.081, + "step": 9013 + }, + { + "epoch": 0.09014, + "grad_norm": 0.5849630009375443, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 9014 + }, + { + "epoch": 0.09015, + "grad_norm": 0.6889294107758397, + "learning_rate": 0.003, + "loss": 4.1277, + "step": 9015 + }, + { + "epoch": 0.09016, + "grad_norm": 0.7296383845886047, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 9016 + }, + { + "epoch": 0.09017, + "grad_norm": 0.6317455085918721, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 9017 + }, + { + "epoch": 0.09018, + "grad_norm": 0.6094236570193985, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 9018 + }, + { + "epoch": 0.09019, + "grad_norm": 0.6782840216217128, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 9019 + }, + { + "epoch": 0.0902, + "grad_norm": 0.7866077854724068, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 9020 + }, + { + "epoch": 0.09021, + "grad_norm": 0.9385063075453379, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 9021 + }, + { + "epoch": 0.09022, + "grad_norm": 1.0620409463514187, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 9022 + }, + { + "epoch": 0.09023, + "grad_norm": 0.7658741306051069, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 9023 + }, + { + "epoch": 0.09024, + "grad_norm": 0.6814552110746341, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 9024 + }, + { + "epoch": 0.09025, + "grad_norm": 0.7730019678384572, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9025 + }, + { + "epoch": 0.09026, + "grad_norm": 0.7526237882680384, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9026 + }, + { + "epoch": 0.09027, + "grad_norm": 0.6507122800031654, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 9027 + }, + { + "epoch": 0.09028, + "grad_norm": 0.6004305708920464, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9028 + }, + { + "epoch": 0.09029, + "grad_norm": 0.524677033109727, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 9029 + }, + { + "epoch": 0.0903, + "grad_norm": 0.6033650045330714, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 9030 + }, + { + "epoch": 0.09031, + "grad_norm": 0.5871602565860976, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 9031 + }, + { + "epoch": 0.09032, + "grad_norm": 0.6156339772467885, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 9032 + }, + { + "epoch": 0.09033, + "grad_norm": 0.5735688885353999, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 9033 + }, + { + "epoch": 0.09034, + "grad_norm": 0.5650782637932048, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 9034 + }, + { + "epoch": 0.09035, + "grad_norm": 0.6052372993053132, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 9035 + }, + { + "epoch": 0.09036, + "grad_norm": 0.5476010632727801, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 9036 + }, + { + "epoch": 0.09037, + "grad_norm": 0.583873535910786, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 9037 + }, + { + "epoch": 0.09038, + "grad_norm": 0.7287923974736967, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 9038 + }, + { + "epoch": 0.09039, + "grad_norm": 0.9144241018735565, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 9039 + }, + { + "epoch": 0.0904, + "grad_norm": 1.0063720204208155, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 9040 + }, + { + "epoch": 0.09041, + "grad_norm": 0.9856961456597553, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 9041 + }, + { + "epoch": 0.09042, + "grad_norm": 0.854851014929688, + "learning_rate": 0.003, + "loss": 4.079, + "step": 9042 + }, + { + "epoch": 0.09043, + "grad_norm": 0.888431844356114, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9043 + }, + { + "epoch": 0.09044, + "grad_norm": 1.017236877807041, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 9044 + }, + { + "epoch": 0.09045, + "grad_norm": 1.031807206439807, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 9045 + }, + { + "epoch": 0.09046, + "grad_norm": 0.85682286882739, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 9046 + }, + { + "epoch": 0.09047, + "grad_norm": 0.879826141603597, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 9047 + }, + { + "epoch": 0.09048, + "grad_norm": 0.8831965636813381, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 9048 + }, + { + "epoch": 0.09049, + "grad_norm": 0.9392095074031075, + "learning_rate": 0.003, + "loss": 4.1268, + "step": 9049 + }, + { + "epoch": 0.0905, + "grad_norm": 0.7828170543785254, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 9050 + }, + { + "epoch": 0.09051, + "grad_norm": 0.8339468015119822, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9051 + }, + { + "epoch": 0.09052, + "grad_norm": 1.0336225007587214, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 9052 + }, + { + "epoch": 0.09053, + "grad_norm": 1.1758495676250011, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 9053 + }, + { + "epoch": 0.09054, + "grad_norm": 0.6902237254747, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 9054 + }, + { + "epoch": 0.09055, + "grad_norm": 0.6385588548368929, + "learning_rate": 0.003, + "loss": 4.08, + "step": 9055 + }, + { + "epoch": 0.09056, + "grad_norm": 0.5682201761251565, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 9056 + }, + { + "epoch": 0.09057, + "grad_norm": 0.6058063245223638, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 9057 + }, + { + "epoch": 0.09058, + "grad_norm": 0.6296895298242686, + "learning_rate": 0.003, + "loss": 4.116, + "step": 9058 + }, + { + "epoch": 0.09059, + "grad_norm": 0.6548379189063352, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 9059 + }, + { + "epoch": 0.0906, + "grad_norm": 0.556996825598181, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9060 + }, + { + "epoch": 0.09061, + "grad_norm": 0.521158274387155, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 9061 + }, + { + "epoch": 0.09062, + "grad_norm": 0.49652995554371476, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 9062 + }, + { + "epoch": 0.09063, + "grad_norm": 0.4672371999627178, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 9063 + }, + { + "epoch": 0.09064, + "grad_norm": 0.39411015877585076, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 9064 + }, + { + "epoch": 0.09065, + "grad_norm": 0.4138073282094934, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 9065 + }, + { + "epoch": 0.09066, + "grad_norm": 0.4761901801806996, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 9066 + }, + { + "epoch": 0.09067, + "grad_norm": 0.529331487923414, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 9067 + }, + { + "epoch": 0.09068, + "grad_norm": 0.5533215938891374, + "learning_rate": 0.003, + "loss": 4.088, + "step": 9068 + }, + { + "epoch": 0.09069, + "grad_norm": 0.6302304828764494, + "learning_rate": 0.003, + "loss": 4.1273, + "step": 9069 + }, + { + "epoch": 0.0907, + "grad_norm": 0.7733370160113519, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 9070 + }, + { + "epoch": 0.09071, + "grad_norm": 0.9201950339828795, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 9071 + }, + { + "epoch": 0.09072, + "grad_norm": 1.0000279216059271, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 9072 + }, + { + "epoch": 0.09073, + "grad_norm": 0.9703960584155082, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 9073 + }, + { + "epoch": 0.09074, + "grad_norm": 0.7378599237261866, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 9074 + }, + { + "epoch": 0.09075, + "grad_norm": 0.6334621773210859, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 9075 + }, + { + "epoch": 0.09076, + "grad_norm": 0.7233595328453737, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 9076 + }, + { + "epoch": 0.09077, + "grad_norm": 0.6798870748945902, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 9077 + }, + { + "epoch": 0.09078, + "grad_norm": 0.6666031376042267, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 9078 + }, + { + "epoch": 0.09079, + "grad_norm": 0.8196363695329622, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 9079 + }, + { + "epoch": 0.0908, + "grad_norm": 0.8369563991262994, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 9080 + }, + { + "epoch": 0.09081, + "grad_norm": 0.7879366661568333, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 9081 + }, + { + "epoch": 0.09082, + "grad_norm": 0.9098587816411382, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9082 + }, + { + "epoch": 0.09083, + "grad_norm": 0.9134982635005068, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 9083 + }, + { + "epoch": 0.09084, + "grad_norm": 0.9505568276434497, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 9084 + }, + { + "epoch": 0.09085, + "grad_norm": 0.8638717507088994, + "learning_rate": 0.003, + "loss": 4.116, + "step": 9085 + }, + { + "epoch": 0.09086, + "grad_norm": 0.8895115781730211, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 9086 + }, + { + "epoch": 0.09087, + "grad_norm": 1.060170360539398, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 9087 + }, + { + "epoch": 0.09088, + "grad_norm": 0.901927345203865, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9088 + }, + { + "epoch": 0.09089, + "grad_norm": 0.785289100063639, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 9089 + }, + { + "epoch": 0.0909, + "grad_norm": 0.8620215806982862, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 9090 + }, + { + "epoch": 0.09091, + "grad_norm": 0.7979739519204804, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 9091 + }, + { + "epoch": 0.09092, + "grad_norm": 0.8103723620422554, + "learning_rate": 0.003, + "loss": 4.1311, + "step": 9092 + }, + { + "epoch": 0.09093, + "grad_norm": 0.7761450446921329, + "learning_rate": 0.003, + "loss": 4.089, + "step": 9093 + }, + { + "epoch": 0.09094, + "grad_norm": 0.7568316928415706, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 9094 + }, + { + "epoch": 0.09095, + "grad_norm": 0.759347481387229, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 9095 + }, + { + "epoch": 0.09096, + "grad_norm": 0.766744250041468, + "learning_rate": 0.003, + "loss": 4.087, + "step": 9096 + }, + { + "epoch": 0.09097, + "grad_norm": 0.7055484847046865, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 9097 + }, + { + "epoch": 0.09098, + "grad_norm": 0.7249870657972629, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 9098 + }, + { + "epoch": 0.09099, + "grad_norm": 0.794584702637224, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 9099 + }, + { + "epoch": 0.091, + "grad_norm": 0.9292342497012281, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 9100 + }, + { + "epoch": 0.09101, + "grad_norm": 1.1151713569446484, + "learning_rate": 0.003, + "loss": 4.107, + "step": 9101 + }, + { + "epoch": 0.09102, + "grad_norm": 0.9856776803625353, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 9102 + }, + { + "epoch": 0.09103, + "grad_norm": 0.8883418202472645, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 9103 + }, + { + "epoch": 0.09104, + "grad_norm": 0.7209861355720855, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 9104 + }, + { + "epoch": 0.09105, + "grad_norm": 0.6974273614545563, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 9105 + }, + { + "epoch": 0.09106, + "grad_norm": 0.7573369118352933, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 9106 + }, + { + "epoch": 0.09107, + "grad_norm": 0.8573483561968565, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 9107 + }, + { + "epoch": 0.09108, + "grad_norm": 0.843416858845127, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 9108 + }, + { + "epoch": 0.09109, + "grad_norm": 0.7331572974610379, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 9109 + }, + { + "epoch": 0.0911, + "grad_norm": 0.6866482073169092, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 9110 + }, + { + "epoch": 0.09111, + "grad_norm": 0.6146584810372685, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 9111 + }, + { + "epoch": 0.09112, + "grad_norm": 0.6233531858518306, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 9112 + }, + { + "epoch": 0.09113, + "grad_norm": 0.6630440885467346, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 9113 + }, + { + "epoch": 0.09114, + "grad_norm": 0.6675611226061425, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 9114 + }, + { + "epoch": 0.09115, + "grad_norm": 0.6851264099217901, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 9115 + }, + { + "epoch": 0.09116, + "grad_norm": 0.6743396804700748, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 9116 + }, + { + "epoch": 0.09117, + "grad_norm": 0.6349134476822885, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9117 + }, + { + "epoch": 0.09118, + "grad_norm": 0.5734871976924563, + "learning_rate": 0.003, + "loss": 4.087, + "step": 9118 + }, + { + "epoch": 0.09119, + "grad_norm": 0.5722066825396821, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 9119 + }, + { + "epoch": 0.0912, + "grad_norm": 0.5747337761394117, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 9120 + }, + { + "epoch": 0.09121, + "grad_norm": 0.5703997923017384, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 9121 + }, + { + "epoch": 0.09122, + "grad_norm": 0.5937813885823265, + "learning_rate": 0.003, + "loss": 4.104, + "step": 9122 + }, + { + "epoch": 0.09123, + "grad_norm": 0.5793339846399808, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 9123 + }, + { + "epoch": 0.09124, + "grad_norm": 0.7079624241350131, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 9124 + }, + { + "epoch": 0.09125, + "grad_norm": 0.9363241418641047, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 9125 + }, + { + "epoch": 0.09126, + "grad_norm": 1.123535275176799, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 9126 + }, + { + "epoch": 0.09127, + "grad_norm": 0.9098450109303394, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 9127 + }, + { + "epoch": 0.09128, + "grad_norm": 0.7695802519035674, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 9128 + }, + { + "epoch": 0.09129, + "grad_norm": 0.6725773758279523, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 9129 + }, + { + "epoch": 0.0913, + "grad_norm": 0.7516703750695625, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 9130 + }, + { + "epoch": 0.09131, + "grad_norm": 0.7232356676372639, + "learning_rate": 0.003, + "loss": 4.078, + "step": 9131 + }, + { + "epoch": 0.09132, + "grad_norm": 0.6859783867977578, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 9132 + }, + { + "epoch": 0.09133, + "grad_norm": 0.6441924486606353, + "learning_rate": 0.003, + "loss": 4.084, + "step": 9133 + }, + { + "epoch": 0.09134, + "grad_norm": 0.655918695866971, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 9134 + }, + { + "epoch": 0.09135, + "grad_norm": 0.7860018696069528, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9135 + }, + { + "epoch": 0.09136, + "grad_norm": 0.901439083024281, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 9136 + }, + { + "epoch": 0.09137, + "grad_norm": 1.0327495887186917, + "learning_rate": 0.003, + "loss": 4.128, + "step": 9137 + }, + { + "epoch": 0.09138, + "grad_norm": 0.9559641464369752, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 9138 + }, + { + "epoch": 0.09139, + "grad_norm": 0.8021385645026318, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 9139 + }, + { + "epoch": 0.0914, + "grad_norm": 0.6983879716817356, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 9140 + }, + { + "epoch": 0.09141, + "grad_norm": 0.6229466794575317, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 9141 + }, + { + "epoch": 0.09142, + "grad_norm": 0.5394263568511116, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 9142 + }, + { + "epoch": 0.09143, + "grad_norm": 0.5799460322868719, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 9143 + }, + { + "epoch": 0.09144, + "grad_norm": 0.7115794520642408, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 9144 + }, + { + "epoch": 0.09145, + "grad_norm": 0.9250322234700112, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 9145 + }, + { + "epoch": 0.09146, + "grad_norm": 1.1493765993374974, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 9146 + }, + { + "epoch": 0.09147, + "grad_norm": 0.6766953092233843, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 9147 + }, + { + "epoch": 0.09148, + "grad_norm": 0.615481970405836, + "learning_rate": 0.003, + "loss": 4.102, + "step": 9148 + }, + { + "epoch": 0.09149, + "grad_norm": 0.6468342965420243, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 9149 + }, + { + "epoch": 0.0915, + "grad_norm": 0.5695495986990226, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 9150 + }, + { + "epoch": 0.09151, + "grad_norm": 0.6322625762493587, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 9151 + }, + { + "epoch": 0.09152, + "grad_norm": 0.7095634250401223, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 9152 + }, + { + "epoch": 0.09153, + "grad_norm": 0.7562377869290767, + "learning_rate": 0.003, + "loss": 4.092, + "step": 9153 + }, + { + "epoch": 0.09154, + "grad_norm": 0.730100776007017, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 9154 + }, + { + "epoch": 0.09155, + "grad_norm": 0.6476046997499159, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 9155 + }, + { + "epoch": 0.09156, + "grad_norm": 0.5966611216224073, + "learning_rate": 0.003, + "loss": 4.092, + "step": 9156 + }, + { + "epoch": 0.09157, + "grad_norm": 0.729366774751956, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 9157 + }, + { + "epoch": 0.09158, + "grad_norm": 0.9225886030022502, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 9158 + }, + { + "epoch": 0.09159, + "grad_norm": 1.0003949696131078, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 9159 + }, + { + "epoch": 0.0916, + "grad_norm": 0.8838504831163226, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 9160 + }, + { + "epoch": 0.09161, + "grad_norm": 1.0435641927134112, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 9161 + }, + { + "epoch": 0.09162, + "grad_norm": 0.9849828677019289, + "learning_rate": 0.003, + "loss": 4.131, + "step": 9162 + }, + { + "epoch": 0.09163, + "grad_norm": 0.8423932570011732, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 9163 + }, + { + "epoch": 0.09164, + "grad_norm": 0.8118112808914887, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 9164 + }, + { + "epoch": 0.09165, + "grad_norm": 0.696647926319978, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 9165 + }, + { + "epoch": 0.09166, + "grad_norm": 0.7050748772822608, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 9166 + }, + { + "epoch": 0.09167, + "grad_norm": 0.6984810250947892, + "learning_rate": 0.003, + "loss": 4.082, + "step": 9167 + }, + { + "epoch": 0.09168, + "grad_norm": 0.7429002727610159, + "learning_rate": 0.003, + "loss": 4.075, + "step": 9168 + }, + { + "epoch": 0.09169, + "grad_norm": 0.8700828623089742, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 9169 + }, + { + "epoch": 0.0917, + "grad_norm": 0.8255131562046868, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 9170 + }, + { + "epoch": 0.09171, + "grad_norm": 0.8257708158169661, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 9171 + }, + { + "epoch": 0.09172, + "grad_norm": 0.9821171250112688, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 9172 + }, + { + "epoch": 0.09173, + "grad_norm": 1.1101867989693224, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 9173 + }, + { + "epoch": 0.09174, + "grad_norm": 0.8606212362625643, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 9174 + }, + { + "epoch": 0.09175, + "grad_norm": 0.7238484783389175, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 9175 + }, + { + "epoch": 0.09176, + "grad_norm": 0.7778471274442456, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 9176 + }, + { + "epoch": 0.09177, + "grad_norm": 0.7942453477102529, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 9177 + }, + { + "epoch": 0.09178, + "grad_norm": 0.6875439916755981, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 9178 + }, + { + "epoch": 0.09179, + "grad_norm": 0.7054405486618374, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 9179 + }, + { + "epoch": 0.0918, + "grad_norm": 0.7085929286496793, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 9180 + }, + { + "epoch": 0.09181, + "grad_norm": 0.6868386510039214, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 9181 + }, + { + "epoch": 0.09182, + "grad_norm": 0.693686693395854, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 9182 + }, + { + "epoch": 0.09183, + "grad_norm": 0.6969673134627956, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 9183 + }, + { + "epoch": 0.09184, + "grad_norm": 0.6607879678388875, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 9184 + }, + { + "epoch": 0.09185, + "grad_norm": 0.6097277133504103, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 9185 + }, + { + "epoch": 0.09186, + "grad_norm": 0.6707275075667879, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 9186 + }, + { + "epoch": 0.09187, + "grad_norm": 0.631051696187288, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 9187 + }, + { + "epoch": 0.09188, + "grad_norm": 0.5850735017399692, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9188 + }, + { + "epoch": 0.09189, + "grad_norm": 0.5909062131280334, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 9189 + }, + { + "epoch": 0.0919, + "grad_norm": 0.5804461230556179, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 9190 + }, + { + "epoch": 0.09191, + "grad_norm": 0.74233665537322, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 9191 + }, + { + "epoch": 0.09192, + "grad_norm": 1.0070740958090563, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 9192 + }, + { + "epoch": 0.09193, + "grad_norm": 1.3162433610093374, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 9193 + }, + { + "epoch": 0.09194, + "grad_norm": 0.4948613105868042, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9194 + }, + { + "epoch": 0.09195, + "grad_norm": 0.683388327325886, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 9195 + }, + { + "epoch": 0.09196, + "grad_norm": 0.8065888189569945, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 9196 + }, + { + "epoch": 0.09197, + "grad_norm": 0.7584747106256147, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 9197 + }, + { + "epoch": 0.09198, + "grad_norm": 0.7753320297151186, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 9198 + }, + { + "epoch": 0.09199, + "grad_norm": 0.7824527448547508, + "learning_rate": 0.003, + "loss": 4.104, + "step": 9199 + }, + { + "epoch": 0.092, + "grad_norm": 0.7775469360725809, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 9200 + }, + { + "epoch": 0.09201, + "grad_norm": 0.7680517906058487, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 9201 + }, + { + "epoch": 0.09202, + "grad_norm": 0.8531435904229961, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 9202 + }, + { + "epoch": 0.09203, + "grad_norm": 0.8780606886961511, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 9203 + }, + { + "epoch": 0.09204, + "grad_norm": 0.7891522389637466, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 9204 + }, + { + "epoch": 0.09205, + "grad_norm": 0.7455931059386063, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 9205 + }, + { + "epoch": 0.09206, + "grad_norm": 0.7535666910826962, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 9206 + }, + { + "epoch": 0.09207, + "grad_norm": 0.6917320820801799, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 9207 + }, + { + "epoch": 0.09208, + "grad_norm": 0.704126699875504, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 9208 + }, + { + "epoch": 0.09209, + "grad_norm": 0.8523565990050219, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 9209 + }, + { + "epoch": 0.0921, + "grad_norm": 0.9685150489510521, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 9210 + }, + { + "epoch": 0.09211, + "grad_norm": 1.1138999636053162, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 9211 + }, + { + "epoch": 0.09212, + "grad_norm": 0.7870842232400849, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 9212 + }, + { + "epoch": 0.09213, + "grad_norm": 0.7697833258515777, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 9213 + }, + { + "epoch": 0.09214, + "grad_norm": 0.6846278043413059, + "learning_rate": 0.003, + "loss": 4.074, + "step": 9214 + }, + { + "epoch": 0.09215, + "grad_norm": 0.6743239347064265, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 9215 + }, + { + "epoch": 0.09216, + "grad_norm": 0.7010992201639914, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 9216 + }, + { + "epoch": 0.09217, + "grad_norm": 0.7476228942446806, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 9217 + }, + { + "epoch": 0.09218, + "grad_norm": 0.8348321650225655, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 9218 + }, + { + "epoch": 0.09219, + "grad_norm": 0.9157211023791693, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 9219 + }, + { + "epoch": 0.0922, + "grad_norm": 0.83049456838394, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 9220 + }, + { + "epoch": 0.09221, + "grad_norm": 0.6939065160104407, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 9221 + }, + { + "epoch": 0.09222, + "grad_norm": 0.692227128387877, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 9222 + }, + { + "epoch": 0.09223, + "grad_norm": 0.6206638261808557, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 9223 + }, + { + "epoch": 0.09224, + "grad_norm": 0.6155175720948994, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 9224 + }, + { + "epoch": 0.09225, + "grad_norm": 0.5763438884147025, + "learning_rate": 0.003, + "loss": 4.075, + "step": 9225 + }, + { + "epoch": 0.09226, + "grad_norm": 0.5792522714322638, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 9226 + }, + { + "epoch": 0.09227, + "grad_norm": 0.5698582837332627, + "learning_rate": 0.003, + "loss": 4.078, + "step": 9227 + }, + { + "epoch": 0.09228, + "grad_norm": 0.5554181806254156, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 9228 + }, + { + "epoch": 0.09229, + "grad_norm": 0.6496771344535306, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 9229 + }, + { + "epoch": 0.0923, + "grad_norm": 0.8079631630379661, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 9230 + }, + { + "epoch": 0.09231, + "grad_norm": 1.001327955726392, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 9231 + }, + { + "epoch": 0.09232, + "grad_norm": 1.0151152557949434, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 9232 + }, + { + "epoch": 0.09233, + "grad_norm": 0.7597476613283288, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 9233 + }, + { + "epoch": 0.09234, + "grad_norm": 0.7793623041860894, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 9234 + }, + { + "epoch": 0.09235, + "grad_norm": 0.8181572642649179, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 9235 + }, + { + "epoch": 0.09236, + "grad_norm": 0.7748209029826961, + "learning_rate": 0.003, + "loss": 4.1289, + "step": 9236 + }, + { + "epoch": 0.09237, + "grad_norm": 0.7417587962678789, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 9237 + }, + { + "epoch": 0.09238, + "grad_norm": 0.7302829503235564, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 9238 + }, + { + "epoch": 0.09239, + "grad_norm": 0.8200299371821417, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 9239 + }, + { + "epoch": 0.0924, + "grad_norm": 0.7745701228788062, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 9240 + }, + { + "epoch": 0.09241, + "grad_norm": 0.8722251162417323, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 9241 + }, + { + "epoch": 0.09242, + "grad_norm": 0.890375969288489, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 9242 + }, + { + "epoch": 0.09243, + "grad_norm": 0.8425579306067554, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 9243 + }, + { + "epoch": 0.09244, + "grad_norm": 0.750374270081524, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 9244 + }, + { + "epoch": 0.09245, + "grad_norm": 0.7433474953602284, + "learning_rate": 0.003, + "loss": 4.079, + "step": 9245 + }, + { + "epoch": 0.09246, + "grad_norm": 0.8399736354443268, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 9246 + }, + { + "epoch": 0.09247, + "grad_norm": 0.8952276071336189, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 9247 + }, + { + "epoch": 0.09248, + "grad_norm": 0.9460058010349833, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 9248 + }, + { + "epoch": 0.09249, + "grad_norm": 1.028348281495935, + "learning_rate": 0.003, + "loss": 4.1543, + "step": 9249 + }, + { + "epoch": 0.0925, + "grad_norm": 0.8994505559170306, + "learning_rate": 0.003, + "loss": 4.121, + "step": 9250 + }, + { + "epoch": 0.09251, + "grad_norm": 0.8661162024839089, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 9251 + }, + { + "epoch": 0.09252, + "grad_norm": 0.7576653757537349, + "learning_rate": 0.003, + "loss": 4.1253, + "step": 9252 + }, + { + "epoch": 0.09253, + "grad_norm": 0.7300578749253667, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9253 + }, + { + "epoch": 0.09254, + "grad_norm": 0.6655033765923661, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 9254 + }, + { + "epoch": 0.09255, + "grad_norm": 0.67210004385522, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 9255 + }, + { + "epoch": 0.09256, + "grad_norm": 0.6009374878800777, + "learning_rate": 0.003, + "loss": 4.098, + "step": 9256 + }, + { + "epoch": 0.09257, + "grad_norm": 0.5939860882674391, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9257 + }, + { + "epoch": 0.09258, + "grad_norm": 0.6273131160536646, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 9258 + }, + { + "epoch": 0.09259, + "grad_norm": 0.6931906821491065, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 9259 + }, + { + "epoch": 0.0926, + "grad_norm": 0.6843836039174134, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 9260 + }, + { + "epoch": 0.09261, + "grad_norm": 0.664844852067105, + "learning_rate": 0.003, + "loss": 4.071, + "step": 9261 + }, + { + "epoch": 0.09262, + "grad_norm": 0.7149590284934766, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 9262 + }, + { + "epoch": 0.09263, + "grad_norm": 0.799686238491033, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9263 + }, + { + "epoch": 0.09264, + "grad_norm": 0.9184087623202745, + "learning_rate": 0.003, + "loss": 4.1361, + "step": 9264 + }, + { + "epoch": 0.09265, + "grad_norm": 0.8917999340633649, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9265 + }, + { + "epoch": 0.09266, + "grad_norm": 1.0690273464224147, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 9266 + }, + { + "epoch": 0.09267, + "grad_norm": 0.9224213714197611, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 9267 + }, + { + "epoch": 0.09268, + "grad_norm": 0.7819872181594957, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9268 + }, + { + "epoch": 0.09269, + "grad_norm": 0.8903337895871032, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 9269 + }, + { + "epoch": 0.0927, + "grad_norm": 0.8789410698039444, + "learning_rate": 0.003, + "loss": 4.1291, + "step": 9270 + }, + { + "epoch": 0.09271, + "grad_norm": 0.8151226774452855, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 9271 + }, + { + "epoch": 0.09272, + "grad_norm": 0.6687056930008045, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 9272 + }, + { + "epoch": 0.09273, + "grad_norm": 0.6975950766902023, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 9273 + }, + { + "epoch": 0.09274, + "grad_norm": 0.704466186613381, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 9274 + }, + { + "epoch": 0.09275, + "grad_norm": 0.7505976895896498, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 9275 + }, + { + "epoch": 0.09276, + "grad_norm": 0.7764606026959862, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 9276 + }, + { + "epoch": 0.09277, + "grad_norm": 0.635369630015743, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 9277 + }, + { + "epoch": 0.09278, + "grad_norm": 0.6241312910359976, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 9278 + }, + { + "epoch": 0.09279, + "grad_norm": 0.6872238724869792, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 9279 + }, + { + "epoch": 0.0928, + "grad_norm": 0.7388825536520087, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 9280 + }, + { + "epoch": 0.09281, + "grad_norm": 0.7389024774211659, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 9281 + }, + { + "epoch": 0.09282, + "grad_norm": 0.7721013112557448, + "learning_rate": 0.003, + "loss": 4.1267, + "step": 9282 + }, + { + "epoch": 0.09283, + "grad_norm": 0.8336050701348083, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 9283 + }, + { + "epoch": 0.09284, + "grad_norm": 1.0158256883508041, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 9284 + }, + { + "epoch": 0.09285, + "grad_norm": 1.18058053789904, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 9285 + }, + { + "epoch": 0.09286, + "grad_norm": 0.8343774515252792, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 9286 + }, + { + "epoch": 0.09287, + "grad_norm": 0.7308200337417591, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 9287 + }, + { + "epoch": 0.09288, + "grad_norm": 0.6891633376361983, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 9288 + }, + { + "epoch": 0.09289, + "grad_norm": 0.6029232592146974, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 9289 + }, + { + "epoch": 0.0929, + "grad_norm": 0.5626416862126429, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 9290 + }, + { + "epoch": 0.09291, + "grad_norm": 0.5779583984794753, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 9291 + }, + { + "epoch": 0.09292, + "grad_norm": 0.5276189332666765, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 9292 + }, + { + "epoch": 0.09293, + "grad_norm": 0.4601373246181229, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 9293 + }, + { + "epoch": 0.09294, + "grad_norm": 0.5858557148598581, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 9294 + }, + { + "epoch": 0.09295, + "grad_norm": 0.6570320858383965, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 9295 + }, + { + "epoch": 0.09296, + "grad_norm": 0.7058986690585867, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 9296 + }, + { + "epoch": 0.09297, + "grad_norm": 0.7237851928274343, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 9297 + }, + { + "epoch": 0.09298, + "grad_norm": 0.7244115678072608, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 9298 + }, + { + "epoch": 0.09299, + "grad_norm": 0.7023860071795822, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 9299 + }, + { + "epoch": 0.093, + "grad_norm": 0.6893885792882112, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 9300 + }, + { + "epoch": 0.09301, + "grad_norm": 0.7771637507723111, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 9301 + }, + { + "epoch": 0.09302, + "grad_norm": 0.8353111718797918, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 9302 + }, + { + "epoch": 0.09303, + "grad_norm": 0.9443868135718669, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 9303 + }, + { + "epoch": 0.09304, + "grad_norm": 0.9340284309555337, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 9304 + }, + { + "epoch": 0.09305, + "grad_norm": 0.821105987408757, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 9305 + }, + { + "epoch": 0.09306, + "grad_norm": 0.6798841287919474, + "learning_rate": 0.003, + "loss": 4.088, + "step": 9306 + }, + { + "epoch": 0.09307, + "grad_norm": 0.6308321339032982, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 9307 + }, + { + "epoch": 0.09308, + "grad_norm": 0.6601870406757323, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 9308 + }, + { + "epoch": 0.09309, + "grad_norm": 0.6442557609343686, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 9309 + }, + { + "epoch": 0.0931, + "grad_norm": 0.6661782172293007, + "learning_rate": 0.003, + "loss": 4.1, + "step": 9310 + }, + { + "epoch": 0.09311, + "grad_norm": 0.694502121054022, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9311 + }, + { + "epoch": 0.09312, + "grad_norm": 0.7404429863152544, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 9312 + }, + { + "epoch": 0.09313, + "grad_norm": 0.7933542321293575, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 9313 + }, + { + "epoch": 0.09314, + "grad_norm": 0.801634188665558, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 9314 + }, + { + "epoch": 0.09315, + "grad_norm": 0.8599424000518597, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 9315 + }, + { + "epoch": 0.09316, + "grad_norm": 0.8488962903419432, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 9316 + }, + { + "epoch": 0.09317, + "grad_norm": 0.8383529764637065, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 9317 + }, + { + "epoch": 0.09318, + "grad_norm": 0.8558462806350178, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 9318 + }, + { + "epoch": 0.09319, + "grad_norm": 0.9413535849970353, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 9319 + }, + { + "epoch": 0.0932, + "grad_norm": 1.2104907947773857, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 9320 + }, + { + "epoch": 0.09321, + "grad_norm": 0.9438406966956778, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 9321 + }, + { + "epoch": 0.09322, + "grad_norm": 0.8654878853235997, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 9322 + }, + { + "epoch": 0.09323, + "grad_norm": 0.7716832268633257, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9323 + }, + { + "epoch": 0.09324, + "grad_norm": 0.9620871505328507, + "learning_rate": 0.003, + "loss": 4.1272, + "step": 9324 + }, + { + "epoch": 0.09325, + "grad_norm": 1.033201062271869, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 9325 + }, + { + "epoch": 0.09326, + "grad_norm": 0.9180758393766172, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 9326 + }, + { + "epoch": 0.09327, + "grad_norm": 0.9604068125223971, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 9327 + }, + { + "epoch": 0.09328, + "grad_norm": 0.8487522338913149, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 9328 + }, + { + "epoch": 0.09329, + "grad_norm": 0.7140038231334875, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 9329 + }, + { + "epoch": 0.0933, + "grad_norm": 0.6871213262271845, + "learning_rate": 0.003, + "loss": 4.1235, + "step": 9330 + }, + { + "epoch": 0.09331, + "grad_norm": 0.7546334768507528, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 9331 + }, + { + "epoch": 0.09332, + "grad_norm": 0.8370311645065874, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 9332 + }, + { + "epoch": 0.09333, + "grad_norm": 1.1267395055615177, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 9333 + }, + { + "epoch": 0.09334, + "grad_norm": 0.8851656694755474, + "learning_rate": 0.003, + "loss": 4.119, + "step": 9334 + }, + { + "epoch": 0.09335, + "grad_norm": 0.6895247005069166, + "learning_rate": 0.003, + "loss": 4.097, + "step": 9335 + }, + { + "epoch": 0.09336, + "grad_norm": 0.7529887012918819, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 9336 + }, + { + "epoch": 0.09337, + "grad_norm": 0.8968776641201162, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 9337 + }, + { + "epoch": 0.09338, + "grad_norm": 0.969190767069404, + "learning_rate": 0.003, + "loss": 4.1145, + "step": 9338 + }, + { + "epoch": 0.09339, + "grad_norm": 0.9096354758398183, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 9339 + }, + { + "epoch": 0.0934, + "grad_norm": 1.0248240766167955, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 9340 + }, + { + "epoch": 0.09341, + "grad_norm": 0.8411462432609955, + "learning_rate": 0.003, + "loss": 4.107, + "step": 9341 + }, + { + "epoch": 0.09342, + "grad_norm": 0.7516218020852315, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9342 + }, + { + "epoch": 0.09343, + "grad_norm": 0.8168975424156755, + "learning_rate": 0.003, + "loss": 4.101, + "step": 9343 + }, + { + "epoch": 0.09344, + "grad_norm": 0.8281071489261201, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 9344 + }, + { + "epoch": 0.09345, + "grad_norm": 0.7971176476434507, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 9345 + }, + { + "epoch": 0.09346, + "grad_norm": 0.7256703424187483, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 9346 + }, + { + "epoch": 0.09347, + "grad_norm": 0.625000462139122, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 9347 + }, + { + "epoch": 0.09348, + "grad_norm": 0.5650513642145435, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9348 + }, + { + "epoch": 0.09349, + "grad_norm": 0.5637070822633384, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 9349 + }, + { + "epoch": 0.0935, + "grad_norm": 0.5477607998488464, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 9350 + }, + { + "epoch": 0.09351, + "grad_norm": 0.5967568854844185, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 9351 + }, + { + "epoch": 0.09352, + "grad_norm": 0.6324408848296686, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 9352 + }, + { + "epoch": 0.09353, + "grad_norm": 0.5988050971610268, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 9353 + }, + { + "epoch": 0.09354, + "grad_norm": 0.6587621555860957, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 9354 + }, + { + "epoch": 0.09355, + "grad_norm": 0.6913770140531089, + "learning_rate": 0.003, + "loss": 4.104, + "step": 9355 + }, + { + "epoch": 0.09356, + "grad_norm": 0.8268308580950648, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9356 + }, + { + "epoch": 0.09357, + "grad_norm": 0.9832141390606829, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 9357 + }, + { + "epoch": 0.09358, + "grad_norm": 1.0264443343215965, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 9358 + }, + { + "epoch": 0.09359, + "grad_norm": 0.8022400111008331, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 9359 + }, + { + "epoch": 0.0936, + "grad_norm": 0.7232160641932127, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 9360 + }, + { + "epoch": 0.09361, + "grad_norm": 0.8345708225298007, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 9361 + }, + { + "epoch": 0.09362, + "grad_norm": 0.864583150263912, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 9362 + }, + { + "epoch": 0.09363, + "grad_norm": 0.8449596309061119, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 9363 + }, + { + "epoch": 0.09364, + "grad_norm": 0.721880849977252, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9364 + }, + { + "epoch": 0.09365, + "grad_norm": 0.6721169386273598, + "learning_rate": 0.003, + "loss": 4.11, + "step": 9365 + }, + { + "epoch": 0.09366, + "grad_norm": 0.5176745603655055, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 9366 + }, + { + "epoch": 0.09367, + "grad_norm": 0.5663365559445198, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 9367 + }, + { + "epoch": 0.09368, + "grad_norm": 0.5588855629457294, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 9368 + }, + { + "epoch": 0.09369, + "grad_norm": 0.558318290254828, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 9369 + }, + { + "epoch": 0.0937, + "grad_norm": 0.5777087869339129, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 9370 + }, + { + "epoch": 0.09371, + "grad_norm": 0.6683550838053252, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 9371 + }, + { + "epoch": 0.09372, + "grad_norm": 0.8106915641791952, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 9372 + }, + { + "epoch": 0.09373, + "grad_norm": 1.0420652181706669, + "learning_rate": 0.003, + "loss": 4.1327, + "step": 9373 + }, + { + "epoch": 0.09374, + "grad_norm": 0.9030870891200452, + "learning_rate": 0.003, + "loss": 4.1293, + "step": 9374 + }, + { + "epoch": 0.09375, + "grad_norm": 0.6642846536487922, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 9375 + }, + { + "epoch": 0.09376, + "grad_norm": 0.5926013599625012, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 9376 + }, + { + "epoch": 0.09377, + "grad_norm": 0.7333731763197545, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 9377 + }, + { + "epoch": 0.09378, + "grad_norm": 0.7205911448453179, + "learning_rate": 0.003, + "loss": 4.098, + "step": 9378 + }, + { + "epoch": 0.09379, + "grad_norm": 0.6261002234170417, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 9379 + }, + { + "epoch": 0.0938, + "grad_norm": 0.5831220315530415, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 9380 + }, + { + "epoch": 0.09381, + "grad_norm": 0.6453696033833166, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 9381 + }, + { + "epoch": 0.09382, + "grad_norm": 0.7447834448687087, + "learning_rate": 0.003, + "loss": 4.09, + "step": 9382 + }, + { + "epoch": 0.09383, + "grad_norm": 0.8017088185500696, + "learning_rate": 0.003, + "loss": 4.077, + "step": 9383 + }, + { + "epoch": 0.09384, + "grad_norm": 0.749499774993177, + "learning_rate": 0.003, + "loss": 4.106, + "step": 9384 + }, + { + "epoch": 0.09385, + "grad_norm": 0.7694390492566549, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 9385 + }, + { + "epoch": 0.09386, + "grad_norm": 0.8555158072501666, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 9386 + }, + { + "epoch": 0.09387, + "grad_norm": 0.8776130218746526, + "learning_rate": 0.003, + "loss": 4.101, + "step": 9387 + }, + { + "epoch": 0.09388, + "grad_norm": 0.9970607046729825, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 9388 + }, + { + "epoch": 0.09389, + "grad_norm": 1.0116948164111348, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 9389 + }, + { + "epoch": 0.0939, + "grad_norm": 0.8607572473294256, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 9390 + }, + { + "epoch": 0.09391, + "grad_norm": 0.8141023279481875, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 9391 + }, + { + "epoch": 0.09392, + "grad_norm": 0.8221337642937177, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 9392 + }, + { + "epoch": 0.09393, + "grad_norm": 0.8631165883014514, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 9393 + }, + { + "epoch": 0.09394, + "grad_norm": 0.8542446307614803, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 9394 + }, + { + "epoch": 0.09395, + "grad_norm": 0.7984540124391786, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 9395 + }, + { + "epoch": 0.09396, + "grad_norm": 0.833132121750485, + "learning_rate": 0.003, + "loss": 4.1332, + "step": 9396 + }, + { + "epoch": 0.09397, + "grad_norm": 0.9833442838193437, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 9397 + }, + { + "epoch": 0.09398, + "grad_norm": 1.003667576167635, + "learning_rate": 0.003, + "loss": 4.13, + "step": 9398 + }, + { + "epoch": 0.09399, + "grad_norm": 0.9559424149218053, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 9399 + }, + { + "epoch": 0.094, + "grad_norm": 0.9201692580630481, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 9400 + }, + { + "epoch": 0.09401, + "grad_norm": 0.8825427431804942, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 9401 + }, + { + "epoch": 0.09402, + "grad_norm": 0.7898577204069231, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 9402 + }, + { + "epoch": 0.09403, + "grad_norm": 0.7350505587815206, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 9403 + }, + { + "epoch": 0.09404, + "grad_norm": 0.7291684122942435, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 9404 + }, + { + "epoch": 0.09405, + "grad_norm": 0.8162712076769146, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 9405 + }, + { + "epoch": 0.09406, + "grad_norm": 0.9155173238132704, + "learning_rate": 0.003, + "loss": 4.1209, + "step": 9406 + }, + { + "epoch": 0.09407, + "grad_norm": 0.8628871245046202, + "learning_rate": 0.003, + "loss": 4.107, + "step": 9407 + }, + { + "epoch": 0.09408, + "grad_norm": 0.7194018791800877, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 9408 + }, + { + "epoch": 0.09409, + "grad_norm": 0.7409924782459923, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 9409 + }, + { + "epoch": 0.0941, + "grad_norm": 0.7794190789815792, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 9410 + }, + { + "epoch": 0.09411, + "grad_norm": 0.8734670824027785, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 9411 + }, + { + "epoch": 0.09412, + "grad_norm": 0.8088836225010004, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 9412 + }, + { + "epoch": 0.09413, + "grad_norm": 0.7602993796261419, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 9413 + }, + { + "epoch": 0.09414, + "grad_norm": 0.6810778606987302, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 9414 + }, + { + "epoch": 0.09415, + "grad_norm": 0.6032263201656475, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 9415 + }, + { + "epoch": 0.09416, + "grad_norm": 0.4846341007249403, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 9416 + }, + { + "epoch": 0.09417, + "grad_norm": 0.5073097020665416, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 9417 + }, + { + "epoch": 0.09418, + "grad_norm": 0.466812325936142, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 9418 + }, + { + "epoch": 0.09419, + "grad_norm": 0.4909393054849047, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 9419 + }, + { + "epoch": 0.0942, + "grad_norm": 0.4572606419258861, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 9420 + }, + { + "epoch": 0.09421, + "grad_norm": 0.5236689638625828, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 9421 + }, + { + "epoch": 0.09422, + "grad_norm": 0.663258154130139, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 9422 + }, + { + "epoch": 0.09423, + "grad_norm": 0.928899803447629, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 9423 + }, + { + "epoch": 0.09424, + "grad_norm": 1.1926656295302425, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 9424 + }, + { + "epoch": 0.09425, + "grad_norm": 0.6853441865571991, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 9425 + }, + { + "epoch": 0.09426, + "grad_norm": 0.5128317339031602, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 9426 + }, + { + "epoch": 0.09427, + "grad_norm": 0.7070587189992616, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 9427 + }, + { + "epoch": 0.09428, + "grad_norm": 0.773759955306798, + "learning_rate": 0.003, + "loss": 4.11, + "step": 9428 + }, + { + "epoch": 0.09429, + "grad_norm": 0.811718324620787, + "learning_rate": 0.003, + "loss": 4.08, + "step": 9429 + }, + { + "epoch": 0.0943, + "grad_norm": 0.7612959659265646, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9430 + }, + { + "epoch": 0.09431, + "grad_norm": 0.705635244969301, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9431 + }, + { + "epoch": 0.09432, + "grad_norm": 0.7185246612672866, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 9432 + }, + { + "epoch": 0.09433, + "grad_norm": 0.808636412731162, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 9433 + }, + { + "epoch": 0.09434, + "grad_norm": 0.8588858555452056, + "learning_rate": 0.003, + "loss": 4.073, + "step": 9434 + }, + { + "epoch": 0.09435, + "grad_norm": 0.8829432922785366, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 9435 + }, + { + "epoch": 0.09436, + "grad_norm": 0.8229332972229458, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 9436 + }, + { + "epoch": 0.09437, + "grad_norm": 0.7502237223787239, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 9437 + }, + { + "epoch": 0.09438, + "grad_norm": 0.6894340570648702, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 9438 + }, + { + "epoch": 0.09439, + "grad_norm": 0.7210313125158538, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 9439 + }, + { + "epoch": 0.0944, + "grad_norm": 0.7260397120709652, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 9440 + }, + { + "epoch": 0.09441, + "grad_norm": 0.7867602084595864, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 9441 + }, + { + "epoch": 0.09442, + "grad_norm": 0.8364820552569051, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 9442 + }, + { + "epoch": 0.09443, + "grad_norm": 0.9738365901239612, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 9443 + }, + { + "epoch": 0.09444, + "grad_norm": 1.162136240204578, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 9444 + }, + { + "epoch": 0.09445, + "grad_norm": 0.8068568465517928, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9445 + }, + { + "epoch": 0.09446, + "grad_norm": 0.7524396274055184, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 9446 + }, + { + "epoch": 0.09447, + "grad_norm": 0.6473920221674216, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 9447 + }, + { + "epoch": 0.09448, + "grad_norm": 0.6745944542419227, + "learning_rate": 0.003, + "loss": 4.099, + "step": 9448 + }, + { + "epoch": 0.09449, + "grad_norm": 0.6655279612546542, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 9449 + }, + { + "epoch": 0.0945, + "grad_norm": 0.6127094330682528, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9450 + }, + { + "epoch": 0.09451, + "grad_norm": 0.621361177849897, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 9451 + }, + { + "epoch": 0.09452, + "grad_norm": 0.6362015223505509, + "learning_rate": 0.003, + "loss": 4.103, + "step": 9452 + }, + { + "epoch": 0.09453, + "grad_norm": 0.813045169964411, + "learning_rate": 0.003, + "loss": 4.053, + "step": 9453 + }, + { + "epoch": 0.09454, + "grad_norm": 1.1347224007728371, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 9454 + }, + { + "epoch": 0.09455, + "grad_norm": 0.8959042657473294, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 9455 + }, + { + "epoch": 0.09456, + "grad_norm": 0.7685184606728515, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 9456 + }, + { + "epoch": 0.09457, + "grad_norm": 0.7342695223426569, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 9457 + }, + { + "epoch": 0.09458, + "grad_norm": 0.8272094526351071, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9458 + }, + { + "epoch": 0.09459, + "grad_norm": 0.8913799983064666, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 9459 + }, + { + "epoch": 0.0946, + "grad_norm": 0.7914931766921125, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 9460 + }, + { + "epoch": 0.09461, + "grad_norm": 0.7143667270257789, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 9461 + }, + { + "epoch": 0.09462, + "grad_norm": 0.7155076180457024, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 9462 + }, + { + "epoch": 0.09463, + "grad_norm": 0.689792898507005, + "learning_rate": 0.003, + "loss": 4.078, + "step": 9463 + }, + { + "epoch": 0.09464, + "grad_norm": 0.7023738600071371, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 9464 + }, + { + "epoch": 0.09465, + "grad_norm": 0.7187786079383829, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 9465 + }, + { + "epoch": 0.09466, + "grad_norm": 0.6302140939859505, + "learning_rate": 0.003, + "loss": 4.095, + "step": 9466 + }, + { + "epoch": 0.09467, + "grad_norm": 0.612265626337836, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 9467 + }, + { + "epoch": 0.09468, + "grad_norm": 0.6579286983017547, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 9468 + }, + { + "epoch": 0.09469, + "grad_norm": 0.5830435072856022, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 9469 + }, + { + "epoch": 0.0947, + "grad_norm": 0.5258507863729508, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 9470 + }, + { + "epoch": 0.09471, + "grad_norm": 0.46919009653098687, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 9471 + }, + { + "epoch": 0.09472, + "grad_norm": 0.5215402592850111, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 9472 + }, + { + "epoch": 0.09473, + "grad_norm": 0.6491591524568011, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 9473 + }, + { + "epoch": 0.09474, + "grad_norm": 0.7868164547787738, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 9474 + }, + { + "epoch": 0.09475, + "grad_norm": 0.9666742348116686, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 9475 + }, + { + "epoch": 0.09476, + "grad_norm": 1.0668974359449654, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 9476 + }, + { + "epoch": 0.09477, + "grad_norm": 0.8895495837200137, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 9477 + }, + { + "epoch": 0.09478, + "grad_norm": 0.7822408243265004, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 9478 + }, + { + "epoch": 0.09479, + "grad_norm": 0.7362350598274618, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 9479 + }, + { + "epoch": 0.0948, + "grad_norm": 0.7635551325432101, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 9480 + }, + { + "epoch": 0.09481, + "grad_norm": 0.8359593042143941, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 9481 + }, + { + "epoch": 0.09482, + "grad_norm": 0.8458302540004683, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 9482 + }, + { + "epoch": 0.09483, + "grad_norm": 0.8340671653924622, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 9483 + }, + { + "epoch": 0.09484, + "grad_norm": 0.7992408585225695, + "learning_rate": 0.003, + "loss": 4.108, + "step": 9484 + }, + { + "epoch": 0.09485, + "grad_norm": 0.7964148073286885, + "learning_rate": 0.003, + "loss": 4.089, + "step": 9485 + }, + { + "epoch": 0.09486, + "grad_norm": 0.8771624728266194, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 9486 + }, + { + "epoch": 0.09487, + "grad_norm": 0.9914369833754603, + "learning_rate": 0.003, + "loss": 4.099, + "step": 9487 + }, + { + "epoch": 0.09488, + "grad_norm": 0.92799842755361, + "learning_rate": 0.003, + "loss": 4.1431, + "step": 9488 + }, + { + "epoch": 0.09489, + "grad_norm": 0.7239696176957987, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 9489 + }, + { + "epoch": 0.0949, + "grad_norm": 0.657621056297735, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 9490 + }, + { + "epoch": 0.09491, + "grad_norm": 0.5920365056277531, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 9491 + }, + { + "epoch": 0.09492, + "grad_norm": 0.6824599764976818, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 9492 + }, + { + "epoch": 0.09493, + "grad_norm": 0.8856580120578575, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 9493 + }, + { + "epoch": 0.09494, + "grad_norm": 0.9809438136763652, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 9494 + }, + { + "epoch": 0.09495, + "grad_norm": 1.0818725077217133, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 9495 + }, + { + "epoch": 0.09496, + "grad_norm": 0.8173342329103473, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 9496 + }, + { + "epoch": 0.09497, + "grad_norm": 0.6206180769695743, + "learning_rate": 0.003, + "loss": 4.089, + "step": 9497 + }, + { + "epoch": 0.09498, + "grad_norm": 0.5589853203174866, + "learning_rate": 0.003, + "loss": 4.067, + "step": 9498 + }, + { + "epoch": 0.09499, + "grad_norm": 0.5738351051872796, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 9499 + }, + { + "epoch": 0.095, + "grad_norm": 0.6306862899639631, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 9500 + }, + { + "epoch": 0.09501, + "grad_norm": 0.6733724428518848, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 9501 + }, + { + "epoch": 0.09502, + "grad_norm": 0.8461241073955719, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9502 + }, + { + "epoch": 0.09503, + "grad_norm": 1.0158067722960233, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 9503 + }, + { + "epoch": 0.09504, + "grad_norm": 1.1190865130171455, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 9504 + }, + { + "epoch": 0.09505, + "grad_norm": 0.7932088206811884, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 9505 + }, + { + "epoch": 0.09506, + "grad_norm": 0.7129006027867597, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 9506 + }, + { + "epoch": 0.09507, + "grad_norm": 0.6024715566098497, + "learning_rate": 0.003, + "loss": 4.1149, + "step": 9507 + }, + { + "epoch": 0.09508, + "grad_norm": 0.6057657514430907, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 9508 + }, + { + "epoch": 0.09509, + "grad_norm": 0.6243112506876197, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 9509 + }, + { + "epoch": 0.0951, + "grad_norm": 0.6744699873109112, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9510 + }, + { + "epoch": 0.09511, + "grad_norm": 0.5698773213648972, + "learning_rate": 0.003, + "loss": 4.077, + "step": 9511 + }, + { + "epoch": 0.09512, + "grad_norm": 0.5223468938440529, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9512 + }, + { + "epoch": 0.09513, + "grad_norm": 0.5295241801515409, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 9513 + }, + { + "epoch": 0.09514, + "grad_norm": 0.6195032274853618, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 9514 + }, + { + "epoch": 0.09515, + "grad_norm": 0.7923024129571904, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 9515 + }, + { + "epoch": 0.09516, + "grad_norm": 1.0748459000652908, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 9516 + }, + { + "epoch": 0.09517, + "grad_norm": 0.9652150547579573, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 9517 + }, + { + "epoch": 0.09518, + "grad_norm": 0.9317902587041311, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9518 + }, + { + "epoch": 0.09519, + "grad_norm": 1.0011920568954655, + "learning_rate": 0.003, + "loss": 4.1236, + "step": 9519 + }, + { + "epoch": 0.0952, + "grad_norm": 0.8162413257986282, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 9520 + }, + { + "epoch": 0.09521, + "grad_norm": 0.8511270427905199, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 9521 + }, + { + "epoch": 0.09522, + "grad_norm": 0.869150690242155, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 9522 + }, + { + "epoch": 0.09523, + "grad_norm": 0.7851073631966403, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 9523 + }, + { + "epoch": 0.09524, + "grad_norm": 0.7805600991791303, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 9524 + }, + { + "epoch": 0.09525, + "grad_norm": 0.6704373220450912, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 9525 + }, + { + "epoch": 0.09526, + "grad_norm": 0.6560833308141075, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 9526 + }, + { + "epoch": 0.09527, + "grad_norm": 0.6885732713810039, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 9527 + }, + { + "epoch": 0.09528, + "grad_norm": 0.9403396043081385, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 9528 + }, + { + "epoch": 0.09529, + "grad_norm": 1.00556887324088, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 9529 + }, + { + "epoch": 0.0953, + "grad_norm": 0.888729666716305, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9530 + }, + { + "epoch": 0.09531, + "grad_norm": 0.9418145584346838, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 9531 + }, + { + "epoch": 0.09532, + "grad_norm": 0.9872093380526542, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 9532 + }, + { + "epoch": 0.09533, + "grad_norm": 1.0096517466771673, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 9533 + }, + { + "epoch": 0.09534, + "grad_norm": 0.8896105143964483, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 9534 + }, + { + "epoch": 0.09535, + "grad_norm": 0.7675771159928804, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 9535 + }, + { + "epoch": 0.09536, + "grad_norm": 0.7682126422834153, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 9536 + }, + { + "epoch": 0.09537, + "grad_norm": 0.8056030277957967, + "learning_rate": 0.003, + "loss": 4.1242, + "step": 9537 + }, + { + "epoch": 0.09538, + "grad_norm": 0.7162391293635292, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 9538 + }, + { + "epoch": 0.09539, + "grad_norm": 0.6683543592127842, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 9539 + }, + { + "epoch": 0.0954, + "grad_norm": 0.6001385230297466, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 9540 + }, + { + "epoch": 0.09541, + "grad_norm": 0.5520189491403364, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 9541 + }, + { + "epoch": 0.09542, + "grad_norm": 0.5771983745529633, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 9542 + }, + { + "epoch": 0.09543, + "grad_norm": 0.6070661140119326, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 9543 + }, + { + "epoch": 0.09544, + "grad_norm": 0.7384419185478289, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 9544 + }, + { + "epoch": 0.09545, + "grad_norm": 0.9657170883830423, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9545 + }, + { + "epoch": 0.09546, + "grad_norm": 1.2681324157872516, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 9546 + }, + { + "epoch": 0.09547, + "grad_norm": 0.62049585194807, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9547 + }, + { + "epoch": 0.09548, + "grad_norm": 0.795505396852106, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 9548 + }, + { + "epoch": 0.09549, + "grad_norm": 0.9041023642688241, + "learning_rate": 0.003, + "loss": 4.129, + "step": 9549 + }, + { + "epoch": 0.0955, + "grad_norm": 0.7920466015961556, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 9550 + }, + { + "epoch": 0.09551, + "grad_norm": 0.7664489290961656, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 9551 + }, + { + "epoch": 0.09552, + "grad_norm": 0.7297971620268919, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 9552 + }, + { + "epoch": 0.09553, + "grad_norm": 0.7231956245433028, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 9553 + }, + { + "epoch": 0.09554, + "grad_norm": 0.7096741017462413, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 9554 + }, + { + "epoch": 0.09555, + "grad_norm": 0.7188180696481433, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 9555 + }, + { + "epoch": 0.09556, + "grad_norm": 0.700959357888011, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 9556 + }, + { + "epoch": 0.09557, + "grad_norm": 0.6879426371558562, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 9557 + }, + { + "epoch": 0.09558, + "grad_norm": 0.6597031957541576, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 9558 + }, + { + "epoch": 0.09559, + "grad_norm": 0.6739137082114623, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 9559 + }, + { + "epoch": 0.0956, + "grad_norm": 0.6964119348231613, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 9560 + }, + { + "epoch": 0.09561, + "grad_norm": 0.7031263209660419, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 9561 + }, + { + "epoch": 0.09562, + "grad_norm": 0.9420485552641903, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 9562 + }, + { + "epoch": 0.09563, + "grad_norm": 6.130969416922354, + "learning_rate": 0.003, + "loss": 4.293, + "step": 9563 + }, + { + "epoch": 0.09564, + "grad_norm": 1.309865365129671, + "learning_rate": 0.003, + "loss": 4.1751, + "step": 9564 + }, + { + "epoch": 0.09565, + "grad_norm": 1.0521711710301151, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 9565 + }, + { + "epoch": 0.09566, + "grad_norm": 0.9997826713731996, + "learning_rate": 0.003, + "loss": 4.1481, + "step": 9566 + }, + { + "epoch": 0.09567, + "grad_norm": 1.3487618062811157, + "learning_rate": 0.003, + "loss": 4.1547, + "step": 9567 + }, + { + "epoch": 0.09568, + "grad_norm": 0.9741842634354936, + "learning_rate": 0.003, + "loss": 4.1462, + "step": 9568 + }, + { + "epoch": 0.09569, + "grad_norm": 1.0654371346008946, + "learning_rate": 0.003, + "loss": 4.1583, + "step": 9569 + }, + { + "epoch": 0.0957, + "grad_norm": 0.8712382456617402, + "learning_rate": 0.003, + "loss": 4.15, + "step": 9570 + }, + { + "epoch": 0.09571, + "grad_norm": 0.8849374832653227, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 9571 + }, + { + "epoch": 0.09572, + "grad_norm": 1.0577800121636243, + "learning_rate": 0.003, + "loss": 4.1463, + "step": 9572 + }, + { + "epoch": 0.09573, + "grad_norm": 1.206598624557028, + "learning_rate": 0.003, + "loss": 4.1599, + "step": 9573 + }, + { + "epoch": 0.09574, + "grad_norm": 1.2512634504121267, + "learning_rate": 0.003, + "loss": 4.1668, + "step": 9574 + }, + { + "epoch": 0.09575, + "grad_norm": 0.9501656930930779, + "learning_rate": 0.003, + "loss": 4.1379, + "step": 9575 + }, + { + "epoch": 0.09576, + "grad_norm": 1.1728879533932677, + "learning_rate": 0.003, + "loss": 4.1477, + "step": 9576 + }, + { + "epoch": 0.09577, + "grad_norm": 2.0768187878146875, + "learning_rate": 0.003, + "loss": 4.184, + "step": 9577 + }, + { + "epoch": 0.09578, + "grad_norm": 1.105305608187662, + "learning_rate": 0.003, + "loss": 4.1486, + "step": 9578 + }, + { + "epoch": 0.09579, + "grad_norm": 0.9689648072779388, + "learning_rate": 0.003, + "loss": 4.1449, + "step": 9579 + }, + { + "epoch": 0.0958, + "grad_norm": 0.9360186586846221, + "learning_rate": 0.003, + "loss": 4.1711, + "step": 9580 + }, + { + "epoch": 0.09581, + "grad_norm": 1.0800383998211391, + "learning_rate": 0.003, + "loss": 4.2082, + "step": 9581 + }, + { + "epoch": 0.09582, + "grad_norm": 1.0704867786853698, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 9582 + }, + { + "epoch": 0.09583, + "grad_norm": 1.142877500779312, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 9583 + }, + { + "epoch": 0.09584, + "grad_norm": 1.3332645879072875, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 9584 + }, + { + "epoch": 0.09585, + "grad_norm": 1.5324381756718481, + "learning_rate": 0.003, + "loss": 4.1664, + "step": 9585 + }, + { + "epoch": 0.09586, + "grad_norm": 0.8704649822421158, + "learning_rate": 0.003, + "loss": 4.1747, + "step": 9586 + }, + { + "epoch": 0.09587, + "grad_norm": 0.9688340079251864, + "learning_rate": 0.003, + "loss": 4.126, + "step": 9587 + }, + { + "epoch": 0.09588, + "grad_norm": 0.8342771328110887, + "learning_rate": 0.003, + "loss": 4.1531, + "step": 9588 + }, + { + "epoch": 0.09589, + "grad_norm": 0.7942887994349634, + "learning_rate": 0.003, + "loss": 4.153, + "step": 9589 + }, + { + "epoch": 0.0959, + "grad_norm": 0.8173041322090617, + "learning_rate": 0.003, + "loss": 4.1609, + "step": 9590 + }, + { + "epoch": 0.09591, + "grad_norm": 0.7677670828705769, + "learning_rate": 0.003, + "loss": 4.1874, + "step": 9591 + }, + { + "epoch": 0.09592, + "grad_norm": 0.7200915670319258, + "learning_rate": 0.003, + "loss": 4.1473, + "step": 9592 + }, + { + "epoch": 0.09593, + "grad_norm": 0.7972359349347333, + "learning_rate": 0.003, + "loss": 4.1567, + "step": 9593 + }, + { + "epoch": 0.09594, + "grad_norm": 0.8556890271092606, + "learning_rate": 0.003, + "loss": 4.1202, + "step": 9594 + }, + { + "epoch": 0.09595, + "grad_norm": 0.8915728396380502, + "learning_rate": 0.003, + "loss": 4.1244, + "step": 9595 + }, + { + "epoch": 0.09596, + "grad_norm": 0.9347507452393141, + "learning_rate": 0.003, + "loss": 4.1309, + "step": 9596 + }, + { + "epoch": 0.09597, + "grad_norm": 1.0024508636239098, + "learning_rate": 0.003, + "loss": 4.1322, + "step": 9597 + }, + { + "epoch": 0.09598, + "grad_norm": 1.2169763273515621, + "learning_rate": 0.003, + "loss": 4.1545, + "step": 9598 + }, + { + "epoch": 0.09599, + "grad_norm": 0.7115963030817509, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 9599 + }, + { + "epoch": 0.096, + "grad_norm": 0.6328045403895035, + "learning_rate": 0.003, + "loss": 4.1151, + "step": 9600 + }, + { + "epoch": 0.09601, + "grad_norm": 0.5998687509571087, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 9601 + }, + { + "epoch": 0.09602, + "grad_norm": 0.5466887961482548, + "learning_rate": 0.003, + "loss": 4.1438, + "step": 9602 + }, + { + "epoch": 0.09603, + "grad_norm": 0.7860448470509064, + "learning_rate": 0.003, + "loss": 4.1215, + "step": 9603 + }, + { + "epoch": 0.09604, + "grad_norm": 0.8745624847605153, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 9604 + }, + { + "epoch": 0.09605, + "grad_norm": 0.8688163043711737, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 9605 + }, + { + "epoch": 0.09606, + "grad_norm": 0.7963417759657033, + "learning_rate": 0.003, + "loss": 4.1231, + "step": 9606 + }, + { + "epoch": 0.09607, + "grad_norm": 0.7369028440038995, + "learning_rate": 0.003, + "loss": 4.1318, + "step": 9607 + }, + { + "epoch": 0.09608, + "grad_norm": 0.8647736584876029, + "learning_rate": 0.003, + "loss": 4.1427, + "step": 9608 + }, + { + "epoch": 0.09609, + "grad_norm": 1.0027577187055126, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 9609 + }, + { + "epoch": 0.0961, + "grad_norm": 0.8626178763464114, + "learning_rate": 0.003, + "loss": 4.1176, + "step": 9610 + }, + { + "epoch": 0.09611, + "grad_norm": 0.5910337185914825, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 9611 + }, + { + "epoch": 0.09612, + "grad_norm": 0.49438202417856203, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 9612 + }, + { + "epoch": 0.09613, + "grad_norm": 0.484763634619797, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 9613 + }, + { + "epoch": 0.09614, + "grad_norm": 0.44675304497501767, + "learning_rate": 0.003, + "loss": 4.103, + "step": 9614 + }, + { + "epoch": 0.09615, + "grad_norm": 0.3830623042239131, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 9615 + }, + { + "epoch": 0.09616, + "grad_norm": 0.40447953537590603, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 9616 + }, + { + "epoch": 0.09617, + "grad_norm": 0.42749479774334626, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 9617 + }, + { + "epoch": 0.09618, + "grad_norm": 0.3807451005616551, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 9618 + }, + { + "epoch": 0.09619, + "grad_norm": 0.3715423418211784, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 9619 + }, + { + "epoch": 0.0962, + "grad_norm": 0.3425603111991135, + "learning_rate": 0.003, + "loss": 4.074, + "step": 9620 + }, + { + "epoch": 0.09621, + "grad_norm": 0.3507562163489838, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 9621 + }, + { + "epoch": 0.09622, + "grad_norm": 0.29748679852114185, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 9622 + }, + { + "epoch": 0.09623, + "grad_norm": 0.2839394173155373, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 9623 + }, + { + "epoch": 0.09624, + "grad_norm": 0.2794661039316224, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 9624 + }, + { + "epoch": 0.09625, + "grad_norm": 0.29629458143975185, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 9625 + }, + { + "epoch": 0.09626, + "grad_norm": 0.3292412126393909, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 9626 + }, + { + "epoch": 0.09627, + "grad_norm": 0.4497439282959039, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 9627 + }, + { + "epoch": 0.09628, + "grad_norm": 0.6526850258400877, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 9628 + }, + { + "epoch": 0.09629, + "grad_norm": 0.9634340689907828, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 9629 + }, + { + "epoch": 0.0963, + "grad_norm": 1.1516478385031867, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 9630 + }, + { + "epoch": 0.09631, + "grad_norm": 0.6401093793721576, + "learning_rate": 0.003, + "loss": 4.1197, + "step": 9631 + }, + { + "epoch": 0.09632, + "grad_norm": 0.7636314345691994, + "learning_rate": 0.003, + "loss": 4.059, + "step": 9632 + }, + { + "epoch": 0.09633, + "grad_norm": 0.992571157461324, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 9633 + }, + { + "epoch": 0.09634, + "grad_norm": 0.7463855736399202, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 9634 + }, + { + "epoch": 0.09635, + "grad_norm": 0.6909299649125988, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 9635 + }, + { + "epoch": 0.09636, + "grad_norm": 0.7431003319653801, + "learning_rate": 0.003, + "loss": 4.109, + "step": 9636 + }, + { + "epoch": 0.09637, + "grad_norm": 0.6455512496885061, + "learning_rate": 0.003, + "loss": 4.075, + "step": 9637 + }, + { + "epoch": 0.09638, + "grad_norm": 0.5445783213282376, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 9638 + }, + { + "epoch": 0.09639, + "grad_norm": 0.5613727528028151, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 9639 + }, + { + "epoch": 0.0964, + "grad_norm": 0.7244593336080651, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 9640 + }, + { + "epoch": 0.09641, + "grad_norm": 0.9030660564578993, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 9641 + }, + { + "epoch": 0.09642, + "grad_norm": 0.9887019736303208, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 9642 + }, + { + "epoch": 0.09643, + "grad_norm": 0.8838383160348036, + "learning_rate": 0.003, + "loss": 4.092, + "step": 9643 + }, + { + "epoch": 0.09644, + "grad_norm": 0.7515245578827098, + "learning_rate": 0.003, + "loss": 4.1104, + "step": 9644 + }, + { + "epoch": 0.09645, + "grad_norm": 0.6649872915216315, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 9645 + }, + { + "epoch": 0.09646, + "grad_norm": 0.7116890218147718, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 9646 + }, + { + "epoch": 0.09647, + "grad_norm": 0.7019694940498846, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 9647 + }, + { + "epoch": 0.09648, + "grad_norm": 0.6596706388975605, + "learning_rate": 0.003, + "loss": 4.1295, + "step": 9648 + }, + { + "epoch": 0.09649, + "grad_norm": 0.667100870547891, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 9649 + }, + { + "epoch": 0.0965, + "grad_norm": 0.6563616277238821, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 9650 + }, + { + "epoch": 0.09651, + "grad_norm": 0.6207032309420466, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 9651 + }, + { + "epoch": 0.09652, + "grad_norm": 0.6164765321685511, + "learning_rate": 0.003, + "loss": 4.08, + "step": 9652 + }, + { + "epoch": 0.09653, + "grad_norm": 0.6425858801124605, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 9653 + }, + { + "epoch": 0.09654, + "grad_norm": 0.6719434091175973, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 9654 + }, + { + "epoch": 0.09655, + "grad_norm": 0.6816595879015243, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 9655 + }, + { + "epoch": 0.09656, + "grad_norm": 0.75276222897346, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 9656 + }, + { + "epoch": 0.09657, + "grad_norm": 0.793810434998781, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 9657 + }, + { + "epoch": 0.09658, + "grad_norm": 0.8203985544035876, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 9658 + }, + { + "epoch": 0.09659, + "grad_norm": 0.7228109813703856, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 9659 + }, + { + "epoch": 0.0966, + "grad_norm": 0.6284036142391546, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 9660 + }, + { + "epoch": 0.09661, + "grad_norm": 0.5390823749375145, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 9661 + }, + { + "epoch": 0.09662, + "grad_norm": 0.4937468666738386, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 9662 + }, + { + "epoch": 0.09663, + "grad_norm": 0.5267363875644104, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 9663 + }, + { + "epoch": 0.09664, + "grad_norm": 0.5614869173293953, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9664 + }, + { + "epoch": 0.09665, + "grad_norm": 0.6434988568327966, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 9665 + }, + { + "epoch": 0.09666, + "grad_norm": 0.6863099786692454, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9666 + }, + { + "epoch": 0.09667, + "grad_norm": 0.6974192050812339, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 9667 + }, + { + "epoch": 0.09668, + "grad_norm": 0.7130895485506055, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 9668 + }, + { + "epoch": 0.09669, + "grad_norm": 0.7314830324385385, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 9669 + }, + { + "epoch": 0.0967, + "grad_norm": 0.7358599798360544, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9670 + }, + { + "epoch": 0.09671, + "grad_norm": 0.6821836408473856, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 9671 + }, + { + "epoch": 0.09672, + "grad_norm": 0.6652927918131237, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 9672 + }, + { + "epoch": 0.09673, + "grad_norm": 0.7634068597168817, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 9673 + }, + { + "epoch": 0.09674, + "grad_norm": 0.7974126591485172, + "learning_rate": 0.003, + "loss": 4.1387, + "step": 9674 + }, + { + "epoch": 0.09675, + "grad_norm": 0.8544438788166088, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 9675 + }, + { + "epoch": 0.09676, + "grad_norm": 0.7941264785151647, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 9676 + }, + { + "epoch": 0.09677, + "grad_norm": 0.7203225810853267, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 9677 + }, + { + "epoch": 0.09678, + "grad_norm": 0.5740375147446136, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 9678 + }, + { + "epoch": 0.09679, + "grad_norm": 0.60439656922455, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 9679 + }, + { + "epoch": 0.0968, + "grad_norm": 0.7001578420854571, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 9680 + }, + { + "epoch": 0.09681, + "grad_norm": 0.8530019659681068, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 9681 + }, + { + "epoch": 0.09682, + "grad_norm": 1.0076727467524107, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9682 + }, + { + "epoch": 0.09683, + "grad_norm": 1.1399405747494864, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 9683 + }, + { + "epoch": 0.09684, + "grad_norm": 0.6866617378578579, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 9684 + }, + { + "epoch": 0.09685, + "grad_norm": 0.6034395427034922, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 9685 + }, + { + "epoch": 0.09686, + "grad_norm": 0.6858126543232959, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 9686 + }, + { + "epoch": 0.09687, + "grad_norm": 0.7606273901161237, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 9687 + }, + { + "epoch": 0.09688, + "grad_norm": 0.7823206051561976, + "learning_rate": 0.003, + "loss": 4.1287, + "step": 9688 + }, + { + "epoch": 0.09689, + "grad_norm": 0.7275273786792908, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 9689 + }, + { + "epoch": 0.0969, + "grad_norm": 0.7020173666361166, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 9690 + }, + { + "epoch": 0.09691, + "grad_norm": 0.668700670688743, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 9691 + }, + { + "epoch": 0.09692, + "grad_norm": 0.7183283334951155, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 9692 + }, + { + "epoch": 0.09693, + "grad_norm": 0.8228261565164465, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 9693 + }, + { + "epoch": 0.09694, + "grad_norm": 0.9398860669380985, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 9694 + }, + { + "epoch": 0.09695, + "grad_norm": 0.9679499359904971, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 9695 + }, + { + "epoch": 0.09696, + "grad_norm": 0.9831277733673565, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 9696 + }, + { + "epoch": 0.09697, + "grad_norm": 0.8922294143645066, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 9697 + }, + { + "epoch": 0.09698, + "grad_norm": 0.8622941183892853, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 9698 + }, + { + "epoch": 0.09699, + "grad_norm": 0.849954807407499, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 9699 + }, + { + "epoch": 0.097, + "grad_norm": 0.8595604678823553, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 9700 + }, + { + "epoch": 0.09701, + "grad_norm": 0.7378223203944358, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 9701 + }, + { + "epoch": 0.09702, + "grad_norm": 0.6131607408869036, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 9702 + }, + { + "epoch": 0.09703, + "grad_norm": 0.5814032117502681, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 9703 + }, + { + "epoch": 0.09704, + "grad_norm": 0.6207822778251723, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 9704 + }, + { + "epoch": 0.09705, + "grad_norm": 0.5424790257052854, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 9705 + }, + { + "epoch": 0.09706, + "grad_norm": 0.5191042976935801, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 9706 + }, + { + "epoch": 0.09707, + "grad_norm": 0.4825162871461943, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 9707 + }, + { + "epoch": 0.09708, + "grad_norm": 0.5254613203667191, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 9708 + }, + { + "epoch": 0.09709, + "grad_norm": 0.5837214908960893, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 9709 + }, + { + "epoch": 0.0971, + "grad_norm": 0.5919660600745414, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 9710 + }, + { + "epoch": 0.09711, + "grad_norm": 0.5985906886004138, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 9711 + }, + { + "epoch": 0.09712, + "grad_norm": 0.6218596201649731, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 9712 + }, + { + "epoch": 0.09713, + "grad_norm": 0.6609701796902929, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 9713 + }, + { + "epoch": 0.09714, + "grad_norm": 0.5993265445219289, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 9714 + }, + { + "epoch": 0.09715, + "grad_norm": 0.5595560077596979, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 9715 + }, + { + "epoch": 0.09716, + "grad_norm": 0.5458771794387989, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 9716 + }, + { + "epoch": 0.09717, + "grad_norm": 0.5749644308670316, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 9717 + }, + { + "epoch": 0.09718, + "grad_norm": 0.7136367880167833, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 9718 + }, + { + "epoch": 0.09719, + "grad_norm": 0.8710577217664088, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 9719 + }, + { + "epoch": 0.0972, + "grad_norm": 1.1290953541399391, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 9720 + }, + { + "epoch": 0.09721, + "grad_norm": 0.9640405697464676, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 9721 + }, + { + "epoch": 0.09722, + "grad_norm": 0.947033121500074, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 9722 + }, + { + "epoch": 0.09723, + "grad_norm": 0.864058303867878, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 9723 + }, + { + "epoch": 0.09724, + "grad_norm": 0.7910749200085015, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 9724 + }, + { + "epoch": 0.09725, + "grad_norm": 0.6706273678022362, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 9725 + }, + { + "epoch": 0.09726, + "grad_norm": 0.6952803945114413, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 9726 + }, + { + "epoch": 0.09727, + "grad_norm": 0.7730372856532575, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 9727 + }, + { + "epoch": 0.09728, + "grad_norm": 0.6776321846761416, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 9728 + }, + { + "epoch": 0.09729, + "grad_norm": 0.656555599450724, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 9729 + }, + { + "epoch": 0.0973, + "grad_norm": 0.7397413425845409, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 9730 + }, + { + "epoch": 0.09731, + "grad_norm": 0.741668722027564, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 9731 + }, + { + "epoch": 0.09732, + "grad_norm": 0.9192765996179019, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 9732 + }, + { + "epoch": 0.09733, + "grad_norm": 0.9726160313328494, + "learning_rate": 0.003, + "loss": 4.095, + "step": 9733 + }, + { + "epoch": 0.09734, + "grad_norm": 0.8089628303200062, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 9734 + }, + { + "epoch": 0.09735, + "grad_norm": 0.9196854053701035, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 9735 + }, + { + "epoch": 0.09736, + "grad_norm": 1.0355781723270967, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9736 + }, + { + "epoch": 0.09737, + "grad_norm": 1.390013473224088, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 9737 + }, + { + "epoch": 0.09738, + "grad_norm": 0.6677058435549194, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 9738 + }, + { + "epoch": 0.09739, + "grad_norm": 0.7149231696043877, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 9739 + }, + { + "epoch": 0.0974, + "grad_norm": 0.7596898377209321, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 9740 + }, + { + "epoch": 0.09741, + "grad_norm": 0.9445678693533357, + "learning_rate": 0.003, + "loss": 4.1459, + "step": 9741 + }, + { + "epoch": 0.09742, + "grad_norm": 1.144761757205563, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 9742 + }, + { + "epoch": 0.09743, + "grad_norm": 0.7798904071068536, + "learning_rate": 0.003, + "loss": 4.11, + "step": 9743 + }, + { + "epoch": 0.09744, + "grad_norm": 0.7126950822384531, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 9744 + }, + { + "epoch": 0.09745, + "grad_norm": 0.804932675736049, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 9745 + }, + { + "epoch": 0.09746, + "grad_norm": 0.8275041917630538, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 9746 + }, + { + "epoch": 0.09747, + "grad_norm": 0.8654165216744595, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 9747 + }, + { + "epoch": 0.09748, + "grad_norm": 0.8483082244535185, + "learning_rate": 0.003, + "loss": 4.1333, + "step": 9748 + }, + { + "epoch": 0.09749, + "grad_norm": 0.9655796883096281, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 9749 + }, + { + "epoch": 0.0975, + "grad_norm": 0.8375150794577596, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 9750 + }, + { + "epoch": 0.09751, + "grad_norm": 0.8357319865847818, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 9751 + }, + { + "epoch": 0.09752, + "grad_norm": 0.7805475045931051, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 9752 + }, + { + "epoch": 0.09753, + "grad_norm": 0.7014801049003869, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 9753 + }, + { + "epoch": 0.09754, + "grad_norm": 0.6856528434863608, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 9754 + }, + { + "epoch": 0.09755, + "grad_norm": 0.650848045634542, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 9755 + }, + { + "epoch": 0.09756, + "grad_norm": 0.5622036395323345, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 9756 + }, + { + "epoch": 0.09757, + "grad_norm": 0.5679917813675583, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 9757 + }, + { + "epoch": 0.09758, + "grad_norm": 0.5576435578489023, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 9758 + }, + { + "epoch": 0.09759, + "grad_norm": 0.6963321867164363, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 9759 + }, + { + "epoch": 0.0976, + "grad_norm": 0.7838079745448902, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 9760 + }, + { + "epoch": 0.09761, + "grad_norm": 0.9541884510786587, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 9761 + }, + { + "epoch": 0.09762, + "grad_norm": 1.121153313260097, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 9762 + }, + { + "epoch": 0.09763, + "grad_norm": 0.6664939878081045, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 9763 + }, + { + "epoch": 0.09764, + "grad_norm": 0.6067299530309102, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 9764 + }, + { + "epoch": 0.09765, + "grad_norm": 0.6916937760369664, + "learning_rate": 0.003, + "loss": 4.1132, + "step": 9765 + }, + { + "epoch": 0.09766, + "grad_norm": 0.7187195565497518, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 9766 + }, + { + "epoch": 0.09767, + "grad_norm": 0.7297554741322773, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9767 + }, + { + "epoch": 0.09768, + "grad_norm": 0.7197047154946628, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 9768 + }, + { + "epoch": 0.09769, + "grad_norm": 0.77558338050166, + "learning_rate": 0.003, + "loss": 4.107, + "step": 9769 + }, + { + "epoch": 0.0977, + "grad_norm": 0.776706503347036, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 9770 + }, + { + "epoch": 0.09771, + "grad_norm": 0.6629745622875617, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 9771 + }, + { + "epoch": 0.09772, + "grad_norm": 0.599070691762346, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 9772 + }, + { + "epoch": 0.09773, + "grad_norm": 0.6430974616537984, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 9773 + }, + { + "epoch": 0.09774, + "grad_norm": 0.65829695633147, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 9774 + }, + { + "epoch": 0.09775, + "grad_norm": 0.6436886424967752, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 9775 + }, + { + "epoch": 0.09776, + "grad_norm": 0.7272224412585245, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 9776 + }, + { + "epoch": 0.09777, + "grad_norm": 0.7810004465966248, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 9777 + }, + { + "epoch": 0.09778, + "grad_norm": 0.843156271360344, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 9778 + }, + { + "epoch": 0.09779, + "grad_norm": 0.8542309059649245, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 9779 + }, + { + "epoch": 0.0978, + "grad_norm": 0.8284073938545176, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 9780 + }, + { + "epoch": 0.09781, + "grad_norm": 0.7656977589639479, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 9781 + }, + { + "epoch": 0.09782, + "grad_norm": 0.642867591138632, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 9782 + }, + { + "epoch": 0.09783, + "grad_norm": 0.6315054401796648, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 9783 + }, + { + "epoch": 0.09784, + "grad_norm": 0.6506631017065446, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 9784 + }, + { + "epoch": 0.09785, + "grad_norm": 0.6870713649494674, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 9785 + }, + { + "epoch": 0.09786, + "grad_norm": 0.7447419206648687, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 9786 + }, + { + "epoch": 0.09787, + "grad_norm": 0.8992521452563863, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 9787 + }, + { + "epoch": 0.09788, + "grad_norm": 0.8733619920631194, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 9788 + }, + { + "epoch": 0.09789, + "grad_norm": 0.829358760588995, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 9789 + }, + { + "epoch": 0.0979, + "grad_norm": 0.8166018992244876, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 9790 + }, + { + "epoch": 0.09791, + "grad_norm": 0.8700145284441482, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 9791 + }, + { + "epoch": 0.09792, + "grad_norm": 0.9672422465296193, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9792 + }, + { + "epoch": 0.09793, + "grad_norm": 0.8861730905617189, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 9793 + }, + { + "epoch": 0.09794, + "grad_norm": 0.8248882883309061, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 9794 + }, + { + "epoch": 0.09795, + "grad_norm": 0.7088842143033923, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 9795 + }, + { + "epoch": 0.09796, + "grad_norm": 0.7130954093466139, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 9796 + }, + { + "epoch": 0.09797, + "grad_norm": 0.7288682239466572, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 9797 + }, + { + "epoch": 0.09798, + "grad_norm": 0.7976440093808327, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 9798 + }, + { + "epoch": 0.09799, + "grad_norm": 0.9649496398862366, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 9799 + }, + { + "epoch": 0.098, + "grad_norm": 1.1331720133396581, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 9800 + }, + { + "epoch": 0.09801, + "grad_norm": 0.7034672135712403, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 9801 + }, + { + "epoch": 0.09802, + "grad_norm": 0.6773839521714846, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 9802 + }, + { + "epoch": 0.09803, + "grad_norm": 0.7355150714696657, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 9803 + }, + { + "epoch": 0.09804, + "grad_norm": 0.7174536168973166, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 9804 + }, + { + "epoch": 0.09805, + "grad_norm": 0.6507571892133279, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 9805 + }, + { + "epoch": 0.09806, + "grad_norm": 0.64602677559084, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 9806 + }, + { + "epoch": 0.09807, + "grad_norm": 0.7072086988189831, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 9807 + }, + { + "epoch": 0.09808, + "grad_norm": 0.6694715352849688, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 9808 + }, + { + "epoch": 0.09809, + "grad_norm": 0.641647359947315, + "learning_rate": 0.003, + "loss": 4.077, + "step": 9809 + }, + { + "epoch": 0.0981, + "grad_norm": 0.6231401766200729, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 9810 + }, + { + "epoch": 0.09811, + "grad_norm": 0.6349700167139162, + "learning_rate": 0.003, + "loss": 4.104, + "step": 9811 + }, + { + "epoch": 0.09812, + "grad_norm": 0.6948447910573521, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 9812 + }, + { + "epoch": 0.09813, + "grad_norm": 0.8393980996176053, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 9813 + }, + { + "epoch": 0.09814, + "grad_norm": 0.9318454756189145, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 9814 + }, + { + "epoch": 0.09815, + "grad_norm": 0.9599372181289262, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 9815 + }, + { + "epoch": 0.09816, + "grad_norm": 0.8956701047579604, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 9816 + }, + { + "epoch": 0.09817, + "grad_norm": 0.7842843597137388, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 9817 + }, + { + "epoch": 0.09818, + "grad_norm": 0.6893223770225535, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 9818 + }, + { + "epoch": 0.09819, + "grad_norm": 0.5764089558777323, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 9819 + }, + { + "epoch": 0.0982, + "grad_norm": 0.6831187750693034, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 9820 + }, + { + "epoch": 0.09821, + "grad_norm": 0.7420126000497227, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 9821 + }, + { + "epoch": 0.09822, + "grad_norm": 0.7918367231819561, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 9822 + }, + { + "epoch": 0.09823, + "grad_norm": 0.8307403908166644, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9823 + }, + { + "epoch": 0.09824, + "grad_norm": 0.8384902251512314, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 9824 + }, + { + "epoch": 0.09825, + "grad_norm": 0.9488542492159057, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 9825 + }, + { + "epoch": 0.09826, + "grad_norm": 0.9814885356015456, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 9826 + }, + { + "epoch": 0.09827, + "grad_norm": 1.0404257496474245, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 9827 + }, + { + "epoch": 0.09828, + "grad_norm": 0.9374789182838859, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 9828 + }, + { + "epoch": 0.09829, + "grad_norm": 0.8352504231117999, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 9829 + }, + { + "epoch": 0.0983, + "grad_norm": 0.8712005573355397, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 9830 + }, + { + "epoch": 0.09831, + "grad_norm": 0.8221414372597472, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 9831 + }, + { + "epoch": 0.09832, + "grad_norm": 0.7097651197485733, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 9832 + }, + { + "epoch": 0.09833, + "grad_norm": 0.7171718585045, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 9833 + }, + { + "epoch": 0.09834, + "grad_norm": 0.7926851111043537, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 9834 + }, + { + "epoch": 0.09835, + "grad_norm": 0.8623151532060463, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 9835 + }, + { + "epoch": 0.09836, + "grad_norm": 0.8199248380589881, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 9836 + }, + { + "epoch": 0.09837, + "grad_norm": 0.7755743905810927, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 9837 + }, + { + "epoch": 0.09838, + "grad_norm": 0.7705706750252852, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 9838 + }, + { + "epoch": 0.09839, + "grad_norm": 0.845394429517052, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 9839 + }, + { + "epoch": 0.0984, + "grad_norm": 0.9405467332911781, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 9840 + }, + { + "epoch": 0.09841, + "grad_norm": 0.8405964634828268, + "learning_rate": 0.003, + "loss": 4.111, + "step": 9841 + }, + { + "epoch": 0.09842, + "grad_norm": 0.7811416765302842, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 9842 + }, + { + "epoch": 0.09843, + "grad_norm": 0.8180023616159704, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 9843 + }, + { + "epoch": 0.09844, + "grad_norm": 0.7527374804721993, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 9844 + }, + { + "epoch": 0.09845, + "grad_norm": 0.7530390141763947, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 9845 + }, + { + "epoch": 0.09846, + "grad_norm": 0.8899981921507195, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 9846 + }, + { + "epoch": 0.09847, + "grad_norm": 1.1111905360253278, + "learning_rate": 0.003, + "loss": 4.105, + "step": 9847 + }, + { + "epoch": 0.09848, + "grad_norm": 0.8799351792098972, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 9848 + }, + { + "epoch": 0.09849, + "grad_norm": 0.6714966433483521, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 9849 + }, + { + "epoch": 0.0985, + "grad_norm": 0.660208855694013, + "learning_rate": 0.003, + "loss": 4.112, + "step": 9850 + }, + { + "epoch": 0.09851, + "grad_norm": 0.7037816807855953, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 9851 + }, + { + "epoch": 0.09852, + "grad_norm": 0.7854724491123188, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 9852 + }, + { + "epoch": 0.09853, + "grad_norm": 0.8934963259953501, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 9853 + }, + { + "epoch": 0.09854, + "grad_norm": 0.8084018134106394, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 9854 + }, + { + "epoch": 0.09855, + "grad_norm": 0.795443981883104, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 9855 + }, + { + "epoch": 0.09856, + "grad_norm": 0.7281963252673843, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 9856 + }, + { + "epoch": 0.09857, + "grad_norm": 0.7182745458303548, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 9857 + }, + { + "epoch": 0.09858, + "grad_norm": 0.7050569720124196, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 9858 + }, + { + "epoch": 0.09859, + "grad_norm": 0.6807473261285026, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 9859 + }, + { + "epoch": 0.0986, + "grad_norm": 0.6746756830722065, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 9860 + }, + { + "epoch": 0.09861, + "grad_norm": 0.7007907511364769, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 9861 + }, + { + "epoch": 0.09862, + "grad_norm": 0.7927126135369934, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 9862 + }, + { + "epoch": 0.09863, + "grad_norm": 0.8386321721308743, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 9863 + }, + { + "epoch": 0.09864, + "grad_norm": 0.7507936235349171, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 9864 + }, + { + "epoch": 0.09865, + "grad_norm": 0.6481226992818734, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 9865 + }, + { + "epoch": 0.09866, + "grad_norm": 0.5697115094410177, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 9866 + }, + { + "epoch": 0.09867, + "grad_norm": 0.5126449464627183, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 9867 + }, + { + "epoch": 0.09868, + "grad_norm": 0.5102549762756471, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 9868 + }, + { + "epoch": 0.09869, + "grad_norm": 0.5085779141402881, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 9869 + }, + { + "epoch": 0.0987, + "grad_norm": 0.5428508446899835, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 9870 + }, + { + "epoch": 0.09871, + "grad_norm": 0.6546855703044885, + "learning_rate": 0.003, + "loss": 4.1347, + "step": 9871 + }, + { + "epoch": 0.09872, + "grad_norm": 0.6934739489178067, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 9872 + }, + { + "epoch": 0.09873, + "grad_norm": 0.6568023982508876, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 9873 + }, + { + "epoch": 0.09874, + "grad_norm": 0.7426087598644382, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 9874 + }, + { + "epoch": 0.09875, + "grad_norm": 0.9646739748869851, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 9875 + }, + { + "epoch": 0.09876, + "grad_norm": 1.1760726018799912, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 9876 + }, + { + "epoch": 0.09877, + "grad_norm": 1.032505678325432, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 9877 + }, + { + "epoch": 0.09878, + "grad_norm": 0.9520034852500903, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 9878 + }, + { + "epoch": 0.09879, + "grad_norm": 0.8304599751268602, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 9879 + }, + { + "epoch": 0.0988, + "grad_norm": 0.7973040789135167, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 9880 + }, + { + "epoch": 0.09881, + "grad_norm": 0.6749261652206336, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 9881 + }, + { + "epoch": 0.09882, + "grad_norm": 0.6720424310436138, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 9882 + }, + { + "epoch": 0.09883, + "grad_norm": 0.6552937594046303, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 9883 + }, + { + "epoch": 0.09884, + "grad_norm": 0.8385733301338101, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 9884 + }, + { + "epoch": 0.09885, + "grad_norm": 0.787180609022946, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 9885 + }, + { + "epoch": 0.09886, + "grad_norm": 0.7214374906120231, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 9886 + }, + { + "epoch": 0.09887, + "grad_norm": 0.813828996233088, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 9887 + }, + { + "epoch": 0.09888, + "grad_norm": 0.9395429521943715, + "learning_rate": 0.003, + "loss": 4.1274, + "step": 9888 + }, + { + "epoch": 0.09889, + "grad_norm": 0.9594036018439058, + "learning_rate": 0.003, + "loss": 4.082, + "step": 9889 + }, + { + "epoch": 0.0989, + "grad_norm": 1.0604047721378298, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 9890 + }, + { + "epoch": 0.09891, + "grad_norm": 0.9760792138626977, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 9891 + }, + { + "epoch": 0.09892, + "grad_norm": 0.8055680873064225, + "learning_rate": 0.003, + "loss": 4.067, + "step": 9892 + }, + { + "epoch": 0.09893, + "grad_norm": 0.7150553503473565, + "learning_rate": 0.003, + "loss": 4.094, + "step": 9893 + }, + { + "epoch": 0.09894, + "grad_norm": 0.5919754883801267, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 9894 + }, + { + "epoch": 0.09895, + "grad_norm": 0.6042264333876616, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 9895 + }, + { + "epoch": 0.09896, + "grad_norm": 0.5430950181914646, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 9896 + }, + { + "epoch": 0.09897, + "grad_norm": 0.7064510242910831, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 9897 + }, + { + "epoch": 0.09898, + "grad_norm": 0.8003106649351045, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 9898 + }, + { + "epoch": 0.09899, + "grad_norm": 0.8270519641387873, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 9899 + }, + { + "epoch": 0.099, + "grad_norm": 0.7540500751340824, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 9900 + }, + { + "epoch": 0.09901, + "grad_norm": 0.6671609878427571, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9901 + }, + { + "epoch": 0.09902, + "grad_norm": 0.7272528832102878, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 9902 + }, + { + "epoch": 0.09903, + "grad_norm": 0.749667895778945, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9903 + }, + { + "epoch": 0.09904, + "grad_norm": 0.6741213580842577, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 9904 + }, + { + "epoch": 0.09905, + "grad_norm": 0.6736961275412793, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 9905 + }, + { + "epoch": 0.09906, + "grad_norm": 0.6560958029133749, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9906 + }, + { + "epoch": 0.09907, + "grad_norm": 0.631422253019129, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 9907 + }, + { + "epoch": 0.09908, + "grad_norm": 0.5392615341333712, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 9908 + }, + { + "epoch": 0.09909, + "grad_norm": 0.5811455319662365, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 9909 + }, + { + "epoch": 0.0991, + "grad_norm": 0.6707319041356397, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 9910 + }, + { + "epoch": 0.09911, + "grad_norm": 0.771243416598569, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 9911 + }, + { + "epoch": 0.09912, + "grad_norm": 0.8496996867713535, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 9912 + }, + { + "epoch": 0.09913, + "grad_norm": 0.8902788189271055, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 9913 + }, + { + "epoch": 0.09914, + "grad_norm": 0.9357066721293495, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 9914 + }, + { + "epoch": 0.09915, + "grad_norm": 0.9755645470634543, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 9915 + }, + { + "epoch": 0.09916, + "grad_norm": 1.0089094696696197, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 9916 + }, + { + "epoch": 0.09917, + "grad_norm": 0.8817642996354145, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 9917 + }, + { + "epoch": 0.09918, + "grad_norm": 0.7658568526663542, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 9918 + }, + { + "epoch": 0.09919, + "grad_norm": 0.8128521592081074, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 9919 + }, + { + "epoch": 0.0992, + "grad_norm": 0.8609166031572244, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 9920 + }, + { + "epoch": 0.09921, + "grad_norm": 1.0125750989658047, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 9921 + }, + { + "epoch": 0.09922, + "grad_norm": 1.0032822006219375, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 9922 + }, + { + "epoch": 0.09923, + "grad_norm": 1.2307924439733817, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 9923 + }, + { + "epoch": 0.09924, + "grad_norm": 1.0090705722158624, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 9924 + }, + { + "epoch": 0.09925, + "grad_norm": 1.0212794524503392, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 9925 + }, + { + "epoch": 0.09926, + "grad_norm": 0.9853180213898505, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 9926 + }, + { + "epoch": 0.09927, + "grad_norm": 0.758327811881651, + "learning_rate": 0.003, + "loss": 4.1349, + "step": 9927 + }, + { + "epoch": 0.09928, + "grad_norm": 0.7659641009511424, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 9928 + }, + { + "epoch": 0.09929, + "grad_norm": 0.7504067878828342, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 9929 + }, + { + "epoch": 0.0993, + "grad_norm": 0.6947364493882249, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 9930 + }, + { + "epoch": 0.09931, + "grad_norm": 0.6761909114866678, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 9931 + }, + { + "epoch": 0.09932, + "grad_norm": 0.6135941560379718, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 9932 + }, + { + "epoch": 0.09933, + "grad_norm": 0.582085876114816, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9933 + }, + { + "epoch": 0.09934, + "grad_norm": 0.6180077412932585, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 9934 + }, + { + "epoch": 0.09935, + "grad_norm": 0.6156612388696766, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 9935 + }, + { + "epoch": 0.09936, + "grad_norm": 0.6398409744821787, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 9936 + }, + { + "epoch": 0.09937, + "grad_norm": 0.6544534848994807, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 9937 + }, + { + "epoch": 0.09938, + "grad_norm": 0.8269077552442352, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9938 + }, + { + "epoch": 0.09939, + "grad_norm": 0.9971115211192825, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 9939 + }, + { + "epoch": 0.0994, + "grad_norm": 0.9857415352546822, + "learning_rate": 0.003, + "loss": 4.1229, + "step": 9940 + }, + { + "epoch": 0.09941, + "grad_norm": 0.8137608150128142, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 9941 + }, + { + "epoch": 0.09942, + "grad_norm": 0.593147335054599, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 9942 + }, + { + "epoch": 0.09943, + "grad_norm": 0.5875666526054226, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 9943 + }, + { + "epoch": 0.09944, + "grad_norm": 0.5879211483291883, + "learning_rate": 0.003, + "loss": 4.112, + "step": 9944 + }, + { + "epoch": 0.09945, + "grad_norm": 0.6905176331929537, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 9945 + }, + { + "epoch": 0.09946, + "grad_norm": 0.7198664855118773, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 9946 + }, + { + "epoch": 0.09947, + "grad_norm": 0.752440611386409, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 9947 + }, + { + "epoch": 0.09948, + "grad_norm": 0.8192932165988773, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 9948 + }, + { + "epoch": 0.09949, + "grad_norm": 0.8209749794970975, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 9949 + }, + { + "epoch": 0.0995, + "grad_norm": 0.8358143036904804, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 9950 + }, + { + "epoch": 0.09951, + "grad_norm": 0.8867621716493573, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 9951 + }, + { + "epoch": 0.09952, + "grad_norm": 0.8466716064437273, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 9952 + }, + { + "epoch": 0.09953, + "grad_norm": 0.8254486041624173, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 9953 + }, + { + "epoch": 0.09954, + "grad_norm": 0.7712441746642007, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 9954 + }, + { + "epoch": 0.09955, + "grad_norm": 0.6903202853138538, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 9955 + }, + { + "epoch": 0.09956, + "grad_norm": 0.5906670263661717, + "learning_rate": 0.003, + "loss": 4.074, + "step": 9956 + }, + { + "epoch": 0.09957, + "grad_norm": 0.5743206687874738, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 9957 + }, + { + "epoch": 0.09958, + "grad_norm": 0.590105027476533, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 9958 + }, + { + "epoch": 0.09959, + "grad_norm": 0.6478146939289141, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 9959 + }, + { + "epoch": 0.0996, + "grad_norm": 0.6125903545494685, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 9960 + }, + { + "epoch": 0.09961, + "grad_norm": 0.6855179525277352, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 9961 + }, + { + "epoch": 0.09962, + "grad_norm": 0.8245686493079379, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 9962 + }, + { + "epoch": 0.09963, + "grad_norm": 1.1718745927375416, + "learning_rate": 0.003, + "loss": 4.106, + "step": 9963 + }, + { + "epoch": 0.09964, + "grad_norm": 0.7822113890013278, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 9964 + }, + { + "epoch": 0.09965, + "grad_norm": 0.6627168370652431, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 9965 + }, + { + "epoch": 0.09966, + "grad_norm": 0.6816975050009425, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 9966 + }, + { + "epoch": 0.09967, + "grad_norm": 0.6410163787617061, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 9967 + }, + { + "epoch": 0.09968, + "grad_norm": 0.5513934862668493, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 9968 + }, + { + "epoch": 0.09969, + "grad_norm": 0.5717861304898563, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 9969 + }, + { + "epoch": 0.0997, + "grad_norm": 0.6378222450638118, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 9970 + }, + { + "epoch": 0.09971, + "grad_norm": 0.6879760044616384, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 9971 + }, + { + "epoch": 0.09972, + "grad_norm": 0.7703686261003476, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 9972 + }, + { + "epoch": 0.09973, + "grad_norm": 0.8455904106463509, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 9973 + }, + { + "epoch": 0.09974, + "grad_norm": 0.9031224948235831, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 9974 + }, + { + "epoch": 0.09975, + "grad_norm": 0.7834229298155747, + "learning_rate": 0.003, + "loss": 4.091, + "step": 9975 + }, + { + "epoch": 0.09976, + "grad_norm": 0.7374608282517201, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 9976 + }, + { + "epoch": 0.09977, + "grad_norm": 0.6475022518581292, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 9977 + }, + { + "epoch": 0.09978, + "grad_norm": 0.6655914091026776, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 9978 + }, + { + "epoch": 0.09979, + "grad_norm": 0.6092837390948476, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 9979 + }, + { + "epoch": 0.0998, + "grad_norm": 0.6222297729643492, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 9980 + }, + { + "epoch": 0.09981, + "grad_norm": 0.5644346622119025, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 9981 + }, + { + "epoch": 0.09982, + "grad_norm": 0.5746206355689046, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 9982 + }, + { + "epoch": 0.09983, + "grad_norm": 0.6579561700974546, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 9983 + }, + { + "epoch": 0.09984, + "grad_norm": 0.9489942242805092, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 9984 + }, + { + "epoch": 0.09985, + "grad_norm": 1.5512495843402707, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 9985 + }, + { + "epoch": 0.09986, + "grad_norm": 0.5642219821127367, + "learning_rate": 0.003, + "loss": 4.096, + "step": 9986 + }, + { + "epoch": 0.09987, + "grad_norm": 0.8878804991370798, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 9987 + }, + { + "epoch": 0.09988, + "grad_norm": 0.9616073363226949, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 9988 + }, + { + "epoch": 0.09989, + "grad_norm": 0.905824845539826, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 9989 + }, + { + "epoch": 0.0999, + "grad_norm": 0.8647017499059604, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 9990 + }, + { + "epoch": 0.09991, + "grad_norm": 0.8032896275364889, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 9991 + }, + { + "epoch": 0.09992, + "grad_norm": 0.7279383274198165, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 9992 + }, + { + "epoch": 0.09993, + "grad_norm": 0.7799466970112703, + "learning_rate": 0.003, + "loss": 4.085, + "step": 9993 + }, + { + "epoch": 0.09994, + "grad_norm": 0.9145234797734555, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 9994 + }, + { + "epoch": 0.09995, + "grad_norm": 0.8201103396095457, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 9995 + }, + { + "epoch": 0.09996, + "grad_norm": 0.9047552292555922, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 9996 + }, + { + "epoch": 0.09997, + "grad_norm": 0.8963430475859631, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 9997 + }, + { + "epoch": 0.09998, + "grad_norm": 0.813945518877336, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 9998 + }, + { + "epoch": 0.09999, + "grad_norm": 0.6549507314006496, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 9999 + }, + { + "epoch": 0.1, + "grad_norm": 0.6519137253369942, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10000 + }, + { + "epoch": 0.10001, + "grad_norm": 0.6310538887421997, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10001 + }, + { + "epoch": 0.10002, + "grad_norm": 0.6391259755212524, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 10002 + }, + { + "epoch": 0.10003, + "grad_norm": 0.5978190100029864, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10003 + }, + { + "epoch": 0.10004, + "grad_norm": 0.7353677870344891, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 10004 + }, + { + "epoch": 0.10005, + "grad_norm": 0.8600701195416798, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 10005 + }, + { + "epoch": 0.10006, + "grad_norm": 0.7651339491329487, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 10006 + }, + { + "epoch": 0.10007, + "grad_norm": 0.8281866504989451, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 10007 + }, + { + "epoch": 0.10008, + "grad_norm": 0.898976147912649, + "learning_rate": 0.003, + "loss": 4.1079, + "step": 10008 + }, + { + "epoch": 0.10009, + "grad_norm": 0.9235345887520816, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10009 + }, + { + "epoch": 0.1001, + "grad_norm": 1.1045884205459653, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 10010 + }, + { + "epoch": 0.10011, + "grad_norm": 0.8047977290401939, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 10011 + }, + { + "epoch": 0.10012, + "grad_norm": 0.8491840634842334, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 10012 + }, + { + "epoch": 0.10013, + "grad_norm": 1.0828080137474874, + "learning_rate": 0.003, + "loss": 4.1233, + "step": 10013 + }, + { + "epoch": 0.10014, + "grad_norm": 1.1635619875301892, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 10014 + }, + { + "epoch": 0.10015, + "grad_norm": 0.897354796776556, + "learning_rate": 0.003, + "loss": 4.1283, + "step": 10015 + }, + { + "epoch": 0.10016, + "grad_norm": 0.7780665867165246, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 10016 + }, + { + "epoch": 0.10017, + "grad_norm": 0.683817569961156, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 10017 + }, + { + "epoch": 0.10018, + "grad_norm": 0.7740532824418541, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 10018 + }, + { + "epoch": 0.10019, + "grad_norm": 0.8606450031261723, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 10019 + }, + { + "epoch": 0.1002, + "grad_norm": 0.808735430817491, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 10020 + }, + { + "epoch": 0.10021, + "grad_norm": 0.6620052775434282, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 10021 + }, + { + "epoch": 0.10022, + "grad_norm": 0.6154822144201489, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 10022 + }, + { + "epoch": 0.10023, + "grad_norm": 0.5187491125035918, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10023 + }, + { + "epoch": 0.10024, + "grad_norm": 0.4942144582638463, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 10024 + }, + { + "epoch": 0.10025, + "grad_norm": 0.46654780612751734, + "learning_rate": 0.003, + "loss": 4.085, + "step": 10025 + }, + { + "epoch": 0.10026, + "grad_norm": 0.5004543480891657, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 10026 + }, + { + "epoch": 0.10027, + "grad_norm": 0.5513832529483311, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 10027 + }, + { + "epoch": 0.10028, + "grad_norm": 0.6352605183788917, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 10028 + }, + { + "epoch": 0.10029, + "grad_norm": 0.8367870123940754, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 10029 + }, + { + "epoch": 0.1003, + "grad_norm": 1.0581826188086585, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 10030 + }, + { + "epoch": 0.10031, + "grad_norm": 1.0220085066989417, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 10031 + }, + { + "epoch": 0.10032, + "grad_norm": 0.7783369834157056, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 10032 + }, + { + "epoch": 0.10033, + "grad_norm": 0.6709417988465439, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 10033 + }, + { + "epoch": 0.10034, + "grad_norm": 0.6669434095115762, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 10034 + }, + { + "epoch": 0.10035, + "grad_norm": 0.6147566105234249, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 10035 + }, + { + "epoch": 0.10036, + "grad_norm": 0.6529090399469691, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 10036 + }, + { + "epoch": 0.10037, + "grad_norm": 0.7107482522994412, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 10037 + }, + { + "epoch": 0.10038, + "grad_norm": 0.8197362150703974, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 10038 + }, + { + "epoch": 0.10039, + "grad_norm": 0.8295877869862686, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 10039 + }, + { + "epoch": 0.1004, + "grad_norm": 0.7374421686069658, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 10040 + }, + { + "epoch": 0.10041, + "grad_norm": 0.8214317115707387, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 10041 + }, + { + "epoch": 0.10042, + "grad_norm": 0.895440358479686, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 10042 + }, + { + "epoch": 0.10043, + "grad_norm": 0.9993692769809046, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 10043 + }, + { + "epoch": 0.10044, + "grad_norm": 1.072699158101983, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 10044 + }, + { + "epoch": 0.10045, + "grad_norm": 0.9388431886106355, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 10045 + }, + { + "epoch": 0.10046, + "grad_norm": 1.030884690149999, + "learning_rate": 0.003, + "loss": 4.114, + "step": 10046 + }, + { + "epoch": 0.10047, + "grad_norm": 1.0068468868079046, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 10047 + }, + { + "epoch": 0.10048, + "grad_norm": 0.7705600866643636, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 10048 + }, + { + "epoch": 0.10049, + "grad_norm": 0.8194123489368415, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 10049 + }, + { + "epoch": 0.1005, + "grad_norm": 0.869020771465238, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 10050 + }, + { + "epoch": 0.10051, + "grad_norm": 0.9307225832163902, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10051 + }, + { + "epoch": 0.10052, + "grad_norm": 0.8621096104565541, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 10052 + }, + { + "epoch": 0.10053, + "grad_norm": 0.7248964302923027, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 10053 + }, + { + "epoch": 0.10054, + "grad_norm": 0.6696966942680252, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 10054 + }, + { + "epoch": 0.10055, + "grad_norm": 0.7706194800493945, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 10055 + }, + { + "epoch": 0.10056, + "grad_norm": 0.7476737464111006, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 10056 + }, + { + "epoch": 0.10057, + "grad_norm": 0.7734457893757593, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 10057 + }, + { + "epoch": 0.10058, + "grad_norm": 0.678252649945835, + "learning_rate": 0.003, + "loss": 4.096, + "step": 10058 + }, + { + "epoch": 0.10059, + "grad_norm": 0.6595095553472218, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10059 + }, + { + "epoch": 0.1006, + "grad_norm": 0.7002337222649138, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 10060 + }, + { + "epoch": 0.10061, + "grad_norm": 0.7739912428589064, + "learning_rate": 0.003, + "loss": 4.072, + "step": 10061 + }, + { + "epoch": 0.10062, + "grad_norm": 0.8094580611217377, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 10062 + }, + { + "epoch": 0.10063, + "grad_norm": 0.7849235571724628, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10063 + }, + { + "epoch": 0.10064, + "grad_norm": 0.71688513935201, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 10064 + }, + { + "epoch": 0.10065, + "grad_norm": 0.5387402250397292, + "learning_rate": 0.003, + "loss": 4.103, + "step": 10065 + }, + { + "epoch": 0.10066, + "grad_norm": 0.5738573805956311, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 10066 + }, + { + "epoch": 0.10067, + "grad_norm": 0.5128916053318231, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 10067 + }, + { + "epoch": 0.10068, + "grad_norm": 0.5303861773665701, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 10068 + }, + { + "epoch": 0.10069, + "grad_norm": 0.5363307400406016, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 10069 + }, + { + "epoch": 0.1007, + "grad_norm": 0.6677766567970822, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 10070 + }, + { + "epoch": 0.10071, + "grad_norm": 0.869800192886798, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 10071 + }, + { + "epoch": 0.10072, + "grad_norm": 1.241986467031261, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 10072 + }, + { + "epoch": 0.10073, + "grad_norm": 0.7044633440415046, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 10073 + }, + { + "epoch": 0.10074, + "grad_norm": 0.6016954396918823, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 10074 + }, + { + "epoch": 0.10075, + "grad_norm": 0.6506653532205113, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 10075 + }, + { + "epoch": 0.10076, + "grad_norm": 0.5913298031674521, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 10076 + }, + { + "epoch": 0.10077, + "grad_norm": 0.6612064518297085, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 10077 + }, + { + "epoch": 0.10078, + "grad_norm": 0.7381204773241105, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 10078 + }, + { + "epoch": 0.10079, + "grad_norm": 0.9136392204855105, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 10079 + }, + { + "epoch": 0.1008, + "grad_norm": 1.0551033651714792, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 10080 + }, + { + "epoch": 0.10081, + "grad_norm": 1.021241306196213, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 10081 + }, + { + "epoch": 0.10082, + "grad_norm": 0.9444310820937741, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10082 + }, + { + "epoch": 0.10083, + "grad_norm": 0.8028394635763569, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 10083 + }, + { + "epoch": 0.10084, + "grad_norm": 0.6931291147221774, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 10084 + }, + { + "epoch": 0.10085, + "grad_norm": 0.6614930979877484, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 10085 + }, + { + "epoch": 0.10086, + "grad_norm": 0.6447028805806914, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10086 + }, + { + "epoch": 0.10087, + "grad_norm": 0.7271300834569303, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10087 + }, + { + "epoch": 0.10088, + "grad_norm": 0.8548799986148488, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 10088 + }, + { + "epoch": 0.10089, + "grad_norm": 0.8660557367235694, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10089 + }, + { + "epoch": 0.1009, + "grad_norm": 0.8905488162446568, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 10090 + }, + { + "epoch": 0.10091, + "grad_norm": 0.9129923392768977, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10091 + }, + { + "epoch": 0.10092, + "grad_norm": 0.9486091293972858, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 10092 + }, + { + "epoch": 0.10093, + "grad_norm": 0.9202478176786889, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 10093 + }, + { + "epoch": 0.10094, + "grad_norm": 0.9444438231215719, + "learning_rate": 0.003, + "loss": 4.084, + "step": 10094 + }, + { + "epoch": 0.10095, + "grad_norm": 1.0023114763189855, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 10095 + }, + { + "epoch": 0.10096, + "grad_norm": 0.9008830005116453, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10096 + }, + { + "epoch": 0.10097, + "grad_norm": 0.7394970221020885, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10097 + }, + { + "epoch": 0.10098, + "grad_norm": 0.7119640016754663, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 10098 + }, + { + "epoch": 0.10099, + "grad_norm": 0.6795956150432717, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 10099 + }, + { + "epoch": 0.101, + "grad_norm": 0.7985963421953124, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 10100 + }, + { + "epoch": 0.10101, + "grad_norm": 0.9676465329947534, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 10101 + }, + { + "epoch": 0.10102, + "grad_norm": 0.9402257720277828, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 10102 + }, + { + "epoch": 0.10103, + "grad_norm": 0.8874925029060675, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 10103 + }, + { + "epoch": 0.10104, + "grad_norm": 0.9117348708084444, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 10104 + }, + { + "epoch": 0.10105, + "grad_norm": 0.8479646908158324, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 10105 + }, + { + "epoch": 0.10106, + "grad_norm": 0.896593809928579, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 10106 + }, + { + "epoch": 0.10107, + "grad_norm": 0.8315621037343224, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 10107 + }, + { + "epoch": 0.10108, + "grad_norm": 0.7681718745034675, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 10108 + }, + { + "epoch": 0.10109, + "grad_norm": 0.8233892495101346, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 10109 + }, + { + "epoch": 0.1011, + "grad_norm": 0.9016640807163642, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 10110 + }, + { + "epoch": 0.10111, + "grad_norm": 0.9170672273091978, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 10111 + }, + { + "epoch": 0.10112, + "grad_norm": 0.8351702169958964, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 10112 + }, + { + "epoch": 0.10113, + "grad_norm": 0.8089461545174494, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 10113 + }, + { + "epoch": 0.10114, + "grad_norm": 0.9107383212481057, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 10114 + }, + { + "epoch": 0.10115, + "grad_norm": 0.8118768018834102, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10115 + }, + { + "epoch": 0.10116, + "grad_norm": 0.7983571219601531, + "learning_rate": 0.003, + "loss": 4.091, + "step": 10116 + }, + { + "epoch": 0.10117, + "grad_norm": 0.7951177221933025, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 10117 + }, + { + "epoch": 0.10118, + "grad_norm": 0.8413030315926632, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 10118 + }, + { + "epoch": 0.10119, + "grad_norm": 0.716518761890454, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 10119 + }, + { + "epoch": 0.1012, + "grad_norm": 0.6486824283257824, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10120 + }, + { + "epoch": 0.10121, + "grad_norm": 0.5880476149618893, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 10121 + }, + { + "epoch": 0.10122, + "grad_norm": 0.5866107750288472, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 10122 + }, + { + "epoch": 0.10123, + "grad_norm": 0.5811881253993841, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 10123 + }, + { + "epoch": 0.10124, + "grad_norm": 0.5569420366913549, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 10124 + }, + { + "epoch": 0.10125, + "grad_norm": 0.5942996797715101, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 10125 + }, + { + "epoch": 0.10126, + "grad_norm": 0.5670677013779793, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 10126 + }, + { + "epoch": 0.10127, + "grad_norm": 0.529089784800278, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 10127 + }, + { + "epoch": 0.10128, + "grad_norm": 0.4168566029164349, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 10128 + }, + { + "epoch": 0.10129, + "grad_norm": 0.4311731795710161, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 10129 + }, + { + "epoch": 0.1013, + "grad_norm": 0.4477041043125615, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 10130 + }, + { + "epoch": 0.10131, + "grad_norm": 0.5027821507640442, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 10131 + }, + { + "epoch": 0.10132, + "grad_norm": 0.5493559629395345, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 10132 + }, + { + "epoch": 0.10133, + "grad_norm": 0.6452629987298875, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 10133 + }, + { + "epoch": 0.10134, + "grad_norm": 0.8688443183887481, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 10134 + }, + { + "epoch": 0.10135, + "grad_norm": 1.1410657478991406, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10135 + }, + { + "epoch": 0.10136, + "grad_norm": 0.923735834137093, + "learning_rate": 0.003, + "loss": 4.062, + "step": 10136 + }, + { + "epoch": 0.10137, + "grad_norm": 0.7985668392366627, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 10137 + }, + { + "epoch": 0.10138, + "grad_norm": 0.6893008020293587, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 10138 + }, + { + "epoch": 0.10139, + "grad_norm": 0.699027589097982, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 10139 + }, + { + "epoch": 0.1014, + "grad_norm": 0.7173481888193998, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 10140 + }, + { + "epoch": 0.10141, + "grad_norm": 0.6959173004694686, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 10141 + }, + { + "epoch": 0.10142, + "grad_norm": 0.813260923915381, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 10142 + }, + { + "epoch": 0.10143, + "grad_norm": 0.8889759915027781, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 10143 + }, + { + "epoch": 0.10144, + "grad_norm": 0.6930759800585614, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 10144 + }, + { + "epoch": 0.10145, + "grad_norm": 0.7031416657156874, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 10145 + }, + { + "epoch": 0.10146, + "grad_norm": 0.8186555112243362, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 10146 + }, + { + "epoch": 0.10147, + "grad_norm": 0.972515461562128, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 10147 + }, + { + "epoch": 0.10148, + "grad_norm": 1.2251324017059566, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10148 + }, + { + "epoch": 0.10149, + "grad_norm": 0.7575767344901424, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 10149 + }, + { + "epoch": 0.1015, + "grad_norm": 0.7257739095105269, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 10150 + }, + { + "epoch": 0.10151, + "grad_norm": 0.6850837341823302, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 10151 + }, + { + "epoch": 0.10152, + "grad_norm": 0.6901837020889886, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 10152 + }, + { + "epoch": 0.10153, + "grad_norm": 0.7366995404122453, + "learning_rate": 0.003, + "loss": 4.089, + "step": 10153 + }, + { + "epoch": 0.10154, + "grad_norm": 0.7729200614601484, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 10154 + }, + { + "epoch": 0.10155, + "grad_norm": 0.9179351051298269, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 10155 + }, + { + "epoch": 0.10156, + "grad_norm": 1.079772577207657, + "learning_rate": 0.003, + "loss": 4.078, + "step": 10156 + }, + { + "epoch": 0.10157, + "grad_norm": 0.8478718907398735, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 10157 + }, + { + "epoch": 0.10158, + "grad_norm": 0.8076358725971214, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 10158 + }, + { + "epoch": 0.10159, + "grad_norm": 0.7182692438467507, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 10159 + }, + { + "epoch": 0.1016, + "grad_norm": 0.6354632732608569, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 10160 + }, + { + "epoch": 0.10161, + "grad_norm": 0.6096925827913215, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 10161 + }, + { + "epoch": 0.10162, + "grad_norm": 0.6412613510772903, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10162 + }, + { + "epoch": 0.10163, + "grad_norm": 0.7228497893130814, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 10163 + }, + { + "epoch": 0.10164, + "grad_norm": 0.8189338574542239, + "learning_rate": 0.003, + "loss": 4.107, + "step": 10164 + }, + { + "epoch": 0.10165, + "grad_norm": 0.8802770642728063, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 10165 + }, + { + "epoch": 0.10166, + "grad_norm": 0.9340783981991572, + "learning_rate": 0.003, + "loss": 4.102, + "step": 10166 + }, + { + "epoch": 0.10167, + "grad_norm": 1.0522661488882454, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 10167 + }, + { + "epoch": 0.10168, + "grad_norm": 1.0883984803163758, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 10168 + }, + { + "epoch": 0.10169, + "grad_norm": 0.8718130916305842, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 10169 + }, + { + "epoch": 0.1017, + "grad_norm": 0.7267906765452536, + "learning_rate": 0.003, + "loss": 4.09, + "step": 10170 + }, + { + "epoch": 0.10171, + "grad_norm": 0.6741776316878704, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 10171 + }, + { + "epoch": 0.10172, + "grad_norm": 0.6366479102607301, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 10172 + }, + { + "epoch": 0.10173, + "grad_norm": 0.6255451070036055, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 10173 + }, + { + "epoch": 0.10174, + "grad_norm": 0.6833151370840097, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 10174 + }, + { + "epoch": 0.10175, + "grad_norm": 0.6980219428980601, + "learning_rate": 0.003, + "loss": 4.073, + "step": 10175 + }, + { + "epoch": 0.10176, + "grad_norm": 0.7375496501515941, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 10176 + }, + { + "epoch": 0.10177, + "grad_norm": 0.7898802215772395, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 10177 + }, + { + "epoch": 0.10178, + "grad_norm": 0.9839723040732539, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 10178 + }, + { + "epoch": 0.10179, + "grad_norm": 1.0558006196476055, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 10179 + }, + { + "epoch": 0.1018, + "grad_norm": 0.8839567388022763, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 10180 + }, + { + "epoch": 0.10181, + "grad_norm": 0.8323042106193205, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 10181 + }, + { + "epoch": 0.10182, + "grad_norm": 0.8791136712281088, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 10182 + }, + { + "epoch": 0.10183, + "grad_norm": 1.0917668790659707, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 10183 + }, + { + "epoch": 0.10184, + "grad_norm": 0.9025576624348437, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 10184 + }, + { + "epoch": 0.10185, + "grad_norm": 0.7798371617092108, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 10185 + }, + { + "epoch": 0.10186, + "grad_norm": 0.7713060566227249, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 10186 + }, + { + "epoch": 0.10187, + "grad_norm": 0.7563056494272968, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 10187 + }, + { + "epoch": 0.10188, + "grad_norm": 0.7772228424671542, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 10188 + }, + { + "epoch": 0.10189, + "grad_norm": 0.8260486212304557, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 10189 + }, + { + "epoch": 0.1019, + "grad_norm": 1.1171480943910872, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 10190 + }, + { + "epoch": 0.10191, + "grad_norm": 0.9843614175223876, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 10191 + }, + { + "epoch": 0.10192, + "grad_norm": 1.036663948755249, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 10192 + }, + { + "epoch": 0.10193, + "grad_norm": 0.8972801457141254, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10193 + }, + { + "epoch": 0.10194, + "grad_norm": 0.8235311205411696, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 10194 + }, + { + "epoch": 0.10195, + "grad_norm": 0.709463331336577, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 10195 + }, + { + "epoch": 0.10196, + "grad_norm": 0.5235487875342617, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 10196 + }, + { + "epoch": 0.10197, + "grad_norm": 0.5246146886760584, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 10197 + }, + { + "epoch": 0.10198, + "grad_norm": 0.5530686360133628, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 10198 + }, + { + "epoch": 0.10199, + "grad_norm": 0.6533458323226683, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 10199 + }, + { + "epoch": 0.102, + "grad_norm": 0.8138884437121796, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 10200 + }, + { + "epoch": 0.10201, + "grad_norm": 0.8833147158811808, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 10201 + }, + { + "epoch": 0.10202, + "grad_norm": 0.8602495065910606, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 10202 + }, + { + "epoch": 0.10203, + "grad_norm": 0.8167686182900835, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 10203 + }, + { + "epoch": 0.10204, + "grad_norm": 0.69579025866685, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 10204 + }, + { + "epoch": 0.10205, + "grad_norm": 0.6952695693816464, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 10205 + }, + { + "epoch": 0.10206, + "grad_norm": 0.7178836748410159, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 10206 + }, + { + "epoch": 0.10207, + "grad_norm": 0.7051292005693832, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 10207 + }, + { + "epoch": 0.10208, + "grad_norm": 0.7123121092039982, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 10208 + }, + { + "epoch": 0.10209, + "grad_norm": 0.7357546073327017, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10209 + }, + { + "epoch": 0.1021, + "grad_norm": 0.738960921368254, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 10210 + }, + { + "epoch": 0.10211, + "grad_norm": 0.9042302755628594, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 10211 + }, + { + "epoch": 0.10212, + "grad_norm": 0.9328835687364458, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 10212 + }, + { + "epoch": 0.10213, + "grad_norm": 1.064128555829592, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 10213 + }, + { + "epoch": 0.10214, + "grad_norm": 0.8423985032180068, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 10214 + }, + { + "epoch": 0.10215, + "grad_norm": 0.6894767482193619, + "learning_rate": 0.003, + "loss": 4.107, + "step": 10215 + }, + { + "epoch": 0.10216, + "grad_norm": 0.6407174824026426, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10216 + }, + { + "epoch": 0.10217, + "grad_norm": 0.6930631872988513, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 10217 + }, + { + "epoch": 0.10218, + "grad_norm": 0.6736003865724515, + "learning_rate": 0.003, + "loss": 4.1127, + "step": 10218 + }, + { + "epoch": 0.10219, + "grad_norm": 0.6391476717729888, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 10219 + }, + { + "epoch": 0.1022, + "grad_norm": 0.626075339436956, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 10220 + }, + { + "epoch": 0.10221, + "grad_norm": 0.6109786482845987, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 10221 + }, + { + "epoch": 0.10222, + "grad_norm": 0.6282338053842801, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 10222 + }, + { + "epoch": 0.10223, + "grad_norm": 0.6832246627842549, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 10223 + }, + { + "epoch": 0.10224, + "grad_norm": 0.7169113704064414, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 10224 + }, + { + "epoch": 0.10225, + "grad_norm": 0.7323081942187616, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 10225 + }, + { + "epoch": 0.10226, + "grad_norm": 0.7406008465439835, + "learning_rate": 0.003, + "loss": 4.1302, + "step": 10226 + }, + { + "epoch": 0.10227, + "grad_norm": 0.785218333980764, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 10227 + }, + { + "epoch": 0.10228, + "grad_norm": 0.9898486890834313, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 10228 + }, + { + "epoch": 0.10229, + "grad_norm": 1.0742073532465095, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 10229 + }, + { + "epoch": 0.1023, + "grad_norm": 0.7265290601522981, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 10230 + }, + { + "epoch": 0.10231, + "grad_norm": 0.7367427007561755, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 10231 + }, + { + "epoch": 0.10232, + "grad_norm": 0.7481518210705496, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 10232 + }, + { + "epoch": 0.10233, + "grad_norm": 0.8005412014419764, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 10233 + }, + { + "epoch": 0.10234, + "grad_norm": 0.8303364545297103, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 10234 + }, + { + "epoch": 0.10235, + "grad_norm": 0.8310317993635913, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 10235 + }, + { + "epoch": 0.10236, + "grad_norm": 0.9054391176791641, + "learning_rate": 0.003, + "loss": 4.127, + "step": 10236 + }, + { + "epoch": 0.10237, + "grad_norm": 0.8435763363330704, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 10237 + }, + { + "epoch": 0.10238, + "grad_norm": 0.8115475750168918, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 10238 + }, + { + "epoch": 0.10239, + "grad_norm": 0.8628909482258778, + "learning_rate": 0.003, + "loss": 4.1279, + "step": 10239 + }, + { + "epoch": 0.1024, + "grad_norm": 0.9364777145567061, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 10240 + }, + { + "epoch": 0.10241, + "grad_norm": 1.146465968559167, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 10241 + }, + { + "epoch": 0.10242, + "grad_norm": 0.854898755875994, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 10242 + }, + { + "epoch": 0.10243, + "grad_norm": 0.7511718291044952, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 10243 + }, + { + "epoch": 0.10244, + "grad_norm": 0.7603646485366432, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10244 + }, + { + "epoch": 0.10245, + "grad_norm": 0.8105837774316208, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 10245 + }, + { + "epoch": 0.10246, + "grad_norm": 0.7977585495767636, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 10246 + }, + { + "epoch": 0.10247, + "grad_norm": 0.7506978997504432, + "learning_rate": 0.003, + "loss": 4.1188, + "step": 10247 + }, + { + "epoch": 0.10248, + "grad_norm": 0.8759386294051426, + "learning_rate": 0.003, + "loss": 4.115, + "step": 10248 + }, + { + "epoch": 0.10249, + "grad_norm": 0.8431799228218545, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 10249 + }, + { + "epoch": 0.1025, + "grad_norm": 0.7864842623687947, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 10250 + }, + { + "epoch": 0.10251, + "grad_norm": 0.7740703845382346, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 10251 + }, + { + "epoch": 0.10252, + "grad_norm": 0.8321898093447786, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 10252 + }, + { + "epoch": 0.10253, + "grad_norm": 0.7894410369300876, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 10253 + }, + { + "epoch": 0.10254, + "grad_norm": 0.8133697650165276, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 10254 + }, + { + "epoch": 0.10255, + "grad_norm": 0.744829951039707, + "learning_rate": 0.003, + "loss": 4.082, + "step": 10255 + }, + { + "epoch": 0.10256, + "grad_norm": 0.7780534762171264, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 10256 + }, + { + "epoch": 0.10257, + "grad_norm": 0.8808046453133321, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 10257 + }, + { + "epoch": 0.10258, + "grad_norm": 0.9129780513694674, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 10258 + }, + { + "epoch": 0.10259, + "grad_norm": 0.8386788452118967, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 10259 + }, + { + "epoch": 0.1026, + "grad_norm": 0.7688367051425977, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 10260 + }, + { + "epoch": 0.10261, + "grad_norm": 0.6828172453485549, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10261 + }, + { + "epoch": 0.10262, + "grad_norm": 0.6027470278842928, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10262 + }, + { + "epoch": 0.10263, + "grad_norm": 0.6134625435010462, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 10263 + }, + { + "epoch": 0.10264, + "grad_norm": 0.6601613262512446, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 10264 + }, + { + "epoch": 0.10265, + "grad_norm": 0.6925742701606704, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 10265 + }, + { + "epoch": 0.10266, + "grad_norm": 0.7494871872755563, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 10266 + }, + { + "epoch": 0.10267, + "grad_norm": 0.7626957921447522, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 10267 + }, + { + "epoch": 0.10268, + "grad_norm": 0.8706405052637655, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 10268 + }, + { + "epoch": 0.10269, + "grad_norm": 0.9926896776195115, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 10269 + }, + { + "epoch": 0.1027, + "grad_norm": 0.9916626731747161, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 10270 + }, + { + "epoch": 0.10271, + "grad_norm": 0.96407920565692, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 10271 + }, + { + "epoch": 0.10272, + "grad_norm": 0.8960155500010829, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 10272 + }, + { + "epoch": 0.10273, + "grad_norm": 0.7192574545668858, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10273 + }, + { + "epoch": 0.10274, + "grad_norm": 0.7829507156964814, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 10274 + }, + { + "epoch": 0.10275, + "grad_norm": 0.8067743579386221, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 10275 + }, + { + "epoch": 0.10276, + "grad_norm": 0.9012713038007449, + "learning_rate": 0.003, + "loss": 4.095, + "step": 10276 + }, + { + "epoch": 0.10277, + "grad_norm": 1.0017053952207466, + "learning_rate": 0.003, + "loss": 4.087, + "step": 10277 + }, + { + "epoch": 0.10278, + "grad_norm": 1.1196366403325528, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 10278 + }, + { + "epoch": 0.10279, + "grad_norm": 0.7036241002443819, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 10279 + }, + { + "epoch": 0.1028, + "grad_norm": 0.675873333438298, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 10280 + }, + { + "epoch": 0.10281, + "grad_norm": 0.7188694294424975, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 10281 + }, + { + "epoch": 0.10282, + "grad_norm": 0.6458206776287501, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 10282 + }, + { + "epoch": 0.10283, + "grad_norm": 0.6503790584246892, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 10283 + }, + { + "epoch": 0.10284, + "grad_norm": 0.7512150394872587, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 10284 + }, + { + "epoch": 0.10285, + "grad_norm": 0.8041418564413038, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 10285 + }, + { + "epoch": 0.10286, + "grad_norm": 0.7393185584957023, + "learning_rate": 0.003, + "loss": 4.085, + "step": 10286 + }, + { + "epoch": 0.10287, + "grad_norm": 0.6536497052383766, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 10287 + }, + { + "epoch": 0.10288, + "grad_norm": 0.6895525064317385, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 10288 + }, + { + "epoch": 0.10289, + "grad_norm": 0.71141474436633, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 10289 + }, + { + "epoch": 0.1029, + "grad_norm": 0.7863696654596326, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 10290 + }, + { + "epoch": 0.10291, + "grad_norm": 0.7433309300210272, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 10291 + }, + { + "epoch": 0.10292, + "grad_norm": 0.7228574492515069, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 10292 + }, + { + "epoch": 0.10293, + "grad_norm": 0.7328543775889537, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 10293 + }, + { + "epoch": 0.10294, + "grad_norm": 0.7119459365331811, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 10294 + }, + { + "epoch": 0.10295, + "grad_norm": 0.6902154836264375, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 10295 + }, + { + "epoch": 0.10296, + "grad_norm": 0.6673321189400658, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 10296 + }, + { + "epoch": 0.10297, + "grad_norm": 0.6206236886964629, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 10297 + }, + { + "epoch": 0.10298, + "grad_norm": 0.580549324398935, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 10298 + }, + { + "epoch": 0.10299, + "grad_norm": 0.5568108267250715, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10299 + }, + { + "epoch": 0.103, + "grad_norm": 0.6459850251191824, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 10300 + }, + { + "epoch": 0.10301, + "grad_norm": 0.7870413569880143, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 10301 + }, + { + "epoch": 0.10302, + "grad_norm": 0.9424107245640844, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 10302 + }, + { + "epoch": 0.10303, + "grad_norm": 1.0280411938586382, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 10303 + }, + { + "epoch": 0.10304, + "grad_norm": 1.1122147913714624, + "learning_rate": 0.003, + "loss": 4.105, + "step": 10304 + }, + { + "epoch": 0.10305, + "grad_norm": 0.935437875996847, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 10305 + }, + { + "epoch": 0.10306, + "grad_norm": 0.9434216723466147, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 10306 + }, + { + "epoch": 0.10307, + "grad_norm": 1.077419206259069, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 10307 + }, + { + "epoch": 0.10308, + "grad_norm": 0.9260799956690462, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10308 + }, + { + "epoch": 0.10309, + "grad_norm": 0.788475483287031, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 10309 + }, + { + "epoch": 0.1031, + "grad_norm": 0.7548519322564272, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 10310 + }, + { + "epoch": 0.10311, + "grad_norm": 0.8490099710145201, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 10311 + }, + { + "epoch": 0.10312, + "grad_norm": 0.9754889415393432, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 10312 + }, + { + "epoch": 0.10313, + "grad_norm": 0.9725068876707715, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10313 + }, + { + "epoch": 0.10314, + "grad_norm": 1.008302977550403, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 10314 + }, + { + "epoch": 0.10315, + "grad_norm": 1.1068450571626192, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 10315 + }, + { + "epoch": 0.10316, + "grad_norm": 1.01689260859524, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 10316 + }, + { + "epoch": 0.10317, + "grad_norm": 1.0399611525497334, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 10317 + }, + { + "epoch": 0.10318, + "grad_norm": 1.0664621024838172, + "learning_rate": 0.003, + "loss": 4.092, + "step": 10318 + }, + { + "epoch": 0.10319, + "grad_norm": 0.8887351487113996, + "learning_rate": 0.003, + "loss": 4.106, + "step": 10319 + }, + { + "epoch": 0.1032, + "grad_norm": 0.9777260247175278, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 10320 + }, + { + "epoch": 0.10321, + "grad_norm": 1.0855049175533027, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 10321 + }, + { + "epoch": 0.10322, + "grad_norm": 0.963506360245415, + "learning_rate": 0.003, + "loss": 4.1389, + "step": 10322 + }, + { + "epoch": 0.10323, + "grad_norm": 0.9881045744868919, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 10323 + }, + { + "epoch": 0.10324, + "grad_norm": 0.8742451143437673, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 10324 + }, + { + "epoch": 0.10325, + "grad_norm": 0.7592965201742712, + "learning_rate": 0.003, + "loss": 4.098, + "step": 10325 + }, + { + "epoch": 0.10326, + "grad_norm": 0.6924762638567085, + "learning_rate": 0.003, + "loss": 4.111, + "step": 10326 + }, + { + "epoch": 0.10327, + "grad_norm": 0.7996862828960193, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 10327 + }, + { + "epoch": 0.10328, + "grad_norm": 0.8472096443774338, + "learning_rate": 0.003, + "loss": 4.087, + "step": 10328 + }, + { + "epoch": 0.10329, + "grad_norm": 0.80863735984161, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 10329 + }, + { + "epoch": 0.1033, + "grad_norm": 0.7470728233214627, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 10330 + }, + { + "epoch": 0.10331, + "grad_norm": 0.6111385808771548, + "learning_rate": 0.003, + "loss": 4.077, + "step": 10331 + }, + { + "epoch": 0.10332, + "grad_norm": 0.5181245173636219, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 10332 + }, + { + "epoch": 0.10333, + "grad_norm": 0.5288066580362742, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 10333 + }, + { + "epoch": 0.10334, + "grad_norm": 0.5522664037256619, + "learning_rate": 0.003, + "loss": 4.1144, + "step": 10334 + }, + { + "epoch": 0.10335, + "grad_norm": 0.6137655038412625, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 10335 + }, + { + "epoch": 0.10336, + "grad_norm": 0.7105459077980683, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 10336 + }, + { + "epoch": 0.10337, + "grad_norm": 0.8033414113689806, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 10337 + }, + { + "epoch": 0.10338, + "grad_norm": 0.76450882628852, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 10338 + }, + { + "epoch": 0.10339, + "grad_norm": 0.6766193484175521, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 10339 + }, + { + "epoch": 0.1034, + "grad_norm": 0.7046081840372489, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 10340 + }, + { + "epoch": 0.10341, + "grad_norm": 0.6881196394385097, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 10341 + }, + { + "epoch": 0.10342, + "grad_norm": 0.7019620860313454, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 10342 + }, + { + "epoch": 0.10343, + "grad_norm": 0.7669457179418919, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 10343 + }, + { + "epoch": 0.10344, + "grad_norm": 0.8562986464162007, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 10344 + }, + { + "epoch": 0.10345, + "grad_norm": 0.9678576326676694, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 10345 + }, + { + "epoch": 0.10346, + "grad_norm": 0.988840553225034, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 10346 + }, + { + "epoch": 0.10347, + "grad_norm": 0.8761665279793409, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 10347 + }, + { + "epoch": 0.10348, + "grad_norm": 0.8070465433399605, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 10348 + }, + { + "epoch": 0.10349, + "grad_norm": 0.746677695053773, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 10349 + }, + { + "epoch": 0.1035, + "grad_norm": 0.7251075414951131, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 10350 + }, + { + "epoch": 0.10351, + "grad_norm": 0.682523058274381, + "learning_rate": 0.003, + "loss": 4.093, + "step": 10351 + }, + { + "epoch": 0.10352, + "grad_norm": 0.5549826583410974, + "learning_rate": 0.003, + "loss": 4.078, + "step": 10352 + }, + { + "epoch": 0.10353, + "grad_norm": 0.6184831584530538, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 10353 + }, + { + "epoch": 0.10354, + "grad_norm": 0.6154810565062979, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 10354 + }, + { + "epoch": 0.10355, + "grad_norm": 0.6013274566749466, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 10355 + }, + { + "epoch": 0.10356, + "grad_norm": 0.5790494966726321, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10356 + }, + { + "epoch": 0.10357, + "grad_norm": 0.5815749905417382, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 10357 + }, + { + "epoch": 0.10358, + "grad_norm": 0.5961417322223677, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 10358 + }, + { + "epoch": 0.10359, + "grad_norm": 0.612228084547915, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 10359 + }, + { + "epoch": 0.1036, + "grad_norm": 0.5632606844307874, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 10360 + }, + { + "epoch": 0.10361, + "grad_norm": 0.5944178213854017, + "learning_rate": 0.003, + "loss": 4.036, + "step": 10361 + }, + { + "epoch": 0.10362, + "grad_norm": 0.6050895286964695, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 10362 + }, + { + "epoch": 0.10363, + "grad_norm": 0.5893502071647355, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 10363 + }, + { + "epoch": 0.10364, + "grad_norm": 0.6480921596779424, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 10364 + }, + { + "epoch": 0.10365, + "grad_norm": 0.6655516728542986, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 10365 + }, + { + "epoch": 0.10366, + "grad_norm": 0.841939927056256, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 10366 + }, + { + "epoch": 0.10367, + "grad_norm": 1.2925940203305464, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 10367 + }, + { + "epoch": 0.10368, + "grad_norm": 1.0721423681105018, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 10368 + }, + { + "epoch": 0.10369, + "grad_norm": 0.8054815871522366, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 10369 + }, + { + "epoch": 0.1037, + "grad_norm": 0.6431254709101457, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 10370 + }, + { + "epoch": 0.10371, + "grad_norm": 0.6771185643178579, + "learning_rate": 0.003, + "loss": 4.049, + "step": 10371 + }, + { + "epoch": 0.10372, + "grad_norm": 0.6505722328550112, + "learning_rate": 0.003, + "loss": 4.061, + "step": 10372 + }, + { + "epoch": 0.10373, + "grad_norm": 0.649150897107484, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 10373 + }, + { + "epoch": 0.10374, + "grad_norm": 0.7655192327120619, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 10374 + }, + { + "epoch": 0.10375, + "grad_norm": 0.8276458451965146, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 10375 + }, + { + "epoch": 0.10376, + "grad_norm": 0.8401543111084332, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 10376 + }, + { + "epoch": 0.10377, + "grad_norm": 0.961981488825057, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10377 + }, + { + "epoch": 0.10378, + "grad_norm": 1.0528802151842687, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 10378 + }, + { + "epoch": 0.10379, + "grad_norm": 0.9122713415226914, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 10379 + }, + { + "epoch": 0.1038, + "grad_norm": 0.8191396318548235, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 10380 + }, + { + "epoch": 0.10381, + "grad_norm": 0.7785335876041576, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 10381 + }, + { + "epoch": 0.10382, + "grad_norm": 0.8640344513391571, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 10382 + }, + { + "epoch": 0.10383, + "grad_norm": 1.0100767424979953, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 10383 + }, + { + "epoch": 0.10384, + "grad_norm": 1.0586243617599906, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 10384 + }, + { + "epoch": 0.10385, + "grad_norm": 0.8624828769831347, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 10385 + }, + { + "epoch": 0.10386, + "grad_norm": 0.8830596935983357, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 10386 + }, + { + "epoch": 0.10387, + "grad_norm": 0.8306326059978796, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 10387 + }, + { + "epoch": 0.10388, + "grad_norm": 0.8272101216281712, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 10388 + }, + { + "epoch": 0.10389, + "grad_norm": 0.9009021769238307, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 10389 + }, + { + "epoch": 0.1039, + "grad_norm": 1.0391034226876843, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 10390 + }, + { + "epoch": 0.10391, + "grad_norm": 0.9858486051647656, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 10391 + }, + { + "epoch": 0.10392, + "grad_norm": 0.817170907122026, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 10392 + }, + { + "epoch": 0.10393, + "grad_norm": 0.7813731942805362, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10393 + }, + { + "epoch": 0.10394, + "grad_norm": 0.7491777949113574, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10394 + }, + { + "epoch": 0.10395, + "grad_norm": 0.9168636740102267, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 10395 + }, + { + "epoch": 0.10396, + "grad_norm": 1.0881876341009609, + "learning_rate": 0.003, + "loss": 4.1147, + "step": 10396 + }, + { + "epoch": 0.10397, + "grad_norm": 0.8483418535870134, + "learning_rate": 0.003, + "loss": 4.1204, + "step": 10397 + }, + { + "epoch": 0.10398, + "grad_norm": 0.8004552390402355, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 10398 + }, + { + "epoch": 0.10399, + "grad_norm": 1.0550255797082628, + "learning_rate": 0.003, + "loss": 4.12, + "step": 10399 + }, + { + "epoch": 0.104, + "grad_norm": 1.1469901095195558, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 10400 + }, + { + "epoch": 0.10401, + "grad_norm": 0.776163770261994, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 10401 + }, + { + "epoch": 0.10402, + "grad_norm": 0.7191911818065254, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10402 + }, + { + "epoch": 0.10403, + "grad_norm": 0.7757679538550868, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10403 + }, + { + "epoch": 0.10404, + "grad_norm": 0.9017847369719004, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 10404 + }, + { + "epoch": 0.10405, + "grad_norm": 1.0205196229352549, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 10405 + }, + { + "epoch": 0.10406, + "grad_norm": 1.0561034007451116, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 10406 + }, + { + "epoch": 0.10407, + "grad_norm": 0.8382012374833445, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 10407 + }, + { + "epoch": 0.10408, + "grad_norm": 0.6440441204977746, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 10408 + }, + { + "epoch": 0.10409, + "grad_norm": 0.6231554547472592, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10409 + }, + { + "epoch": 0.1041, + "grad_norm": 0.5895655256675674, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 10410 + }, + { + "epoch": 0.10411, + "grad_norm": 0.6078869262220562, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 10411 + }, + { + "epoch": 0.10412, + "grad_norm": 0.5714544500792731, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 10412 + }, + { + "epoch": 0.10413, + "grad_norm": 0.4888734408355746, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10413 + }, + { + "epoch": 0.10414, + "grad_norm": 0.45177378648238387, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 10414 + }, + { + "epoch": 0.10415, + "grad_norm": 0.4464755195762157, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 10415 + }, + { + "epoch": 0.10416, + "grad_norm": 0.4119578434609014, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 10416 + }, + { + "epoch": 0.10417, + "grad_norm": 0.4530981054257146, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 10417 + }, + { + "epoch": 0.10418, + "grad_norm": 0.5625746495824158, + "learning_rate": 0.003, + "loss": 4.055, + "step": 10418 + }, + { + "epoch": 0.10419, + "grad_norm": 0.7189451960829651, + "learning_rate": 0.003, + "loss": 4.1232, + "step": 10419 + }, + { + "epoch": 0.1042, + "grad_norm": 0.9831700376646376, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 10420 + }, + { + "epoch": 0.10421, + "grad_norm": 1.1445532836627261, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 10421 + }, + { + "epoch": 0.10422, + "grad_norm": 0.7082563767107952, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10422 + }, + { + "epoch": 0.10423, + "grad_norm": 0.6874627848858307, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 10423 + }, + { + "epoch": 0.10424, + "grad_norm": 0.730815564511028, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 10424 + }, + { + "epoch": 0.10425, + "grad_norm": 0.6651685498466663, + "learning_rate": 0.003, + "loss": 4.077, + "step": 10425 + }, + { + "epoch": 0.10426, + "grad_norm": 0.629940706978873, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 10426 + }, + { + "epoch": 0.10427, + "grad_norm": 0.662314993288643, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 10427 + }, + { + "epoch": 0.10428, + "grad_norm": 0.7008715707921093, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 10428 + }, + { + "epoch": 0.10429, + "grad_norm": 0.774968411885718, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 10429 + }, + { + "epoch": 0.1043, + "grad_norm": 0.7183561816195599, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 10430 + }, + { + "epoch": 0.10431, + "grad_norm": 0.7603457171655063, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 10431 + }, + { + "epoch": 0.10432, + "grad_norm": 0.79045224783535, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 10432 + }, + { + "epoch": 0.10433, + "grad_norm": 0.9599162387244943, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 10433 + }, + { + "epoch": 0.10434, + "grad_norm": 0.8214963345150575, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 10434 + }, + { + "epoch": 0.10435, + "grad_norm": 0.7767373236471652, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 10435 + }, + { + "epoch": 0.10436, + "grad_norm": 0.7865430681912782, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 10436 + }, + { + "epoch": 0.10437, + "grad_norm": 0.7933364160947659, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 10437 + }, + { + "epoch": 0.10438, + "grad_norm": 0.7277652813568619, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 10438 + }, + { + "epoch": 0.10439, + "grad_norm": 0.6839207059096676, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 10439 + }, + { + "epoch": 0.1044, + "grad_norm": 0.6746530544011542, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10440 + }, + { + "epoch": 0.10441, + "grad_norm": 0.7864041796225456, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 10441 + }, + { + "epoch": 0.10442, + "grad_norm": 0.8804608511981953, + "learning_rate": 0.003, + "loss": 4.069, + "step": 10442 + }, + { + "epoch": 0.10443, + "grad_norm": 1.0508300123164835, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 10443 + }, + { + "epoch": 0.10444, + "grad_norm": 1.051811976800046, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 10444 + }, + { + "epoch": 0.10445, + "grad_norm": 0.7888503995506437, + "learning_rate": 0.003, + "loss": 4.1071, + "step": 10445 + }, + { + "epoch": 0.10446, + "grad_norm": 0.6603911887857837, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 10446 + }, + { + "epoch": 0.10447, + "grad_norm": 0.8408603404059548, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 10447 + }, + { + "epoch": 0.10448, + "grad_norm": 0.9695074227496694, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 10448 + }, + { + "epoch": 0.10449, + "grad_norm": 0.9822010322827212, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10449 + }, + { + "epoch": 0.1045, + "grad_norm": 1.0012725608752013, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 10450 + }, + { + "epoch": 0.10451, + "grad_norm": 1.009392360148877, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 10451 + }, + { + "epoch": 0.10452, + "grad_norm": 1.0090387388437994, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 10452 + }, + { + "epoch": 0.10453, + "grad_norm": 0.8943599349231653, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 10453 + }, + { + "epoch": 0.10454, + "grad_norm": 0.8560707064477248, + "learning_rate": 0.003, + "loss": 4.083, + "step": 10454 + }, + { + "epoch": 0.10455, + "grad_norm": 0.8752369980580909, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 10455 + }, + { + "epoch": 0.10456, + "grad_norm": 0.9130498227436277, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 10456 + }, + { + "epoch": 0.10457, + "grad_norm": 0.8067284336559211, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 10457 + }, + { + "epoch": 0.10458, + "grad_norm": 0.7395291417558032, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 10458 + }, + { + "epoch": 0.10459, + "grad_norm": 0.7297382370955492, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 10459 + }, + { + "epoch": 0.1046, + "grad_norm": 0.8594227462287262, + "learning_rate": 0.003, + "loss": 4.1102, + "step": 10460 + }, + { + "epoch": 0.10461, + "grad_norm": 0.8861318685991226, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 10461 + }, + { + "epoch": 0.10462, + "grad_norm": 0.8083739717834618, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 10462 + }, + { + "epoch": 0.10463, + "grad_norm": 0.855447088915927, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 10463 + }, + { + "epoch": 0.10464, + "grad_norm": 0.8309265331948995, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 10464 + }, + { + "epoch": 0.10465, + "grad_norm": 0.8950247767863003, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 10465 + }, + { + "epoch": 0.10466, + "grad_norm": 0.9702711674796212, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 10466 + }, + { + "epoch": 0.10467, + "grad_norm": 1.0808015279280165, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 10467 + }, + { + "epoch": 0.10468, + "grad_norm": 0.8796165704446247, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 10468 + }, + { + "epoch": 0.10469, + "grad_norm": 0.6864191337528118, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 10469 + }, + { + "epoch": 0.1047, + "grad_norm": 0.642911293497494, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 10470 + }, + { + "epoch": 0.10471, + "grad_norm": 0.6401286669093053, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 10471 + }, + { + "epoch": 0.10472, + "grad_norm": 0.7623940021861698, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 10472 + }, + { + "epoch": 0.10473, + "grad_norm": 0.7757677108816563, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 10473 + }, + { + "epoch": 0.10474, + "grad_norm": 0.8165254615464338, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 10474 + }, + { + "epoch": 0.10475, + "grad_norm": 0.6947649086407327, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 10475 + }, + { + "epoch": 0.10476, + "grad_norm": 0.609454645718345, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 10476 + }, + { + "epoch": 0.10477, + "grad_norm": 0.5156181671942854, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 10477 + }, + { + "epoch": 0.10478, + "grad_norm": 0.550404325359368, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 10478 + }, + { + "epoch": 0.10479, + "grad_norm": 0.5844190234215697, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 10479 + }, + { + "epoch": 0.1048, + "grad_norm": 0.6475683647990985, + "learning_rate": 0.003, + "loss": 4.062, + "step": 10480 + }, + { + "epoch": 0.10481, + "grad_norm": 0.765493534126949, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 10481 + }, + { + "epoch": 0.10482, + "grad_norm": 0.7460812863968737, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 10482 + }, + { + "epoch": 0.10483, + "grad_norm": 0.760367664758415, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 10483 + }, + { + "epoch": 0.10484, + "grad_norm": 0.7889324440591884, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 10484 + }, + { + "epoch": 0.10485, + "grad_norm": 0.7000125978773567, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 10485 + }, + { + "epoch": 0.10486, + "grad_norm": 0.697394489855323, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 10486 + }, + { + "epoch": 0.10487, + "grad_norm": 0.7734234905452742, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 10487 + }, + { + "epoch": 0.10488, + "grad_norm": 0.9578169878035425, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 10488 + }, + { + "epoch": 0.10489, + "grad_norm": 1.1043883747627603, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 10489 + }, + { + "epoch": 0.1049, + "grad_norm": 0.8806546383022705, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 10490 + }, + { + "epoch": 0.10491, + "grad_norm": 0.8581015926193526, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 10491 + }, + { + "epoch": 0.10492, + "grad_norm": 0.8562928611424699, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 10492 + }, + { + "epoch": 0.10493, + "grad_norm": 0.8058684115768378, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 10493 + }, + { + "epoch": 0.10494, + "grad_norm": 0.7770554096388587, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 10494 + }, + { + "epoch": 0.10495, + "grad_norm": 0.7334547382322241, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 10495 + }, + { + "epoch": 0.10496, + "grad_norm": 0.718416025393358, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 10496 + }, + { + "epoch": 0.10497, + "grad_norm": 0.7792435850503384, + "learning_rate": 0.003, + "loss": 4.067, + "step": 10497 + }, + { + "epoch": 0.10498, + "grad_norm": 0.8412922868706377, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 10498 + }, + { + "epoch": 0.10499, + "grad_norm": 0.9509337789006239, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 10499 + }, + { + "epoch": 0.105, + "grad_norm": 0.9912844508190685, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 10500 + }, + { + "epoch": 0.10501, + "grad_norm": 0.8580834861139071, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 10501 + }, + { + "epoch": 0.10502, + "grad_norm": 0.8167382602075896, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 10502 + }, + { + "epoch": 0.10503, + "grad_norm": 0.8743323585619591, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 10503 + }, + { + "epoch": 0.10504, + "grad_norm": 0.8327457707468423, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 10504 + }, + { + "epoch": 0.10505, + "grad_norm": 0.801391957128637, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 10505 + }, + { + "epoch": 0.10506, + "grad_norm": 0.6394241038195827, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 10506 + }, + { + "epoch": 0.10507, + "grad_norm": 0.6433059411947507, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 10507 + }, + { + "epoch": 0.10508, + "grad_norm": 0.6555621284685157, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 10508 + }, + { + "epoch": 0.10509, + "grad_norm": 0.6181176365447185, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 10509 + }, + { + "epoch": 0.1051, + "grad_norm": 0.65166605534626, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 10510 + }, + { + "epoch": 0.10511, + "grad_norm": 0.7356173809364867, + "learning_rate": 0.003, + "loss": 4.052, + "step": 10511 + }, + { + "epoch": 0.10512, + "grad_norm": 1.013079842957836, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 10512 + }, + { + "epoch": 0.10513, + "grad_norm": 1.1755143739073186, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 10513 + }, + { + "epoch": 0.10514, + "grad_norm": 0.8163557674329753, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 10514 + }, + { + "epoch": 0.10515, + "grad_norm": 0.7085350074934578, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 10515 + }, + { + "epoch": 0.10516, + "grad_norm": 0.6658791858624886, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 10516 + }, + { + "epoch": 0.10517, + "grad_norm": 0.6041515015315367, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 10517 + }, + { + "epoch": 0.10518, + "grad_norm": 0.7047204808794802, + "learning_rate": 0.003, + "loss": 4.102, + "step": 10518 + }, + { + "epoch": 0.10519, + "grad_norm": 0.7244412290591962, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 10519 + }, + { + "epoch": 0.1052, + "grad_norm": 0.7395861089702314, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 10520 + }, + { + "epoch": 0.10521, + "grad_norm": 0.7665016692200823, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 10521 + }, + { + "epoch": 0.10522, + "grad_norm": 0.7792629094064418, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 10522 + }, + { + "epoch": 0.10523, + "grad_norm": 0.7953210132825478, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 10523 + }, + { + "epoch": 0.10524, + "grad_norm": 0.7991555641259503, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 10524 + }, + { + "epoch": 0.10525, + "grad_norm": 0.8238402147042898, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 10525 + }, + { + "epoch": 0.10526, + "grad_norm": 0.7163491665622135, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 10526 + }, + { + "epoch": 0.10527, + "grad_norm": 0.6545420151814723, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 10527 + }, + { + "epoch": 0.10528, + "grad_norm": 0.7166141333279368, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 10528 + }, + { + "epoch": 0.10529, + "grad_norm": 0.9100729617540565, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 10529 + }, + { + "epoch": 0.1053, + "grad_norm": 1.0028018301025132, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 10530 + }, + { + "epoch": 0.10531, + "grad_norm": 1.0807675817900035, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 10531 + }, + { + "epoch": 0.10532, + "grad_norm": 0.8870026361329219, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 10532 + }, + { + "epoch": 0.10533, + "grad_norm": 0.893246559986549, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 10533 + }, + { + "epoch": 0.10534, + "grad_norm": 0.8877419641098879, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 10534 + }, + { + "epoch": 0.10535, + "grad_norm": 0.804120621184703, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 10535 + }, + { + "epoch": 0.10536, + "grad_norm": 0.731737971690129, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 10536 + }, + { + "epoch": 0.10537, + "grad_norm": 0.6685167604634952, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 10537 + }, + { + "epoch": 0.10538, + "grad_norm": 0.6041586651576828, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 10538 + }, + { + "epoch": 0.10539, + "grad_norm": 0.5722575391865711, + "learning_rate": 0.003, + "loss": 4.072, + "step": 10539 + }, + { + "epoch": 0.1054, + "grad_norm": 0.6001113282044517, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 10540 + }, + { + "epoch": 0.10541, + "grad_norm": 0.5762315876118429, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 10541 + }, + { + "epoch": 0.10542, + "grad_norm": 0.563179289202697, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10542 + }, + { + "epoch": 0.10543, + "grad_norm": 0.5947299652169051, + "learning_rate": 0.003, + "loss": 4.072, + "step": 10543 + }, + { + "epoch": 0.10544, + "grad_norm": 0.6985463762361143, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 10544 + }, + { + "epoch": 0.10545, + "grad_norm": 0.8488458192713703, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 10545 + }, + { + "epoch": 0.10546, + "grad_norm": 1.134283035666868, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 10546 + }, + { + "epoch": 0.10547, + "grad_norm": 1.008621876461993, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 10547 + }, + { + "epoch": 0.10548, + "grad_norm": 0.8910932087633671, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 10548 + }, + { + "epoch": 0.10549, + "grad_norm": 0.809160397609863, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 10549 + }, + { + "epoch": 0.1055, + "grad_norm": 0.8218623812846083, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 10550 + }, + { + "epoch": 0.10551, + "grad_norm": 0.7992528000231008, + "learning_rate": 0.003, + "loss": 4.107, + "step": 10551 + }, + { + "epoch": 0.10552, + "grad_norm": 0.86719344382475, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 10552 + }, + { + "epoch": 0.10553, + "grad_norm": 0.908060487944853, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 10553 + }, + { + "epoch": 0.10554, + "grad_norm": 0.8652695049802602, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 10554 + }, + { + "epoch": 0.10555, + "grad_norm": 0.6860164989967895, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 10555 + }, + { + "epoch": 0.10556, + "grad_norm": 0.6378293392988055, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 10556 + }, + { + "epoch": 0.10557, + "grad_norm": 0.62800965514661, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 10557 + }, + { + "epoch": 0.10558, + "grad_norm": 0.6963803928423222, + "learning_rate": 0.003, + "loss": 4.062, + "step": 10558 + }, + { + "epoch": 0.10559, + "grad_norm": 0.6865270024171043, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 10559 + }, + { + "epoch": 0.1056, + "grad_norm": 0.737499509213761, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 10560 + }, + { + "epoch": 0.10561, + "grad_norm": 0.8984522058594936, + "learning_rate": 0.003, + "loss": 4.077, + "step": 10561 + }, + { + "epoch": 0.10562, + "grad_norm": 1.1339394563977538, + "learning_rate": 0.003, + "loss": 4.066, + "step": 10562 + }, + { + "epoch": 0.10563, + "grad_norm": 0.8198771181383394, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 10563 + }, + { + "epoch": 0.10564, + "grad_norm": 0.7658877274749886, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 10564 + }, + { + "epoch": 0.10565, + "grad_norm": 0.8482606637522958, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 10565 + }, + { + "epoch": 0.10566, + "grad_norm": 0.8347786010621645, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 10566 + }, + { + "epoch": 0.10567, + "grad_norm": 0.8843860041980399, + "learning_rate": 0.003, + "loss": 4.1206, + "step": 10567 + }, + { + "epoch": 0.10568, + "grad_norm": 0.990139802584295, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 10568 + }, + { + "epoch": 0.10569, + "grad_norm": 0.9852662752867085, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 10569 + }, + { + "epoch": 0.1057, + "grad_norm": 0.9372987172274819, + "learning_rate": 0.003, + "loss": 4.084, + "step": 10570 + }, + { + "epoch": 0.10571, + "grad_norm": 0.988820397043408, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 10571 + }, + { + "epoch": 0.10572, + "grad_norm": 1.1678833347420248, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 10572 + }, + { + "epoch": 0.10573, + "grad_norm": 0.8517913634074189, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 10573 + }, + { + "epoch": 0.10574, + "grad_norm": 0.8659007094767834, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 10574 + }, + { + "epoch": 0.10575, + "grad_norm": 0.958183815886997, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 10575 + }, + { + "epoch": 0.10576, + "grad_norm": 0.9118557877430638, + "learning_rate": 0.003, + "loss": 4.1164, + "step": 10576 + }, + { + "epoch": 0.10577, + "grad_norm": 0.9153226156451957, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 10577 + }, + { + "epoch": 0.10578, + "grad_norm": 0.9874735654072453, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 10578 + }, + { + "epoch": 0.10579, + "grad_norm": 1.0415079742593492, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 10579 + }, + { + "epoch": 0.1058, + "grad_norm": 0.9153784660111088, + "learning_rate": 0.003, + "loss": 4.1133, + "step": 10580 + }, + { + "epoch": 0.10581, + "grad_norm": 0.9210262014080384, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 10581 + }, + { + "epoch": 0.10582, + "grad_norm": 0.9396705464198691, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 10582 + }, + { + "epoch": 0.10583, + "grad_norm": 0.9120879354056952, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10583 + }, + { + "epoch": 0.10584, + "grad_norm": 0.8070610834160225, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 10584 + }, + { + "epoch": 0.10585, + "grad_norm": 0.8983503783260497, + "learning_rate": 0.003, + "loss": 4.1378, + "step": 10585 + }, + { + "epoch": 0.10586, + "grad_norm": 0.942723580765965, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 10586 + }, + { + "epoch": 0.10587, + "grad_norm": 0.9654495812888432, + "learning_rate": 0.003, + "loss": 4.1261, + "step": 10587 + }, + { + "epoch": 0.10588, + "grad_norm": 1.0137540717282463, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10588 + }, + { + "epoch": 0.10589, + "grad_norm": 0.9211083933242845, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 10589 + }, + { + "epoch": 0.1059, + "grad_norm": 0.8886434692979224, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 10590 + }, + { + "epoch": 0.10591, + "grad_norm": 0.8414672791491735, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 10591 + }, + { + "epoch": 0.10592, + "grad_norm": 0.808437412569294, + "learning_rate": 0.003, + "loss": 4.1363, + "step": 10592 + }, + { + "epoch": 0.10593, + "grad_norm": 0.6746674356577256, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 10593 + }, + { + "epoch": 0.10594, + "grad_norm": 0.6222040276891654, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 10594 + }, + { + "epoch": 0.10595, + "grad_norm": 0.6313983056054177, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 10595 + }, + { + "epoch": 0.10596, + "grad_norm": 0.6792929100891301, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10596 + }, + { + "epoch": 0.10597, + "grad_norm": 0.6800719059268772, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 10597 + }, + { + "epoch": 0.10598, + "grad_norm": 0.770312479924072, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10598 + }, + { + "epoch": 0.10599, + "grad_norm": 0.9764320147050246, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 10599 + }, + { + "epoch": 0.106, + "grad_norm": 0.9114216015686166, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 10600 + }, + { + "epoch": 0.10601, + "grad_norm": 0.6555189332915664, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 10601 + }, + { + "epoch": 0.10602, + "grad_norm": 0.5040307133652577, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 10602 + }, + { + "epoch": 0.10603, + "grad_norm": 0.5690484358230466, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10603 + }, + { + "epoch": 0.10604, + "grad_norm": 0.5637735595856799, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 10604 + }, + { + "epoch": 0.10605, + "grad_norm": 0.48001727986025794, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 10605 + }, + { + "epoch": 0.10606, + "grad_norm": 0.4919296874299588, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 10606 + }, + { + "epoch": 0.10607, + "grad_norm": 0.4727741741475219, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 10607 + }, + { + "epoch": 0.10608, + "grad_norm": 0.5184442406668994, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 10608 + }, + { + "epoch": 0.10609, + "grad_norm": 0.5858630638238149, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 10609 + }, + { + "epoch": 0.1061, + "grad_norm": 0.6874353934764682, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 10610 + }, + { + "epoch": 0.10611, + "grad_norm": 0.8263741726587102, + "learning_rate": 0.003, + "loss": 4.041, + "step": 10611 + }, + { + "epoch": 0.10612, + "grad_norm": 0.8884315684622063, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 10612 + }, + { + "epoch": 0.10613, + "grad_norm": 0.8763453508937509, + "learning_rate": 0.003, + "loss": 4.069, + "step": 10613 + }, + { + "epoch": 0.10614, + "grad_norm": 0.8224298838859537, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10614 + }, + { + "epoch": 0.10615, + "grad_norm": 0.7191046159980217, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10615 + }, + { + "epoch": 0.10616, + "grad_norm": 0.7315824929912534, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 10616 + }, + { + "epoch": 0.10617, + "grad_norm": 0.916956377366291, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 10617 + }, + { + "epoch": 0.10618, + "grad_norm": 1.2629005890508869, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 10618 + }, + { + "epoch": 0.10619, + "grad_norm": 0.856578006061621, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 10619 + }, + { + "epoch": 0.1062, + "grad_norm": 0.7408131912141622, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 10620 + }, + { + "epoch": 0.10621, + "grad_norm": 0.6470727798627832, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 10621 + }, + { + "epoch": 0.10622, + "grad_norm": 0.5725004840385046, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 10622 + }, + { + "epoch": 0.10623, + "grad_norm": 0.553432799759805, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 10623 + }, + { + "epoch": 0.10624, + "grad_norm": 0.5403048416895829, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 10624 + }, + { + "epoch": 0.10625, + "grad_norm": 0.5104072292295226, + "learning_rate": 0.003, + "loss": 4.06, + "step": 10625 + }, + { + "epoch": 0.10626, + "grad_norm": 0.5823780352699905, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 10626 + }, + { + "epoch": 0.10627, + "grad_norm": 0.6962228994056245, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 10627 + }, + { + "epoch": 0.10628, + "grad_norm": 0.7935604816074494, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 10628 + }, + { + "epoch": 0.10629, + "grad_norm": 0.9695291887403658, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 10629 + }, + { + "epoch": 0.1063, + "grad_norm": 1.2894223340608182, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 10630 + }, + { + "epoch": 0.10631, + "grad_norm": 0.6621419698664075, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10631 + }, + { + "epoch": 0.10632, + "grad_norm": 0.8074981429308187, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 10632 + }, + { + "epoch": 0.10633, + "grad_norm": 0.9026339646908454, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 10633 + }, + { + "epoch": 0.10634, + "grad_norm": 0.8417206941624089, + "learning_rate": 0.003, + "loss": 4.078, + "step": 10634 + }, + { + "epoch": 0.10635, + "grad_norm": 0.8020780791146946, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 10635 + }, + { + "epoch": 0.10636, + "grad_norm": 0.8075945654041848, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 10636 + }, + { + "epoch": 0.10637, + "grad_norm": 0.7764114257102779, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 10637 + }, + { + "epoch": 0.10638, + "grad_norm": 0.8095214640783682, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 10638 + }, + { + "epoch": 0.10639, + "grad_norm": 0.8784677433844998, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 10639 + }, + { + "epoch": 0.1064, + "grad_norm": 0.8355273252342663, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 10640 + }, + { + "epoch": 0.10641, + "grad_norm": 0.9458422852748879, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 10641 + }, + { + "epoch": 0.10642, + "grad_norm": 1.068264830551457, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 10642 + }, + { + "epoch": 0.10643, + "grad_norm": 1.0523298329122281, + "learning_rate": 0.003, + "loss": 4.072, + "step": 10643 + }, + { + "epoch": 0.10644, + "grad_norm": 0.7659982628957405, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 10644 + }, + { + "epoch": 0.10645, + "grad_norm": 0.652728924657321, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 10645 + }, + { + "epoch": 0.10646, + "grad_norm": 0.6456066494246264, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 10646 + }, + { + "epoch": 0.10647, + "grad_norm": 0.7464907408983558, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 10647 + }, + { + "epoch": 0.10648, + "grad_norm": 0.7405212525064351, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10648 + }, + { + "epoch": 0.10649, + "grad_norm": 0.7478639631456179, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 10649 + }, + { + "epoch": 0.1065, + "grad_norm": 0.7550955604366715, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 10650 + }, + { + "epoch": 0.10651, + "grad_norm": 0.7110846298696153, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 10651 + }, + { + "epoch": 0.10652, + "grad_norm": 0.6402684922431795, + "learning_rate": 0.003, + "loss": 4.084, + "step": 10652 + }, + { + "epoch": 0.10653, + "grad_norm": 0.59791029327084, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 10653 + }, + { + "epoch": 0.10654, + "grad_norm": 0.6432662968161444, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 10654 + }, + { + "epoch": 0.10655, + "grad_norm": 0.6752567227234482, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 10655 + }, + { + "epoch": 0.10656, + "grad_norm": 0.6800244754630034, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 10656 + }, + { + "epoch": 0.10657, + "grad_norm": 0.5959513716804604, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 10657 + }, + { + "epoch": 0.10658, + "grad_norm": 0.6403859324045968, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 10658 + }, + { + "epoch": 0.10659, + "grad_norm": 0.8296924742157811, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 10659 + }, + { + "epoch": 0.1066, + "grad_norm": 1.0682115948141957, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 10660 + }, + { + "epoch": 0.10661, + "grad_norm": 1.1194993070021568, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 10661 + }, + { + "epoch": 0.10662, + "grad_norm": 0.9275062203048765, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 10662 + }, + { + "epoch": 0.10663, + "grad_norm": 0.8557195585149507, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 10663 + }, + { + "epoch": 0.10664, + "grad_norm": 0.7332891166349356, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 10664 + }, + { + "epoch": 0.10665, + "grad_norm": 0.7737620390856565, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 10665 + }, + { + "epoch": 0.10666, + "grad_norm": 0.8490683224003708, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 10666 + }, + { + "epoch": 0.10667, + "grad_norm": 0.8834112771739413, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 10667 + }, + { + "epoch": 0.10668, + "grad_norm": 0.9844633934429818, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 10668 + }, + { + "epoch": 0.10669, + "grad_norm": 0.932517734016524, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 10669 + }, + { + "epoch": 0.1067, + "grad_norm": 0.83578599313614, + "learning_rate": 0.003, + "loss": 4.073, + "step": 10670 + }, + { + "epoch": 0.10671, + "grad_norm": 0.9069085658024532, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 10671 + }, + { + "epoch": 0.10672, + "grad_norm": 0.9945862570463592, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 10672 + }, + { + "epoch": 0.10673, + "grad_norm": 1.09965107012104, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 10673 + }, + { + "epoch": 0.10674, + "grad_norm": 0.8268182899214279, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 10674 + }, + { + "epoch": 0.10675, + "grad_norm": 0.7034549781067129, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 10675 + }, + { + "epoch": 0.10676, + "grad_norm": 0.7890775923714155, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 10676 + }, + { + "epoch": 0.10677, + "grad_norm": 0.7900757896942041, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 10677 + }, + { + "epoch": 0.10678, + "grad_norm": 0.734976551432324, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10678 + }, + { + "epoch": 0.10679, + "grad_norm": 0.7314744669978124, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 10679 + }, + { + "epoch": 0.1068, + "grad_norm": 0.6908817611613471, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 10680 + }, + { + "epoch": 0.10681, + "grad_norm": 0.6399548375986829, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 10681 + }, + { + "epoch": 0.10682, + "grad_norm": 0.7053930398751589, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 10682 + }, + { + "epoch": 0.10683, + "grad_norm": 0.8259488173340027, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 10683 + }, + { + "epoch": 0.10684, + "grad_norm": 0.8721003907230183, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 10684 + }, + { + "epoch": 0.10685, + "grad_norm": 0.8470217654691111, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 10685 + }, + { + "epoch": 0.10686, + "grad_norm": 0.90992443878563, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 10686 + }, + { + "epoch": 0.10687, + "grad_norm": 1.024100970580944, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 10687 + }, + { + "epoch": 0.10688, + "grad_norm": 1.0637806320585521, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 10688 + }, + { + "epoch": 0.10689, + "grad_norm": 1.1156946627292235, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10689 + }, + { + "epoch": 0.1069, + "grad_norm": 0.8046943265183353, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10690 + }, + { + "epoch": 0.10691, + "grad_norm": 0.7100711170524753, + "learning_rate": 0.003, + "loss": 4.1216, + "step": 10691 + }, + { + "epoch": 0.10692, + "grad_norm": 0.6574317863899862, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 10692 + }, + { + "epoch": 0.10693, + "grad_norm": 0.6251555571305705, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 10693 + }, + { + "epoch": 0.10694, + "grad_norm": 0.679376390140605, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 10694 + }, + { + "epoch": 0.10695, + "grad_norm": 0.8280371560374931, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 10695 + }, + { + "epoch": 0.10696, + "grad_norm": 0.9932001330147407, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 10696 + }, + { + "epoch": 0.10697, + "grad_norm": 1.1062910064134184, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 10697 + }, + { + "epoch": 0.10698, + "grad_norm": 0.7722807520515808, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 10698 + }, + { + "epoch": 0.10699, + "grad_norm": 0.7081182958148078, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 10699 + }, + { + "epoch": 0.107, + "grad_norm": 0.7166968485485714, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 10700 + }, + { + "epoch": 0.10701, + "grad_norm": 0.7537508686418137, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 10701 + }, + { + "epoch": 0.10702, + "grad_norm": 0.8835708055800392, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10702 + }, + { + "epoch": 0.10703, + "grad_norm": 0.7817691460438141, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 10703 + }, + { + "epoch": 0.10704, + "grad_norm": 0.645477228585237, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 10704 + }, + { + "epoch": 0.10705, + "grad_norm": 0.7710367164781207, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 10705 + }, + { + "epoch": 0.10706, + "grad_norm": 0.7901517958326595, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 10706 + }, + { + "epoch": 0.10707, + "grad_norm": 0.7891516356491817, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 10707 + }, + { + "epoch": 0.10708, + "grad_norm": 0.8615348206451179, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 10708 + }, + { + "epoch": 0.10709, + "grad_norm": 0.9888041530740351, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 10709 + }, + { + "epoch": 0.1071, + "grad_norm": 1.061660380792365, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 10710 + }, + { + "epoch": 0.10711, + "grad_norm": 0.9066601556149967, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 10711 + }, + { + "epoch": 0.10712, + "grad_norm": 0.8116455998652893, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 10712 + }, + { + "epoch": 0.10713, + "grad_norm": 0.8637113563116897, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 10713 + }, + { + "epoch": 0.10714, + "grad_norm": 0.7022220024601558, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 10714 + }, + { + "epoch": 0.10715, + "grad_norm": 0.6293443796442022, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 10715 + }, + { + "epoch": 0.10716, + "grad_norm": 0.6779094422266427, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 10716 + }, + { + "epoch": 0.10717, + "grad_norm": 0.7211066849511437, + "learning_rate": 0.003, + "loss": 4.089, + "step": 10717 + }, + { + "epoch": 0.10718, + "grad_norm": 0.6577616772264899, + "learning_rate": 0.003, + "loss": 4.048, + "step": 10718 + }, + { + "epoch": 0.10719, + "grad_norm": 0.6728898224028206, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 10719 + }, + { + "epoch": 0.1072, + "grad_norm": 0.6504562644268949, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 10720 + }, + { + "epoch": 0.10721, + "grad_norm": 0.5906580401216541, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10721 + }, + { + "epoch": 0.10722, + "grad_norm": 0.6651104603087701, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 10722 + }, + { + "epoch": 0.10723, + "grad_norm": 0.8274844100078965, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 10723 + }, + { + "epoch": 0.10724, + "grad_norm": 1.107508585414254, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 10724 + }, + { + "epoch": 0.10725, + "grad_norm": 0.9770130165439336, + "learning_rate": 0.003, + "loss": 4.055, + "step": 10725 + }, + { + "epoch": 0.10726, + "grad_norm": 0.8609099367579911, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 10726 + }, + { + "epoch": 0.10727, + "grad_norm": 0.729890468071586, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 10727 + }, + { + "epoch": 0.10728, + "grad_norm": 0.6948485160734639, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 10728 + }, + { + "epoch": 0.10729, + "grad_norm": 0.6678861533303945, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 10729 + }, + { + "epoch": 0.1073, + "grad_norm": 0.7248961043248474, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 10730 + }, + { + "epoch": 0.10731, + "grad_norm": 0.7071667241504125, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 10731 + }, + { + "epoch": 0.10732, + "grad_norm": 0.7205983714963741, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 10732 + }, + { + "epoch": 0.10733, + "grad_norm": 0.688369875669321, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 10733 + }, + { + "epoch": 0.10734, + "grad_norm": 0.6741612928630552, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 10734 + }, + { + "epoch": 0.10735, + "grad_norm": 0.7199477700388638, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10735 + }, + { + "epoch": 0.10736, + "grad_norm": 0.7876731105567968, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 10736 + }, + { + "epoch": 0.10737, + "grad_norm": 0.8324327281824561, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 10737 + }, + { + "epoch": 0.10738, + "grad_norm": 0.8709320081981367, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 10738 + }, + { + "epoch": 0.10739, + "grad_norm": 0.9562955220392207, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 10739 + }, + { + "epoch": 0.1074, + "grad_norm": 1.2304307954831826, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 10740 + }, + { + "epoch": 0.10741, + "grad_norm": 0.9300435353142966, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 10741 + }, + { + "epoch": 0.10742, + "grad_norm": 0.9335382984724416, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 10742 + }, + { + "epoch": 0.10743, + "grad_norm": 0.9877653745311188, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 10743 + }, + { + "epoch": 0.10744, + "grad_norm": 1.0405104571146349, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10744 + }, + { + "epoch": 0.10745, + "grad_norm": 0.8419311900278269, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 10745 + }, + { + "epoch": 0.10746, + "grad_norm": 0.8478057480297286, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 10746 + }, + { + "epoch": 0.10747, + "grad_norm": 0.9138973791244602, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 10747 + }, + { + "epoch": 0.10748, + "grad_norm": 0.9349681802837756, + "learning_rate": 0.003, + "loss": 4.1211, + "step": 10748 + }, + { + "epoch": 0.10749, + "grad_norm": 0.950015954392382, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 10749 + }, + { + "epoch": 0.1075, + "grad_norm": 0.8943074737592488, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 10750 + }, + { + "epoch": 0.10751, + "grad_norm": 0.8401604688025964, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 10751 + }, + { + "epoch": 0.10752, + "grad_norm": 0.8907490401952103, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 10752 + }, + { + "epoch": 0.10753, + "grad_norm": 0.8019963256041405, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 10753 + }, + { + "epoch": 0.10754, + "grad_norm": 0.7334278789249893, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 10754 + }, + { + "epoch": 0.10755, + "grad_norm": 0.7807073635179747, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 10755 + }, + { + "epoch": 0.10756, + "grad_norm": 0.9037897305755073, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 10756 + }, + { + "epoch": 0.10757, + "grad_norm": 1.0038789915507889, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 10757 + }, + { + "epoch": 0.10758, + "grad_norm": 0.8637668717135266, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10758 + }, + { + "epoch": 0.10759, + "grad_norm": 0.7784709868978823, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 10759 + }, + { + "epoch": 0.1076, + "grad_norm": 0.7513385059899371, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 10760 + }, + { + "epoch": 0.10761, + "grad_norm": 0.658145040039802, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 10761 + }, + { + "epoch": 0.10762, + "grad_norm": 0.6319284675105509, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 10762 + }, + { + "epoch": 0.10763, + "grad_norm": 0.6484314688691583, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 10763 + }, + { + "epoch": 0.10764, + "grad_norm": 0.6447554772230162, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 10764 + }, + { + "epoch": 0.10765, + "grad_norm": 0.6947856450689127, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 10765 + }, + { + "epoch": 0.10766, + "grad_norm": 0.8271087875534372, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 10766 + }, + { + "epoch": 0.10767, + "grad_norm": 0.9248495761280467, + "learning_rate": 0.003, + "loss": 4.083, + "step": 10767 + }, + { + "epoch": 0.10768, + "grad_norm": 0.906365044623054, + "learning_rate": 0.003, + "loss": 4.079, + "step": 10768 + }, + { + "epoch": 0.10769, + "grad_norm": 0.9421556590622695, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10769 + }, + { + "epoch": 0.1077, + "grad_norm": 0.9830274427766104, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 10770 + }, + { + "epoch": 0.10771, + "grad_norm": 0.7459901139183797, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 10771 + }, + { + "epoch": 0.10772, + "grad_norm": 0.8029037552458691, + "learning_rate": 0.003, + "loss": 4.08, + "step": 10772 + }, + { + "epoch": 0.10773, + "grad_norm": 0.8413257874108826, + "learning_rate": 0.003, + "loss": 4.094, + "step": 10773 + }, + { + "epoch": 0.10774, + "grad_norm": 0.8939967598650286, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 10774 + }, + { + "epoch": 0.10775, + "grad_norm": 0.9500930304152452, + "learning_rate": 0.003, + "loss": 4.1063, + "step": 10775 + }, + { + "epoch": 0.10776, + "grad_norm": 0.9180361337301888, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 10776 + }, + { + "epoch": 0.10777, + "grad_norm": 0.8739106054853617, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 10777 + }, + { + "epoch": 0.10778, + "grad_norm": 0.7861751893427512, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 10778 + }, + { + "epoch": 0.10779, + "grad_norm": 0.6801288616685922, + "learning_rate": 0.003, + "loss": 4.058, + "step": 10779 + }, + { + "epoch": 0.1078, + "grad_norm": 0.6595454326458605, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 10780 + }, + { + "epoch": 0.10781, + "grad_norm": 0.677694591198545, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 10781 + }, + { + "epoch": 0.10782, + "grad_norm": 0.7322536223576103, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 10782 + }, + { + "epoch": 0.10783, + "grad_norm": 0.6949774146294931, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 10783 + }, + { + "epoch": 0.10784, + "grad_norm": 0.7112783123302691, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 10784 + }, + { + "epoch": 0.10785, + "grad_norm": 0.7624840584818994, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 10785 + }, + { + "epoch": 0.10786, + "grad_norm": 0.8307593955140588, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 10786 + }, + { + "epoch": 0.10787, + "grad_norm": 0.8234800415780226, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 10787 + }, + { + "epoch": 0.10788, + "grad_norm": 0.7063471007386206, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 10788 + }, + { + "epoch": 0.10789, + "grad_norm": 0.7003812666554212, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 10789 + }, + { + "epoch": 0.1079, + "grad_norm": 0.591255796389519, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 10790 + }, + { + "epoch": 0.10791, + "grad_norm": 0.6588407077575648, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 10791 + }, + { + "epoch": 0.10792, + "grad_norm": 0.7812948240216615, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 10792 + }, + { + "epoch": 0.10793, + "grad_norm": 0.875484648604842, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 10793 + }, + { + "epoch": 0.10794, + "grad_norm": 1.0819813451984712, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 10794 + }, + { + "epoch": 0.10795, + "grad_norm": 1.0106647386412784, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 10795 + }, + { + "epoch": 0.10796, + "grad_norm": 0.8727394703935979, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 10796 + }, + { + "epoch": 0.10797, + "grad_norm": 0.7493976900488687, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 10797 + }, + { + "epoch": 0.10798, + "grad_norm": 0.6778670897020321, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 10798 + }, + { + "epoch": 0.10799, + "grad_norm": 0.6886630260315241, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 10799 + }, + { + "epoch": 0.108, + "grad_norm": 0.6997924479763118, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 10800 + }, + { + "epoch": 0.10801, + "grad_norm": 0.6335312641192151, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 10801 + }, + { + "epoch": 0.10802, + "grad_norm": 0.5596975143808999, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 10802 + }, + { + "epoch": 0.10803, + "grad_norm": 0.5883654642439371, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 10803 + }, + { + "epoch": 0.10804, + "grad_norm": 0.6353797720617529, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 10804 + }, + { + "epoch": 0.10805, + "grad_norm": 0.766636901293996, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 10805 + }, + { + "epoch": 0.10806, + "grad_norm": 0.9243626908434707, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 10806 + }, + { + "epoch": 0.10807, + "grad_norm": 0.9974198487878154, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 10807 + }, + { + "epoch": 0.10808, + "grad_norm": 0.906105158565713, + "learning_rate": 0.003, + "loss": 4.1665, + "step": 10808 + }, + { + "epoch": 0.10809, + "grad_norm": 0.9969514627729021, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10809 + }, + { + "epoch": 0.1081, + "grad_norm": 0.834425050345777, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 10810 + }, + { + "epoch": 0.10811, + "grad_norm": 0.8783993703886297, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 10811 + }, + { + "epoch": 0.10812, + "grad_norm": 1.0486220426031836, + "learning_rate": 0.003, + "loss": 4.069, + "step": 10812 + }, + { + "epoch": 0.10813, + "grad_norm": 0.9635854562205466, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10813 + }, + { + "epoch": 0.10814, + "grad_norm": 0.991913843257712, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 10814 + }, + { + "epoch": 0.10815, + "grad_norm": 1.03240484345877, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 10815 + }, + { + "epoch": 0.10816, + "grad_norm": 0.7999243463833863, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 10816 + }, + { + "epoch": 0.10817, + "grad_norm": 0.7146118966430277, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 10817 + }, + { + "epoch": 0.10818, + "grad_norm": 0.6760564583671121, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10818 + }, + { + "epoch": 0.10819, + "grad_norm": 0.7437787761517194, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 10819 + }, + { + "epoch": 0.1082, + "grad_norm": 0.7667318805568926, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 10820 + }, + { + "epoch": 0.10821, + "grad_norm": 0.7578063636892496, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 10821 + }, + { + "epoch": 0.10822, + "grad_norm": 0.9722299248990864, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 10822 + }, + { + "epoch": 0.10823, + "grad_norm": 1.084422938439403, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 10823 + }, + { + "epoch": 0.10824, + "grad_norm": 0.7866493459633321, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 10824 + }, + { + "epoch": 0.10825, + "grad_norm": 0.6928064188763265, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 10825 + }, + { + "epoch": 0.10826, + "grad_norm": 0.7316035379733721, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 10826 + }, + { + "epoch": 0.10827, + "grad_norm": 0.7872692879605871, + "learning_rate": 0.003, + "loss": 4.1159, + "step": 10827 + }, + { + "epoch": 0.10828, + "grad_norm": 0.6915375778555326, + "learning_rate": 0.003, + "loss": 4.1201, + "step": 10828 + }, + { + "epoch": 0.10829, + "grad_norm": 0.748471989224979, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 10829 + }, + { + "epoch": 0.1083, + "grad_norm": 0.763563153780263, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 10830 + }, + { + "epoch": 0.10831, + "grad_norm": 0.7557051035736269, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 10831 + }, + { + "epoch": 0.10832, + "grad_norm": 0.8081559760247755, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 10832 + }, + { + "epoch": 0.10833, + "grad_norm": 0.7916475572460021, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 10833 + }, + { + "epoch": 0.10834, + "grad_norm": 0.7180126556558653, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 10834 + }, + { + "epoch": 0.10835, + "grad_norm": 0.6897522880798621, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 10835 + }, + { + "epoch": 0.10836, + "grad_norm": 0.720390447907444, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10836 + }, + { + "epoch": 0.10837, + "grad_norm": 0.7729119904062143, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 10837 + }, + { + "epoch": 0.10838, + "grad_norm": 0.974189110491675, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 10838 + }, + { + "epoch": 0.10839, + "grad_norm": 1.1146789421849448, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 10839 + }, + { + "epoch": 0.1084, + "grad_norm": 0.7863144186085149, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 10840 + }, + { + "epoch": 0.10841, + "grad_norm": 0.6344950133109905, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 10841 + }, + { + "epoch": 0.10842, + "grad_norm": 0.6253130642537409, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 10842 + }, + { + "epoch": 0.10843, + "grad_norm": 0.642829186840075, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 10843 + }, + { + "epoch": 0.10844, + "grad_norm": 0.7172865995614057, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 10844 + }, + { + "epoch": 0.10845, + "grad_norm": 0.8387230458337437, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 10845 + }, + { + "epoch": 0.10846, + "grad_norm": 0.8272335404876863, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 10846 + }, + { + "epoch": 0.10847, + "grad_norm": 0.8339723408210796, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 10847 + }, + { + "epoch": 0.10848, + "grad_norm": 0.6960991280713948, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 10848 + }, + { + "epoch": 0.10849, + "grad_norm": 0.621407103825329, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 10849 + }, + { + "epoch": 0.1085, + "grad_norm": 0.6226763190146141, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 10850 + }, + { + "epoch": 0.10851, + "grad_norm": 0.7346253480020578, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 10851 + }, + { + "epoch": 0.10852, + "grad_norm": 0.8514943147557615, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 10852 + }, + { + "epoch": 0.10853, + "grad_norm": 0.9517437965287522, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 10853 + }, + { + "epoch": 0.10854, + "grad_norm": 0.9508741169410276, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 10854 + }, + { + "epoch": 0.10855, + "grad_norm": 0.7768027675885154, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 10855 + }, + { + "epoch": 0.10856, + "grad_norm": 0.7413209828275256, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 10856 + }, + { + "epoch": 0.10857, + "grad_norm": 0.7185402434742164, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 10857 + }, + { + "epoch": 0.10858, + "grad_norm": 0.6826430053195711, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 10858 + }, + { + "epoch": 0.10859, + "grad_norm": 0.761533780013182, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 10859 + }, + { + "epoch": 0.1086, + "grad_norm": 0.7642804260240755, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 10860 + }, + { + "epoch": 0.10861, + "grad_norm": 0.825548626289074, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 10861 + }, + { + "epoch": 0.10862, + "grad_norm": 0.9037697230091956, + "learning_rate": 0.003, + "loss": 4.1286, + "step": 10862 + }, + { + "epoch": 0.10863, + "grad_norm": 0.8657754903618303, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 10863 + }, + { + "epoch": 0.10864, + "grad_norm": 0.8141322733115343, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 10864 + }, + { + "epoch": 0.10865, + "grad_norm": 0.7253469991497675, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 10865 + }, + { + "epoch": 0.10866, + "grad_norm": 0.7117167359362205, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 10866 + }, + { + "epoch": 0.10867, + "grad_norm": 0.7882818390575478, + "learning_rate": 0.003, + "loss": 4.057, + "step": 10867 + }, + { + "epoch": 0.10868, + "grad_norm": 0.9220190688619744, + "learning_rate": 0.003, + "loss": 4.058, + "step": 10868 + }, + { + "epoch": 0.10869, + "grad_norm": 1.2171062146891194, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 10869 + }, + { + "epoch": 0.1087, + "grad_norm": 0.9409103742766868, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 10870 + }, + { + "epoch": 0.10871, + "grad_norm": 0.7857575782459725, + "learning_rate": 0.003, + "loss": 4.067, + "step": 10871 + }, + { + "epoch": 0.10872, + "grad_norm": 0.7008071897173932, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 10872 + }, + { + "epoch": 0.10873, + "grad_norm": 0.6493494671071487, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 10873 + }, + { + "epoch": 0.10874, + "grad_norm": 0.7422045080755252, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 10874 + }, + { + "epoch": 0.10875, + "grad_norm": 0.7813850020221002, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 10875 + }, + { + "epoch": 0.10876, + "grad_norm": 0.7337134215205611, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 10876 + }, + { + "epoch": 0.10877, + "grad_norm": 0.5738251794128827, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 10877 + }, + { + "epoch": 0.10878, + "grad_norm": 0.5971880485148056, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 10878 + }, + { + "epoch": 0.10879, + "grad_norm": 0.6407269342613244, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 10879 + }, + { + "epoch": 0.1088, + "grad_norm": 0.7567936302504148, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 10880 + }, + { + "epoch": 0.10881, + "grad_norm": 0.8952438404265414, + "learning_rate": 0.003, + "loss": 4.053, + "step": 10881 + }, + { + "epoch": 0.10882, + "grad_norm": 1.016869262938487, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 10882 + }, + { + "epoch": 0.10883, + "grad_norm": 0.9089472739873565, + "learning_rate": 0.003, + "loss": 4.097, + "step": 10883 + }, + { + "epoch": 0.10884, + "grad_norm": 0.8192132716200892, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 10884 + }, + { + "epoch": 0.10885, + "grad_norm": 0.754863660292805, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 10885 + }, + { + "epoch": 0.10886, + "grad_norm": 0.7646838940972092, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 10886 + }, + { + "epoch": 0.10887, + "grad_norm": 0.8054792718032056, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 10887 + }, + { + "epoch": 0.10888, + "grad_norm": 0.807217157369814, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 10888 + }, + { + "epoch": 0.10889, + "grad_norm": 0.9596711545065643, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 10889 + }, + { + "epoch": 0.1089, + "grad_norm": 1.0281488211494914, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 10890 + }, + { + "epoch": 0.10891, + "grad_norm": 1.008425907314204, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 10891 + }, + { + "epoch": 0.10892, + "grad_norm": 0.9881878897319983, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 10892 + }, + { + "epoch": 0.10893, + "grad_norm": 0.9651702578236134, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 10893 + }, + { + "epoch": 0.10894, + "grad_norm": 0.91488242972445, + "learning_rate": 0.003, + "loss": 4.082, + "step": 10894 + }, + { + "epoch": 0.10895, + "grad_norm": 0.820190985605287, + "learning_rate": 0.003, + "loss": 4.1148, + "step": 10895 + }, + { + "epoch": 0.10896, + "grad_norm": 0.8047362457919742, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 10896 + }, + { + "epoch": 0.10897, + "grad_norm": 0.8272579145202782, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 10897 + }, + { + "epoch": 0.10898, + "grad_norm": 0.8529516426412708, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 10898 + }, + { + "epoch": 0.10899, + "grad_norm": 0.792496666395517, + "learning_rate": 0.003, + "loss": 4.079, + "step": 10899 + }, + { + "epoch": 0.109, + "grad_norm": 0.938657528442336, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 10900 + }, + { + "epoch": 0.10901, + "grad_norm": 1.2363502284270567, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 10901 + }, + { + "epoch": 0.10902, + "grad_norm": 0.7903678185797423, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 10902 + }, + { + "epoch": 0.10903, + "grad_norm": 0.6789417264937385, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 10903 + }, + { + "epoch": 0.10904, + "grad_norm": 0.7466995321912779, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 10904 + }, + { + "epoch": 0.10905, + "grad_norm": 0.8446333893055047, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 10905 + }, + { + "epoch": 0.10906, + "grad_norm": 0.9434146945884782, + "learning_rate": 0.003, + "loss": 4.113, + "step": 10906 + }, + { + "epoch": 0.10907, + "grad_norm": 0.9448346223448993, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 10907 + }, + { + "epoch": 0.10908, + "grad_norm": 0.9492929430380965, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 10908 + }, + { + "epoch": 0.10909, + "grad_norm": 0.9638647144203306, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 10909 + }, + { + "epoch": 0.1091, + "grad_norm": 0.8114844498739628, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 10910 + }, + { + "epoch": 0.10911, + "grad_norm": 0.7861873196944332, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 10911 + }, + { + "epoch": 0.10912, + "grad_norm": 0.6992439758399313, + "learning_rate": 0.003, + "loss": 4.083, + "step": 10912 + }, + { + "epoch": 0.10913, + "grad_norm": 0.6815651781157617, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 10913 + }, + { + "epoch": 0.10914, + "grad_norm": 0.6536113835814653, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 10914 + }, + { + "epoch": 0.10915, + "grad_norm": 0.6938082032494179, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 10915 + }, + { + "epoch": 0.10916, + "grad_norm": 0.7202674599350765, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 10916 + }, + { + "epoch": 0.10917, + "grad_norm": 0.7491490948689121, + "learning_rate": 0.003, + "loss": 4.093, + "step": 10917 + }, + { + "epoch": 0.10918, + "grad_norm": 1.11036055240353, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 10918 + }, + { + "epoch": 0.10919, + "grad_norm": 1.1337221606450771, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 10919 + }, + { + "epoch": 0.1092, + "grad_norm": 0.8428547311068283, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 10920 + }, + { + "epoch": 0.10921, + "grad_norm": 0.6082082658833271, + "learning_rate": 0.003, + "loss": 4.091, + "step": 10921 + }, + { + "epoch": 0.10922, + "grad_norm": 0.6315066804935527, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10922 + }, + { + "epoch": 0.10923, + "grad_norm": 0.6061406163799906, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 10923 + }, + { + "epoch": 0.10924, + "grad_norm": 0.6369014987608396, + "learning_rate": 0.003, + "loss": 4.052, + "step": 10924 + }, + { + "epoch": 0.10925, + "grad_norm": 0.6509900001681895, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 10925 + }, + { + "epoch": 0.10926, + "grad_norm": 0.5911101588927634, + "learning_rate": 0.003, + "loss": 4.037, + "step": 10926 + }, + { + "epoch": 0.10927, + "grad_norm": 0.6090199769351918, + "learning_rate": 0.003, + "loss": 4.081, + "step": 10927 + }, + { + "epoch": 0.10928, + "grad_norm": 0.7583916555674786, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 10928 + }, + { + "epoch": 0.10929, + "grad_norm": 0.9306843772780642, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 10929 + }, + { + "epoch": 0.1093, + "grad_norm": 1.2362967491595924, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 10930 + }, + { + "epoch": 0.10931, + "grad_norm": 0.6502246716231995, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 10931 + }, + { + "epoch": 0.10932, + "grad_norm": 0.5470588873212986, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 10932 + }, + { + "epoch": 0.10933, + "grad_norm": 0.6038156560331971, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 10933 + }, + { + "epoch": 0.10934, + "grad_norm": 0.631710911079846, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 10934 + }, + { + "epoch": 0.10935, + "grad_norm": 0.6072788054734551, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10935 + }, + { + "epoch": 0.10936, + "grad_norm": 0.5548572160813884, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 10936 + }, + { + "epoch": 0.10937, + "grad_norm": 0.5678083242728192, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 10937 + }, + { + "epoch": 0.10938, + "grad_norm": 0.5349364110178259, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 10938 + }, + { + "epoch": 0.10939, + "grad_norm": 0.5314674417062363, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 10939 + }, + { + "epoch": 0.1094, + "grad_norm": 0.5567007468861874, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 10940 + }, + { + "epoch": 0.10941, + "grad_norm": 0.6095491373372913, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 10941 + }, + { + "epoch": 0.10942, + "grad_norm": 0.7298043409485729, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 10942 + }, + { + "epoch": 0.10943, + "grad_norm": 0.786513510107108, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 10943 + }, + { + "epoch": 0.10944, + "grad_norm": 0.7774007804681506, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 10944 + }, + { + "epoch": 0.10945, + "grad_norm": 0.7495722469210125, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 10945 + }, + { + "epoch": 0.10946, + "grad_norm": 0.8064706771959848, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 10946 + }, + { + "epoch": 0.10947, + "grad_norm": 0.9393433262565961, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 10947 + }, + { + "epoch": 0.10948, + "grad_norm": 0.9387790046571931, + "learning_rate": 0.003, + "loss": 4.074, + "step": 10948 + }, + { + "epoch": 0.10949, + "grad_norm": 1.1410032379592778, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 10949 + }, + { + "epoch": 0.1095, + "grad_norm": 0.9129552961385187, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 10950 + }, + { + "epoch": 0.10951, + "grad_norm": 1.0435796410645963, + "learning_rate": 0.003, + "loss": 4.075, + "step": 10951 + }, + { + "epoch": 0.10952, + "grad_norm": 1.3358604841725472, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 10952 + }, + { + "epoch": 0.10953, + "grad_norm": 0.8310108373581879, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 10953 + }, + { + "epoch": 0.10954, + "grad_norm": 0.6723858624058059, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 10954 + }, + { + "epoch": 0.10955, + "grad_norm": 0.6611226964259507, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 10955 + }, + { + "epoch": 0.10956, + "grad_norm": 0.7011303501399112, + "learning_rate": 0.003, + "loss": 4.1017, + "step": 10956 + }, + { + "epoch": 0.10957, + "grad_norm": 0.6836216086535878, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 10957 + }, + { + "epoch": 0.10958, + "grad_norm": 0.7592475674795014, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 10958 + }, + { + "epoch": 0.10959, + "grad_norm": 0.7522399573457461, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 10959 + }, + { + "epoch": 0.1096, + "grad_norm": 0.7583206213419711, + "learning_rate": 0.003, + "loss": 4.058, + "step": 10960 + }, + { + "epoch": 0.10961, + "grad_norm": 0.7948798372620559, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 10961 + }, + { + "epoch": 0.10962, + "grad_norm": 0.9520999820511873, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 10962 + }, + { + "epoch": 0.10963, + "grad_norm": 0.9934263182752433, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 10963 + }, + { + "epoch": 0.10964, + "grad_norm": 1.0404312099324224, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 10964 + }, + { + "epoch": 0.10965, + "grad_norm": 0.9314463760092233, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 10965 + }, + { + "epoch": 0.10966, + "grad_norm": 0.8350231443250099, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 10966 + }, + { + "epoch": 0.10967, + "grad_norm": 0.9320300124939774, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 10967 + }, + { + "epoch": 0.10968, + "grad_norm": 0.9215579233627603, + "learning_rate": 0.003, + "loss": 4.1139, + "step": 10968 + }, + { + "epoch": 0.10969, + "grad_norm": 1.0604243559948778, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 10969 + }, + { + "epoch": 0.1097, + "grad_norm": 1.1138978634435108, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 10970 + }, + { + "epoch": 0.10971, + "grad_norm": 0.8951876392104671, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 10971 + }, + { + "epoch": 0.10972, + "grad_norm": 0.9676653328126739, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 10972 + }, + { + "epoch": 0.10973, + "grad_norm": 1.0352834124023993, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 10973 + }, + { + "epoch": 0.10974, + "grad_norm": 1.0675730058873871, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 10974 + }, + { + "epoch": 0.10975, + "grad_norm": 0.9740004559762198, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 10975 + }, + { + "epoch": 0.10976, + "grad_norm": 1.0447644987824045, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 10976 + }, + { + "epoch": 0.10977, + "grad_norm": 1.0200289761418742, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 10977 + }, + { + "epoch": 0.10978, + "grad_norm": 0.9365717545240859, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 10978 + }, + { + "epoch": 0.10979, + "grad_norm": 0.8362679752586221, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 10979 + }, + { + "epoch": 0.1098, + "grad_norm": 0.7172618437426157, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 10980 + }, + { + "epoch": 0.10981, + "grad_norm": 0.6864193346218213, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 10981 + }, + { + "epoch": 0.10982, + "grad_norm": 0.5895896832601729, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 10982 + }, + { + "epoch": 0.10983, + "grad_norm": 0.5787111289779593, + "learning_rate": 0.003, + "loss": 4.064, + "step": 10983 + }, + { + "epoch": 0.10984, + "grad_norm": 0.6057523165429262, + "learning_rate": 0.003, + "loss": 4.106, + "step": 10984 + }, + { + "epoch": 0.10985, + "grad_norm": 0.6514886745623131, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 10985 + }, + { + "epoch": 0.10986, + "grad_norm": 0.7435079048577373, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 10986 + }, + { + "epoch": 0.10987, + "grad_norm": 0.9200597665780822, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 10987 + }, + { + "epoch": 0.10988, + "grad_norm": 1.1265238934764898, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 10988 + }, + { + "epoch": 0.10989, + "grad_norm": 0.88644921715438, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 10989 + }, + { + "epoch": 0.1099, + "grad_norm": 0.8325133772180551, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 10990 + }, + { + "epoch": 0.10991, + "grad_norm": 0.8420468014984893, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 10991 + }, + { + "epoch": 0.10992, + "grad_norm": 0.8537316831757594, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 10992 + }, + { + "epoch": 0.10993, + "grad_norm": 0.7940387723272442, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 10993 + }, + { + "epoch": 0.10994, + "grad_norm": 0.6710457088949344, + "learning_rate": 0.003, + "loss": 4.1213, + "step": 10994 + }, + { + "epoch": 0.10995, + "grad_norm": 0.6142598287538222, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 10995 + }, + { + "epoch": 0.10996, + "grad_norm": 0.6660630848750299, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 10996 + }, + { + "epoch": 0.10997, + "grad_norm": 0.7899825737081104, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 10997 + }, + { + "epoch": 0.10998, + "grad_norm": 0.8923751751449065, + "learning_rate": 0.003, + "loss": 4.12, + "step": 10998 + }, + { + "epoch": 0.10999, + "grad_norm": 0.9402386393787387, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 10999 + }, + { + "epoch": 0.11, + "grad_norm": 0.9050561963603275, + "learning_rate": 0.003, + "loss": 4.1182, + "step": 11000 + }, + { + "epoch": 0.11001, + "grad_norm": 0.885324393665634, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 11001 + }, + { + "epoch": 0.11002, + "grad_norm": 0.9434080532034825, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 11002 + }, + { + "epoch": 0.11003, + "grad_norm": 1.010969350024946, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 11003 + }, + { + "epoch": 0.11004, + "grad_norm": 0.954640988869078, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 11004 + }, + { + "epoch": 0.11005, + "grad_norm": 0.8240164005546453, + "learning_rate": 0.003, + "loss": 4.064, + "step": 11005 + }, + { + "epoch": 0.11006, + "grad_norm": 0.7594825840470841, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 11006 + }, + { + "epoch": 0.11007, + "grad_norm": 0.7590475204659565, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 11007 + }, + { + "epoch": 0.11008, + "grad_norm": 0.7958304222714225, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 11008 + }, + { + "epoch": 0.11009, + "grad_norm": 0.8287995764159665, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 11009 + }, + { + "epoch": 0.1101, + "grad_norm": 0.9174224956026463, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 11010 + }, + { + "epoch": 0.11011, + "grad_norm": 0.9810955939848184, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 11011 + }, + { + "epoch": 0.11012, + "grad_norm": 0.9588660180330635, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11012 + }, + { + "epoch": 0.11013, + "grad_norm": 0.818062590515428, + "learning_rate": 0.003, + "loss": 4.096, + "step": 11013 + }, + { + "epoch": 0.11014, + "grad_norm": 0.6982672473244061, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 11014 + }, + { + "epoch": 0.11015, + "grad_norm": 0.7514408359654094, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 11015 + }, + { + "epoch": 0.11016, + "grad_norm": 0.7207043759222721, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 11016 + }, + { + "epoch": 0.11017, + "grad_norm": 0.6165799587110872, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 11017 + }, + { + "epoch": 0.11018, + "grad_norm": 0.6810053095086784, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 11018 + }, + { + "epoch": 0.11019, + "grad_norm": 0.6851205152192956, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11019 + }, + { + "epoch": 0.1102, + "grad_norm": 0.7128614695443434, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 11020 + }, + { + "epoch": 0.11021, + "grad_norm": 0.7954986146548706, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 11021 + }, + { + "epoch": 0.11022, + "grad_norm": 0.9822662453480975, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 11022 + }, + { + "epoch": 0.11023, + "grad_norm": 0.992584945136861, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 11023 + }, + { + "epoch": 0.11024, + "grad_norm": 0.9429960341025528, + "learning_rate": 0.003, + "loss": 4.11, + "step": 11024 + }, + { + "epoch": 0.11025, + "grad_norm": 0.8325519591187426, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 11025 + }, + { + "epoch": 0.11026, + "grad_norm": 0.8109579572209392, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11026 + }, + { + "epoch": 0.11027, + "grad_norm": 0.8503578340284045, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 11027 + }, + { + "epoch": 0.11028, + "grad_norm": 0.7623548658665757, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 11028 + }, + { + "epoch": 0.11029, + "grad_norm": 0.6574600909508291, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 11029 + }, + { + "epoch": 0.1103, + "grad_norm": 0.6153319028159655, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11030 + }, + { + "epoch": 0.11031, + "grad_norm": 0.6706428602781416, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 11031 + }, + { + "epoch": 0.11032, + "grad_norm": 0.7446375104894701, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 11032 + }, + { + "epoch": 0.11033, + "grad_norm": 0.8287999931870835, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 11033 + }, + { + "epoch": 0.11034, + "grad_norm": 0.9769780850481814, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 11034 + }, + { + "epoch": 0.11035, + "grad_norm": 1.0533298010822936, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 11035 + }, + { + "epoch": 0.11036, + "grad_norm": 0.829555188618686, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 11036 + }, + { + "epoch": 0.11037, + "grad_norm": 0.6487286656984719, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 11037 + }, + { + "epoch": 0.11038, + "grad_norm": 0.6380967062306625, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 11038 + }, + { + "epoch": 0.11039, + "grad_norm": 0.7177361539966476, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 11039 + }, + { + "epoch": 0.1104, + "grad_norm": 0.6958535348954583, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 11040 + }, + { + "epoch": 0.11041, + "grad_norm": 0.5940027764338971, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 11041 + }, + { + "epoch": 0.11042, + "grad_norm": 0.6049326361318368, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 11042 + }, + { + "epoch": 0.11043, + "grad_norm": 0.629245191369106, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11043 + }, + { + "epoch": 0.11044, + "grad_norm": 0.5444625180097691, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 11044 + }, + { + "epoch": 0.11045, + "grad_norm": 0.5617706070761912, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 11045 + }, + { + "epoch": 0.11046, + "grad_norm": 0.5611245309640837, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 11046 + }, + { + "epoch": 0.11047, + "grad_norm": 0.5187354432594959, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 11047 + }, + { + "epoch": 0.11048, + "grad_norm": 0.5788524265693503, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 11048 + }, + { + "epoch": 0.11049, + "grad_norm": 0.7277778292414144, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 11049 + }, + { + "epoch": 0.1105, + "grad_norm": 0.9135531216313109, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 11050 + }, + { + "epoch": 0.11051, + "grad_norm": 1.0056469057417077, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 11051 + }, + { + "epoch": 0.11052, + "grad_norm": 0.9280627702680145, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 11052 + }, + { + "epoch": 0.11053, + "grad_norm": 0.898418645317437, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 11053 + }, + { + "epoch": 0.11054, + "grad_norm": 0.9058851919381276, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 11054 + }, + { + "epoch": 0.11055, + "grad_norm": 0.8116406755293073, + "learning_rate": 0.003, + "loss": 4.081, + "step": 11055 + }, + { + "epoch": 0.11056, + "grad_norm": 0.8161785790884862, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 11056 + }, + { + "epoch": 0.11057, + "grad_norm": 0.867405007787928, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 11057 + }, + { + "epoch": 0.11058, + "grad_norm": 0.9816412480496788, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 11058 + }, + { + "epoch": 0.11059, + "grad_norm": 1.1515701005249115, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 11059 + }, + { + "epoch": 0.1106, + "grad_norm": 0.854982835118098, + "learning_rate": 0.003, + "loss": 4.093, + "step": 11060 + }, + { + "epoch": 0.11061, + "grad_norm": 0.8438666981265432, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 11061 + }, + { + "epoch": 0.11062, + "grad_norm": 0.9551886092704508, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 11062 + }, + { + "epoch": 0.11063, + "grad_norm": 0.933860815146107, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 11063 + }, + { + "epoch": 0.11064, + "grad_norm": 0.7776075181114828, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 11064 + }, + { + "epoch": 0.11065, + "grad_norm": 0.7850056440416847, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11065 + }, + { + "epoch": 0.11066, + "grad_norm": 0.659664837771907, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 11066 + }, + { + "epoch": 0.11067, + "grad_norm": 0.6593281025138392, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 11067 + }, + { + "epoch": 0.11068, + "grad_norm": 0.6687637937193209, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 11068 + }, + { + "epoch": 0.11069, + "grad_norm": 0.7001621764244782, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 11069 + }, + { + "epoch": 0.1107, + "grad_norm": 0.7262171290177102, + "learning_rate": 0.003, + "loss": 4.086, + "step": 11070 + }, + { + "epoch": 0.11071, + "grad_norm": 0.800625243464924, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 11071 + }, + { + "epoch": 0.11072, + "grad_norm": 1.0052207819407468, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 11072 + }, + { + "epoch": 0.11073, + "grad_norm": 1.001613976466175, + "learning_rate": 0.003, + "loss": 4.1241, + "step": 11073 + }, + { + "epoch": 0.11074, + "grad_norm": 0.9625258224419659, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 11074 + }, + { + "epoch": 0.11075, + "grad_norm": 1.1074935298853081, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 11075 + }, + { + "epoch": 0.11076, + "grad_norm": 1.0561029884393052, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11076 + }, + { + "epoch": 0.11077, + "grad_norm": 0.9285331088718336, + "learning_rate": 0.003, + "loss": 4.139, + "step": 11077 + }, + { + "epoch": 0.11078, + "grad_norm": 0.808604672012496, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 11078 + }, + { + "epoch": 0.11079, + "grad_norm": 0.7884733933274555, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 11079 + }, + { + "epoch": 0.1108, + "grad_norm": 0.8357394406437376, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 11080 + }, + { + "epoch": 0.11081, + "grad_norm": 0.8803590252787716, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 11081 + }, + { + "epoch": 0.11082, + "grad_norm": 0.9368453183007583, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 11082 + }, + { + "epoch": 0.11083, + "grad_norm": 0.9299308351951199, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 11083 + }, + { + "epoch": 0.11084, + "grad_norm": 0.870455859227154, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 11084 + }, + { + "epoch": 0.11085, + "grad_norm": 0.7216008943105169, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 11085 + }, + { + "epoch": 0.11086, + "grad_norm": 0.6948711660910164, + "learning_rate": 0.003, + "loss": 4.1077, + "step": 11086 + }, + { + "epoch": 0.11087, + "grad_norm": 0.6945540607673151, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 11087 + }, + { + "epoch": 0.11088, + "grad_norm": 0.638563248131485, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 11088 + }, + { + "epoch": 0.11089, + "grad_norm": 0.6341263639431465, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 11089 + }, + { + "epoch": 0.1109, + "grad_norm": 0.6174699458323581, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 11090 + }, + { + "epoch": 0.11091, + "grad_norm": 0.6103462124790471, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 11091 + }, + { + "epoch": 0.11092, + "grad_norm": 0.5651746008201992, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 11092 + }, + { + "epoch": 0.11093, + "grad_norm": 0.5601775914620788, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 11093 + }, + { + "epoch": 0.11094, + "grad_norm": 0.5516687114517382, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 11094 + }, + { + "epoch": 0.11095, + "grad_norm": 0.5876552445906201, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 11095 + }, + { + "epoch": 0.11096, + "grad_norm": 0.6849229580947964, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11096 + }, + { + "epoch": 0.11097, + "grad_norm": 0.90306848388154, + "learning_rate": 0.003, + "loss": 4.092, + "step": 11097 + }, + { + "epoch": 0.11098, + "grad_norm": 1.0965563470179573, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 11098 + }, + { + "epoch": 0.11099, + "grad_norm": 0.7817470235674453, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 11099 + }, + { + "epoch": 0.111, + "grad_norm": 0.5655622318949944, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11100 + }, + { + "epoch": 0.11101, + "grad_norm": 0.6798505146702352, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 11101 + }, + { + "epoch": 0.11102, + "grad_norm": 0.8259776028595571, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 11102 + }, + { + "epoch": 0.11103, + "grad_norm": 0.8961065505995598, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 11103 + }, + { + "epoch": 0.11104, + "grad_norm": 0.8356772914510698, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 11104 + }, + { + "epoch": 0.11105, + "grad_norm": 0.8829032009312003, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 11105 + }, + { + "epoch": 0.11106, + "grad_norm": 0.9776127333423626, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 11106 + }, + { + "epoch": 0.11107, + "grad_norm": 0.9884899110891495, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 11107 + }, + { + "epoch": 0.11108, + "grad_norm": 0.9083186955420474, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 11108 + }, + { + "epoch": 0.11109, + "grad_norm": 0.9905629247807923, + "learning_rate": 0.003, + "loss": 4.1051, + "step": 11109 + }, + { + "epoch": 0.1111, + "grad_norm": 1.1256324486484663, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 11110 + }, + { + "epoch": 0.11111, + "grad_norm": 0.7276573929050033, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 11111 + }, + { + "epoch": 0.11112, + "grad_norm": 0.7229532212478316, + "learning_rate": 0.003, + "loss": 4.11, + "step": 11112 + }, + { + "epoch": 0.11113, + "grad_norm": 0.7357412365818277, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 11113 + }, + { + "epoch": 0.11114, + "grad_norm": 0.6686360455122295, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 11114 + }, + { + "epoch": 0.11115, + "grad_norm": 0.7741764825670838, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 11115 + }, + { + "epoch": 0.11116, + "grad_norm": 1.089654929098186, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 11116 + }, + { + "epoch": 0.11117, + "grad_norm": 1.003149957043394, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 11117 + }, + { + "epoch": 0.11118, + "grad_norm": 0.7553134094623033, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 11118 + }, + { + "epoch": 0.11119, + "grad_norm": 0.582610597098817, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 11119 + }, + { + "epoch": 0.1112, + "grad_norm": 0.6612281295879227, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 11120 + }, + { + "epoch": 0.11121, + "grad_norm": 0.8000233847001187, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 11121 + }, + { + "epoch": 0.11122, + "grad_norm": 0.8404674674939324, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 11122 + }, + { + "epoch": 0.11123, + "grad_norm": 0.979990963536137, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 11123 + }, + { + "epoch": 0.11124, + "grad_norm": 1.1400318707447292, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 11124 + }, + { + "epoch": 0.11125, + "grad_norm": 0.9186690040538052, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 11125 + }, + { + "epoch": 0.11126, + "grad_norm": 1.0404524691016466, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 11126 + }, + { + "epoch": 0.11127, + "grad_norm": 1.0107939634734449, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 11127 + }, + { + "epoch": 0.11128, + "grad_norm": 0.9893896552321982, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 11128 + }, + { + "epoch": 0.11129, + "grad_norm": 0.8167246210083996, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 11129 + }, + { + "epoch": 0.1113, + "grad_norm": 0.7190600151740959, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 11130 + }, + { + "epoch": 0.11131, + "grad_norm": 0.6945641839293223, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 11131 + }, + { + "epoch": 0.11132, + "grad_norm": 0.828200398218642, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11132 + }, + { + "epoch": 0.11133, + "grad_norm": 0.7873960152046607, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 11133 + }, + { + "epoch": 0.11134, + "grad_norm": 0.759110078625811, + "learning_rate": 0.003, + "loss": 4.064, + "step": 11134 + }, + { + "epoch": 0.11135, + "grad_norm": 0.7169621318467949, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 11135 + }, + { + "epoch": 0.11136, + "grad_norm": 0.645979991251931, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 11136 + }, + { + "epoch": 0.11137, + "grad_norm": 0.68607581660032, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 11137 + }, + { + "epoch": 0.11138, + "grad_norm": 0.785577460381717, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 11138 + }, + { + "epoch": 0.11139, + "grad_norm": 0.8364211665979386, + "learning_rate": 0.003, + "loss": 4.106, + "step": 11139 + }, + { + "epoch": 0.1114, + "grad_norm": 0.7642698793727908, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 11140 + }, + { + "epoch": 0.11141, + "grad_norm": 0.713957137186134, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 11141 + }, + { + "epoch": 0.11142, + "grad_norm": 0.8090643725446109, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 11142 + }, + { + "epoch": 0.11143, + "grad_norm": 0.7445012677708782, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 11143 + }, + { + "epoch": 0.11144, + "grad_norm": 0.7713246383144959, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 11144 + }, + { + "epoch": 0.11145, + "grad_norm": 0.9585554430132573, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 11145 + }, + { + "epoch": 0.11146, + "grad_norm": 1.2437802015715147, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 11146 + }, + { + "epoch": 0.11147, + "grad_norm": 0.798064848892936, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 11147 + }, + { + "epoch": 0.11148, + "grad_norm": 0.6344607558816502, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 11148 + }, + { + "epoch": 0.11149, + "grad_norm": 0.5963078934623987, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 11149 + }, + { + "epoch": 0.1115, + "grad_norm": 0.7150086302385679, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 11150 + }, + { + "epoch": 0.11151, + "grad_norm": 0.7228254346279399, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 11151 + }, + { + "epoch": 0.11152, + "grad_norm": 0.7880303348317202, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 11152 + }, + { + "epoch": 0.11153, + "grad_norm": 0.810534284842259, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 11153 + }, + { + "epoch": 0.11154, + "grad_norm": 0.8074464292942731, + "learning_rate": 0.003, + "loss": 4.092, + "step": 11154 + }, + { + "epoch": 0.11155, + "grad_norm": 0.8382989830252079, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 11155 + }, + { + "epoch": 0.11156, + "grad_norm": 0.875789436517272, + "learning_rate": 0.003, + "loss": 4.094, + "step": 11156 + }, + { + "epoch": 0.11157, + "grad_norm": 0.9604344902781272, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 11157 + }, + { + "epoch": 0.11158, + "grad_norm": 0.7338788485272214, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 11158 + }, + { + "epoch": 0.11159, + "grad_norm": 0.7075825561394384, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 11159 + }, + { + "epoch": 0.1116, + "grad_norm": 0.606268141593149, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 11160 + }, + { + "epoch": 0.11161, + "grad_norm": 0.679934195703754, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 11161 + }, + { + "epoch": 0.11162, + "grad_norm": 0.7656715332302683, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 11162 + }, + { + "epoch": 0.11163, + "grad_norm": 0.7480394445842258, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 11163 + }, + { + "epoch": 0.11164, + "grad_norm": 0.6028095169334248, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 11164 + }, + { + "epoch": 0.11165, + "grad_norm": 0.6318752878707158, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 11165 + }, + { + "epoch": 0.11166, + "grad_norm": 0.8879033904433962, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 11166 + }, + { + "epoch": 0.11167, + "grad_norm": 1.4274703348853277, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 11167 + }, + { + "epoch": 0.11168, + "grad_norm": 0.6922698324968064, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 11168 + }, + { + "epoch": 0.11169, + "grad_norm": 0.6067024325327948, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 11169 + }, + { + "epoch": 0.1117, + "grad_norm": 0.7340926860018586, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11170 + }, + { + "epoch": 0.11171, + "grad_norm": 0.8701848545174916, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 11171 + }, + { + "epoch": 0.11172, + "grad_norm": 0.9155073360821716, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 11172 + }, + { + "epoch": 0.11173, + "grad_norm": 0.8873135441014273, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 11173 + }, + { + "epoch": 0.11174, + "grad_norm": 0.8199142395718748, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 11174 + }, + { + "epoch": 0.11175, + "grad_norm": 0.8017348512165051, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 11175 + }, + { + "epoch": 0.11176, + "grad_norm": 0.786558601699622, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11176 + }, + { + "epoch": 0.11177, + "grad_norm": 0.7339391142845619, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 11177 + }, + { + "epoch": 0.11178, + "grad_norm": 0.6520365160874647, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11178 + }, + { + "epoch": 0.11179, + "grad_norm": 0.6960486782654498, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 11179 + }, + { + "epoch": 0.1118, + "grad_norm": 0.7364223235410267, + "learning_rate": 0.003, + "loss": 4.08, + "step": 11180 + }, + { + "epoch": 0.11181, + "grad_norm": 0.8184326717593928, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 11181 + }, + { + "epoch": 0.11182, + "grad_norm": 1.0033279367101047, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 11182 + }, + { + "epoch": 0.11183, + "grad_norm": 1.3151922001789536, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 11183 + }, + { + "epoch": 0.11184, + "grad_norm": 0.7681398620969967, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 11184 + }, + { + "epoch": 0.11185, + "grad_norm": 0.6324973183291746, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 11185 + }, + { + "epoch": 0.11186, + "grad_norm": 0.6442181131175263, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 11186 + }, + { + "epoch": 0.11187, + "grad_norm": 0.6632643336699477, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 11187 + }, + { + "epoch": 0.11188, + "grad_norm": 0.737583248978792, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 11188 + }, + { + "epoch": 0.11189, + "grad_norm": 0.9257422382836547, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 11189 + }, + { + "epoch": 0.1119, + "grad_norm": 1.0748123757293582, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 11190 + }, + { + "epoch": 0.11191, + "grad_norm": 1.0669751328722072, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 11191 + }, + { + "epoch": 0.11192, + "grad_norm": 0.9475513799019504, + "learning_rate": 0.003, + "loss": 4.12, + "step": 11192 + }, + { + "epoch": 0.11193, + "grad_norm": 0.980393271477006, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 11193 + }, + { + "epoch": 0.11194, + "grad_norm": 0.9463524671897485, + "learning_rate": 0.003, + "loss": 4.1257, + "step": 11194 + }, + { + "epoch": 0.11195, + "grad_norm": 0.8342265267333907, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 11195 + }, + { + "epoch": 0.11196, + "grad_norm": 0.7999477240923631, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 11196 + }, + { + "epoch": 0.11197, + "grad_norm": 0.8902807757761849, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11197 + }, + { + "epoch": 0.11198, + "grad_norm": 0.8987577748435858, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 11198 + }, + { + "epoch": 0.11199, + "grad_norm": 0.9852587056057885, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 11199 + }, + { + "epoch": 0.112, + "grad_norm": 1.4745377463079015, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11200 + }, + { + "epoch": 0.11201, + "grad_norm": 0.6922170191393475, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 11201 + }, + { + "epoch": 0.11202, + "grad_norm": 0.667280520513202, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 11202 + }, + { + "epoch": 0.11203, + "grad_norm": 0.725043754467325, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 11203 + }, + { + "epoch": 0.11204, + "grad_norm": 0.7905874419302064, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 11204 + }, + { + "epoch": 0.11205, + "grad_norm": 0.798668273386535, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 11205 + }, + { + "epoch": 0.11206, + "grad_norm": 0.9353317315582219, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 11206 + }, + { + "epoch": 0.11207, + "grad_norm": 1.1942296263526493, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 11207 + }, + { + "epoch": 0.11208, + "grad_norm": 0.8285060649114675, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11208 + }, + { + "epoch": 0.11209, + "grad_norm": 0.7602845324086417, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11209 + }, + { + "epoch": 0.1121, + "grad_norm": 0.7461526534362624, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 11210 + }, + { + "epoch": 0.11211, + "grad_norm": 0.7174801894516294, + "learning_rate": 0.003, + "loss": 4.103, + "step": 11211 + }, + { + "epoch": 0.11212, + "grad_norm": 0.5623731031335691, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 11212 + }, + { + "epoch": 0.11213, + "grad_norm": 0.5256282911301774, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 11213 + }, + { + "epoch": 0.11214, + "grad_norm": 0.524622311054672, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 11214 + }, + { + "epoch": 0.11215, + "grad_norm": 0.5705507694064851, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 11215 + }, + { + "epoch": 0.11216, + "grad_norm": 0.5728925901575492, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 11216 + }, + { + "epoch": 0.11217, + "grad_norm": 0.5692961251822839, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 11217 + }, + { + "epoch": 0.11218, + "grad_norm": 0.6711715531082638, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 11218 + }, + { + "epoch": 0.11219, + "grad_norm": 0.846884977023136, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11219 + }, + { + "epoch": 0.1122, + "grad_norm": 0.9680363037463088, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 11220 + }, + { + "epoch": 0.11221, + "grad_norm": 1.0420788650409123, + "learning_rate": 0.003, + "loss": 4.085, + "step": 11221 + }, + { + "epoch": 0.11222, + "grad_norm": 0.9004639432683451, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 11222 + }, + { + "epoch": 0.11223, + "grad_norm": 0.8393242438846302, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 11223 + }, + { + "epoch": 0.11224, + "grad_norm": 0.7990820081600976, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 11224 + }, + { + "epoch": 0.11225, + "grad_norm": 0.6574466208192928, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 11225 + }, + { + "epoch": 0.11226, + "grad_norm": 0.6759618474023606, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 11226 + }, + { + "epoch": 0.11227, + "grad_norm": 0.7609637817250507, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 11227 + }, + { + "epoch": 0.11228, + "grad_norm": 0.7661746709409221, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 11228 + }, + { + "epoch": 0.11229, + "grad_norm": 0.6810587343834144, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 11229 + }, + { + "epoch": 0.1123, + "grad_norm": 0.7436685296036734, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 11230 + }, + { + "epoch": 0.11231, + "grad_norm": 0.7175074007556332, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 11231 + }, + { + "epoch": 0.11232, + "grad_norm": 0.8598338665744785, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 11232 + }, + { + "epoch": 0.11233, + "grad_norm": 0.858424185394917, + "learning_rate": 0.003, + "loss": 4.096, + "step": 11233 + }, + { + "epoch": 0.11234, + "grad_norm": 0.8579128611958361, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 11234 + }, + { + "epoch": 0.11235, + "grad_norm": 1.0397658926409876, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 11235 + }, + { + "epoch": 0.11236, + "grad_norm": 1.1354136952993683, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 11236 + }, + { + "epoch": 0.11237, + "grad_norm": 0.9343726823813238, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 11237 + }, + { + "epoch": 0.11238, + "grad_norm": 0.9224423526242113, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 11238 + }, + { + "epoch": 0.11239, + "grad_norm": 1.130748748117673, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 11239 + }, + { + "epoch": 0.1124, + "grad_norm": 1.0017172539268393, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 11240 + }, + { + "epoch": 0.11241, + "grad_norm": 0.9175963249395397, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 11241 + }, + { + "epoch": 0.11242, + "grad_norm": 0.9534075599579533, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 11242 + }, + { + "epoch": 0.11243, + "grad_norm": 0.9189430914908193, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 11243 + }, + { + "epoch": 0.11244, + "grad_norm": 0.7555180429260873, + "learning_rate": 0.003, + "loss": 4.081, + "step": 11244 + }, + { + "epoch": 0.11245, + "grad_norm": 0.7229268696180132, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 11245 + }, + { + "epoch": 0.11246, + "grad_norm": 0.6252394878517432, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 11246 + }, + { + "epoch": 0.11247, + "grad_norm": 0.6029174577675842, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 11247 + }, + { + "epoch": 0.11248, + "grad_norm": 0.6090033628973571, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 11248 + }, + { + "epoch": 0.11249, + "grad_norm": 0.5542984043971567, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 11249 + }, + { + "epoch": 0.1125, + "grad_norm": 0.5694809489509668, + "learning_rate": 0.003, + "loss": 4.055, + "step": 11250 + }, + { + "epoch": 0.11251, + "grad_norm": 0.7028353991716569, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 11251 + }, + { + "epoch": 0.11252, + "grad_norm": 0.9484323487852909, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 11252 + }, + { + "epoch": 0.11253, + "grad_norm": 1.125644085468693, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 11253 + }, + { + "epoch": 0.11254, + "grad_norm": 0.7806047546301479, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11254 + }, + { + "epoch": 0.11255, + "grad_norm": 0.6341972128498081, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 11255 + }, + { + "epoch": 0.11256, + "grad_norm": 0.6939257636285394, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11256 + }, + { + "epoch": 0.11257, + "grad_norm": 0.7911168291243957, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11257 + }, + { + "epoch": 0.11258, + "grad_norm": 0.8498509190820276, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11258 + }, + { + "epoch": 0.11259, + "grad_norm": 0.935207478747258, + "learning_rate": 0.003, + "loss": 4.1129, + "step": 11259 + }, + { + "epoch": 0.1126, + "grad_norm": 0.9561311217334402, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 11260 + }, + { + "epoch": 0.11261, + "grad_norm": 0.9565326560921091, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 11261 + }, + { + "epoch": 0.11262, + "grad_norm": 0.9369527481049401, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 11262 + }, + { + "epoch": 0.11263, + "grad_norm": 0.8125887510030267, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 11263 + }, + { + "epoch": 0.11264, + "grad_norm": 0.7702865848328564, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 11264 + }, + { + "epoch": 0.11265, + "grad_norm": 0.8334162587414682, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 11265 + }, + { + "epoch": 0.11266, + "grad_norm": 0.7473872641971427, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 11266 + }, + { + "epoch": 0.11267, + "grad_norm": 0.6758426957564395, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 11267 + }, + { + "epoch": 0.11268, + "grad_norm": 0.7094024271823733, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 11268 + }, + { + "epoch": 0.11269, + "grad_norm": 0.6070926927638549, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 11269 + }, + { + "epoch": 0.1127, + "grad_norm": 0.4901913598283984, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 11270 + }, + { + "epoch": 0.11271, + "grad_norm": 0.4675843897479997, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 11271 + }, + { + "epoch": 0.11272, + "grad_norm": 0.4938736742635015, + "learning_rate": 0.003, + "loss": 4.064, + "step": 11272 + }, + { + "epoch": 0.11273, + "grad_norm": 0.5255525500157796, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 11273 + }, + { + "epoch": 0.11274, + "grad_norm": 0.5870623589704691, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 11274 + }, + { + "epoch": 0.11275, + "grad_norm": 0.6229595008991105, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 11275 + }, + { + "epoch": 0.11276, + "grad_norm": 0.6703357191635001, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 11276 + }, + { + "epoch": 0.11277, + "grad_norm": 0.836221722646284, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 11277 + }, + { + "epoch": 0.11278, + "grad_norm": 1.12987602448386, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 11278 + }, + { + "epoch": 0.11279, + "grad_norm": 0.8932128144389052, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 11279 + }, + { + "epoch": 0.1128, + "grad_norm": 0.8636575495307199, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 11280 + }, + { + "epoch": 0.11281, + "grad_norm": 0.8866969543624653, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 11281 + }, + { + "epoch": 0.11282, + "grad_norm": 0.9013365729779537, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 11282 + }, + { + "epoch": 0.11283, + "grad_norm": 0.8704159020885619, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11283 + }, + { + "epoch": 0.11284, + "grad_norm": 0.8582948331776299, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 11284 + }, + { + "epoch": 0.11285, + "grad_norm": 0.7706440632938059, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 11285 + }, + { + "epoch": 0.11286, + "grad_norm": 0.7852780669628593, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 11286 + }, + { + "epoch": 0.11287, + "grad_norm": 0.8040334833067401, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 11287 + }, + { + "epoch": 0.11288, + "grad_norm": 0.6842046423450019, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 11288 + }, + { + "epoch": 0.11289, + "grad_norm": 0.7553569736226131, + "learning_rate": 0.003, + "loss": 4.1, + "step": 11289 + }, + { + "epoch": 0.1129, + "grad_norm": 0.7935338155336644, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 11290 + }, + { + "epoch": 0.11291, + "grad_norm": 0.7883400874254108, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 11291 + }, + { + "epoch": 0.11292, + "grad_norm": 0.8376593583255619, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 11292 + }, + { + "epoch": 0.11293, + "grad_norm": 0.9521589854357583, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 11293 + }, + { + "epoch": 0.11294, + "grad_norm": 1.0937306713955737, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11294 + }, + { + "epoch": 0.11295, + "grad_norm": 1.161053778562748, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 11295 + }, + { + "epoch": 0.11296, + "grad_norm": 0.924084583616854, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 11296 + }, + { + "epoch": 0.11297, + "grad_norm": 0.8614899789944479, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 11297 + }, + { + "epoch": 0.11298, + "grad_norm": 0.7961809396109322, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 11298 + }, + { + "epoch": 0.11299, + "grad_norm": 0.8153057757029059, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 11299 + }, + { + "epoch": 0.113, + "grad_norm": 0.8197364004628112, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 11300 + }, + { + "epoch": 0.11301, + "grad_norm": 0.7907672894411515, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 11301 + }, + { + "epoch": 0.11302, + "grad_norm": 0.8034674001126053, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 11302 + }, + { + "epoch": 0.11303, + "grad_norm": 0.8567329453989047, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 11303 + }, + { + "epoch": 0.11304, + "grad_norm": 0.7296740606258915, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 11304 + }, + { + "epoch": 0.11305, + "grad_norm": 0.7929787988472935, + "learning_rate": 0.003, + "loss": 4.067, + "step": 11305 + }, + { + "epoch": 0.11306, + "grad_norm": 0.7956940101721797, + "learning_rate": 0.003, + "loss": 4.063, + "step": 11306 + }, + { + "epoch": 0.11307, + "grad_norm": 0.832525317844469, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 11307 + }, + { + "epoch": 0.11308, + "grad_norm": 1.0211642140432524, + "learning_rate": 0.003, + "loss": 4.084, + "step": 11308 + }, + { + "epoch": 0.11309, + "grad_norm": 1.2631486891454777, + "learning_rate": 0.003, + "loss": 4.1038, + "step": 11309 + }, + { + "epoch": 0.1131, + "grad_norm": 0.6534457353954479, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 11310 + }, + { + "epoch": 0.11311, + "grad_norm": 0.641428212190924, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 11311 + }, + { + "epoch": 0.11312, + "grad_norm": 0.6071836945502547, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 11312 + }, + { + "epoch": 0.11313, + "grad_norm": 0.6827199782593909, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 11313 + }, + { + "epoch": 0.11314, + "grad_norm": 0.7595211649403993, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 11314 + }, + { + "epoch": 0.11315, + "grad_norm": 0.7847473307102096, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11315 + }, + { + "epoch": 0.11316, + "grad_norm": 0.8610672104104965, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 11316 + }, + { + "epoch": 0.11317, + "grad_norm": 1.0152458189238245, + "learning_rate": 0.003, + "loss": 4.094, + "step": 11317 + }, + { + "epoch": 0.11318, + "grad_norm": 1.0270125585690475, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11318 + }, + { + "epoch": 0.11319, + "grad_norm": 0.7529031368887859, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 11319 + }, + { + "epoch": 0.1132, + "grad_norm": 0.7652686727641066, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 11320 + }, + { + "epoch": 0.11321, + "grad_norm": 0.7134228153263527, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 11321 + }, + { + "epoch": 0.11322, + "grad_norm": 0.749176398389972, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 11322 + }, + { + "epoch": 0.11323, + "grad_norm": 0.8053900797064021, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 11323 + }, + { + "epoch": 0.11324, + "grad_norm": 0.7753485199215412, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 11324 + }, + { + "epoch": 0.11325, + "grad_norm": 0.7849386213255387, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 11325 + }, + { + "epoch": 0.11326, + "grad_norm": 0.7789568595648719, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 11326 + }, + { + "epoch": 0.11327, + "grad_norm": 0.714660474310829, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 11327 + }, + { + "epoch": 0.11328, + "grad_norm": 0.6688504890705055, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 11328 + }, + { + "epoch": 0.11329, + "grad_norm": 0.6702221359575178, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11329 + }, + { + "epoch": 0.1133, + "grad_norm": 0.7804814525640149, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 11330 + }, + { + "epoch": 0.11331, + "grad_norm": 0.766311055294654, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 11331 + }, + { + "epoch": 0.11332, + "grad_norm": 0.9002333596500396, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 11332 + }, + { + "epoch": 0.11333, + "grad_norm": 1.1717653891998372, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 11333 + }, + { + "epoch": 0.11334, + "grad_norm": 1.029720674784894, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 11334 + }, + { + "epoch": 0.11335, + "grad_norm": 0.9739791263307555, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 11335 + }, + { + "epoch": 0.11336, + "grad_norm": 0.9242582255769969, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 11336 + }, + { + "epoch": 0.11337, + "grad_norm": 0.7951647257658111, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 11337 + }, + { + "epoch": 0.11338, + "grad_norm": 0.7120786303604073, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 11338 + }, + { + "epoch": 0.11339, + "grad_norm": 0.712282867173393, + "learning_rate": 0.003, + "loss": 4.1177, + "step": 11339 + }, + { + "epoch": 0.1134, + "grad_norm": 0.7880488880259838, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 11340 + }, + { + "epoch": 0.11341, + "grad_norm": 0.9025993455958837, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11341 + }, + { + "epoch": 0.11342, + "grad_norm": 1.0478701099210872, + "learning_rate": 0.003, + "loss": 4.076, + "step": 11342 + }, + { + "epoch": 0.11343, + "grad_norm": 0.8847627483247708, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 11343 + }, + { + "epoch": 0.11344, + "grad_norm": 0.960637853491343, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 11344 + }, + { + "epoch": 0.11345, + "grad_norm": 0.9450037331480519, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 11345 + }, + { + "epoch": 0.11346, + "grad_norm": 0.8502331559297919, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 11346 + }, + { + "epoch": 0.11347, + "grad_norm": 0.8682577083722111, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 11347 + }, + { + "epoch": 0.11348, + "grad_norm": 0.659113384235199, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 11348 + }, + { + "epoch": 0.11349, + "grad_norm": 0.6504104730803049, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 11349 + }, + { + "epoch": 0.1135, + "grad_norm": 0.6787150947859348, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 11350 + }, + { + "epoch": 0.11351, + "grad_norm": 0.7143052930987581, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 11351 + }, + { + "epoch": 0.11352, + "grad_norm": 0.8164438634612279, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 11352 + }, + { + "epoch": 0.11353, + "grad_norm": 1.0760369267228487, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 11353 + }, + { + "epoch": 0.11354, + "grad_norm": 1.0935038590588513, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 11354 + }, + { + "epoch": 0.11355, + "grad_norm": 0.8753571065441305, + "learning_rate": 0.003, + "loss": 4.12, + "step": 11355 + }, + { + "epoch": 0.11356, + "grad_norm": 0.8593670492792348, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 11356 + }, + { + "epoch": 0.11357, + "grad_norm": 0.8893452051885853, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 11357 + }, + { + "epoch": 0.11358, + "grad_norm": 1.0189217114351887, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 11358 + }, + { + "epoch": 0.11359, + "grad_norm": 0.9570621972206529, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 11359 + }, + { + "epoch": 0.1136, + "grad_norm": 0.8724005228517467, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 11360 + }, + { + "epoch": 0.11361, + "grad_norm": 0.9174533262178383, + "learning_rate": 0.003, + "loss": 4.064, + "step": 11361 + }, + { + "epoch": 0.11362, + "grad_norm": 0.932112927523047, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 11362 + }, + { + "epoch": 0.11363, + "grad_norm": 0.8755211146818656, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 11363 + }, + { + "epoch": 0.11364, + "grad_norm": 0.8389905241810933, + "learning_rate": 0.003, + "loss": 4.103, + "step": 11364 + }, + { + "epoch": 0.11365, + "grad_norm": 0.7005566512124908, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11365 + }, + { + "epoch": 0.11366, + "grad_norm": 0.6535254256303773, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11366 + }, + { + "epoch": 0.11367, + "grad_norm": 0.6972575428435476, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 11367 + }, + { + "epoch": 0.11368, + "grad_norm": 0.6107473289458821, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 11368 + }, + { + "epoch": 0.11369, + "grad_norm": 0.6124903478646556, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 11369 + }, + { + "epoch": 0.1137, + "grad_norm": 0.5656801850453923, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 11370 + }, + { + "epoch": 0.11371, + "grad_norm": 0.595358154993407, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 11371 + }, + { + "epoch": 0.11372, + "grad_norm": 0.6767828600929685, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 11372 + }, + { + "epoch": 0.11373, + "grad_norm": 0.6461746119819802, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 11373 + }, + { + "epoch": 0.11374, + "grad_norm": 0.604714571043439, + "learning_rate": 0.003, + "loss": 4.078, + "step": 11374 + }, + { + "epoch": 0.11375, + "grad_norm": 0.48164072007418185, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 11375 + }, + { + "epoch": 0.11376, + "grad_norm": 0.4519231553429714, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 11376 + }, + { + "epoch": 0.11377, + "grad_norm": 0.4901757436491158, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 11377 + }, + { + "epoch": 0.11378, + "grad_norm": 0.5499550851093622, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 11378 + }, + { + "epoch": 0.11379, + "grad_norm": 0.7378779725745853, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 11379 + }, + { + "epoch": 0.1138, + "grad_norm": 0.9744528980513482, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 11380 + }, + { + "epoch": 0.11381, + "grad_norm": 1.1621155727424501, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 11381 + }, + { + "epoch": 0.11382, + "grad_norm": 0.8383393396549385, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 11382 + }, + { + "epoch": 0.11383, + "grad_norm": 0.9902959766843705, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 11383 + }, + { + "epoch": 0.11384, + "grad_norm": 1.1649706827514636, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 11384 + }, + { + "epoch": 0.11385, + "grad_norm": 0.9401904170546886, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 11385 + }, + { + "epoch": 0.11386, + "grad_norm": 0.8918665757510584, + "learning_rate": 0.003, + "loss": 4.097, + "step": 11386 + }, + { + "epoch": 0.11387, + "grad_norm": 0.7858255897476053, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 11387 + }, + { + "epoch": 0.11388, + "grad_norm": 0.8812672645432824, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 11388 + }, + { + "epoch": 0.11389, + "grad_norm": 0.974005966509601, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 11389 + }, + { + "epoch": 0.1139, + "grad_norm": 0.9951555972112742, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 11390 + }, + { + "epoch": 0.11391, + "grad_norm": 0.7582466738628498, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 11391 + }, + { + "epoch": 0.11392, + "grad_norm": 0.7432719879616099, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 11392 + }, + { + "epoch": 0.11393, + "grad_norm": 0.8660936215295046, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 11393 + }, + { + "epoch": 0.11394, + "grad_norm": 0.8426330885658078, + "learning_rate": 0.003, + "loss": 4.057, + "step": 11394 + }, + { + "epoch": 0.11395, + "grad_norm": 0.7299468081687056, + "learning_rate": 0.003, + "loss": 4.087, + "step": 11395 + }, + { + "epoch": 0.11396, + "grad_norm": 0.8781582130259117, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 11396 + }, + { + "epoch": 0.11397, + "grad_norm": 1.0520706764406507, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 11397 + }, + { + "epoch": 0.11398, + "grad_norm": 1.1643736381328387, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 11398 + }, + { + "epoch": 0.11399, + "grad_norm": 0.8314606737771553, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 11399 + }, + { + "epoch": 0.114, + "grad_norm": 0.6798385625229799, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 11400 + }, + { + "epoch": 0.11401, + "grad_norm": 0.6336333271230933, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 11401 + }, + { + "epoch": 0.11402, + "grad_norm": 0.6573029968753343, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 11402 + }, + { + "epoch": 0.11403, + "grad_norm": 0.5896113626429579, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 11403 + }, + { + "epoch": 0.11404, + "grad_norm": 0.6243386792726651, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 11404 + }, + { + "epoch": 0.11405, + "grad_norm": 0.6821108719160749, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 11405 + }, + { + "epoch": 0.11406, + "grad_norm": 0.8538350667629351, + "learning_rate": 0.003, + "loss": 4.105, + "step": 11406 + }, + { + "epoch": 0.11407, + "grad_norm": 1.0878784230608254, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 11407 + }, + { + "epoch": 0.11408, + "grad_norm": 0.9450618160165626, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 11408 + }, + { + "epoch": 0.11409, + "grad_norm": 0.8358195561086937, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 11409 + }, + { + "epoch": 0.1141, + "grad_norm": 0.7769456605950715, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 11410 + }, + { + "epoch": 0.11411, + "grad_norm": 0.6725881254357532, + "learning_rate": 0.003, + "loss": 4.094, + "step": 11411 + }, + { + "epoch": 0.11412, + "grad_norm": 0.6317084075870764, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 11412 + }, + { + "epoch": 0.11413, + "grad_norm": 0.6892400591194415, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 11413 + }, + { + "epoch": 0.11414, + "grad_norm": 0.686994212866962, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 11414 + }, + { + "epoch": 0.11415, + "grad_norm": 0.7663353308798015, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 11415 + }, + { + "epoch": 0.11416, + "grad_norm": 0.8166295001650846, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 11416 + }, + { + "epoch": 0.11417, + "grad_norm": 0.8184869720580596, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 11417 + }, + { + "epoch": 0.11418, + "grad_norm": 0.838286743096995, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 11418 + }, + { + "epoch": 0.11419, + "grad_norm": 0.8381618543067891, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 11419 + }, + { + "epoch": 0.1142, + "grad_norm": 0.8838017739991473, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 11420 + }, + { + "epoch": 0.11421, + "grad_norm": 1.0594840809211652, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 11421 + }, + { + "epoch": 0.11422, + "grad_norm": 1.063582213055678, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 11422 + }, + { + "epoch": 0.11423, + "grad_norm": 0.969316505108481, + "learning_rate": 0.003, + "loss": 4.1194, + "step": 11423 + }, + { + "epoch": 0.11424, + "grad_norm": 0.8624992829553142, + "learning_rate": 0.003, + "loss": 4.1101, + "step": 11424 + }, + { + "epoch": 0.11425, + "grad_norm": 0.8210335247117174, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 11425 + }, + { + "epoch": 0.11426, + "grad_norm": 0.7752259282457125, + "learning_rate": 0.003, + "loss": 4.1094, + "step": 11426 + }, + { + "epoch": 0.11427, + "grad_norm": 0.7212696292472102, + "learning_rate": 0.003, + "loss": 4.078, + "step": 11427 + }, + { + "epoch": 0.11428, + "grad_norm": 0.7398697828017103, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 11428 + }, + { + "epoch": 0.11429, + "grad_norm": 0.838665969764261, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 11429 + }, + { + "epoch": 0.1143, + "grad_norm": 1.0609132590161845, + "learning_rate": 0.003, + "loss": 4.105, + "step": 11430 + }, + { + "epoch": 0.11431, + "grad_norm": 1.0204566381125515, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 11431 + }, + { + "epoch": 0.11432, + "grad_norm": 0.9229099605964733, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 11432 + }, + { + "epoch": 0.11433, + "grad_norm": 0.8108948768192613, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 11433 + }, + { + "epoch": 0.11434, + "grad_norm": 0.8017151237202406, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 11434 + }, + { + "epoch": 0.11435, + "grad_norm": 0.8134963922170028, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 11435 + }, + { + "epoch": 0.11436, + "grad_norm": 0.8594932491854741, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 11436 + }, + { + "epoch": 0.11437, + "grad_norm": 0.8605179134609567, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 11437 + }, + { + "epoch": 0.11438, + "grad_norm": 0.7706493151867468, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 11438 + }, + { + "epoch": 0.11439, + "grad_norm": 0.7876193908634262, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 11439 + }, + { + "epoch": 0.1144, + "grad_norm": 0.7417563149358162, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 11440 + }, + { + "epoch": 0.11441, + "grad_norm": 0.6831653796726206, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 11441 + }, + { + "epoch": 0.11442, + "grad_norm": 0.716445953485537, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11442 + }, + { + "epoch": 0.11443, + "grad_norm": 0.7845975212772534, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 11443 + }, + { + "epoch": 0.11444, + "grad_norm": 0.8859188228778797, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 11444 + }, + { + "epoch": 0.11445, + "grad_norm": 1.0662886467358412, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 11445 + }, + { + "epoch": 0.11446, + "grad_norm": 1.0791351632470838, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 11446 + }, + { + "epoch": 0.11447, + "grad_norm": 0.8063903960219335, + "learning_rate": 0.003, + "loss": 4.1224, + "step": 11447 + }, + { + "epoch": 0.11448, + "grad_norm": 0.7612790472500267, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 11448 + }, + { + "epoch": 0.11449, + "grad_norm": 0.8204595294586179, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 11449 + }, + { + "epoch": 0.1145, + "grad_norm": 0.9567275578093394, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 11450 + }, + { + "epoch": 0.11451, + "grad_norm": 1.041839573054231, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 11451 + }, + { + "epoch": 0.11452, + "grad_norm": 1.0196914215484494, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 11452 + }, + { + "epoch": 0.11453, + "grad_norm": 1.006303329412706, + "learning_rate": 0.003, + "loss": 4.085, + "step": 11453 + }, + { + "epoch": 0.11454, + "grad_norm": 0.809225131837401, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 11454 + }, + { + "epoch": 0.11455, + "grad_norm": 0.7460555423548791, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 11455 + }, + { + "epoch": 0.11456, + "grad_norm": 0.7670282450276944, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 11456 + }, + { + "epoch": 0.11457, + "grad_norm": 0.7274292844432366, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 11457 + }, + { + "epoch": 0.11458, + "grad_norm": 0.6995337734925635, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 11458 + }, + { + "epoch": 0.11459, + "grad_norm": 0.7095202336297924, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 11459 + }, + { + "epoch": 0.1146, + "grad_norm": 0.7245500167322204, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 11460 + }, + { + "epoch": 0.11461, + "grad_norm": 0.7527717681841568, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 11461 + }, + { + "epoch": 0.11462, + "grad_norm": 0.8087152381735974, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 11462 + }, + { + "epoch": 0.11463, + "grad_norm": 0.7632330734645594, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 11463 + }, + { + "epoch": 0.11464, + "grad_norm": 0.7753557989741182, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11464 + }, + { + "epoch": 0.11465, + "grad_norm": 0.6697990719628059, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 11465 + }, + { + "epoch": 0.11466, + "grad_norm": 0.6008005372030594, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 11466 + }, + { + "epoch": 0.11467, + "grad_norm": 0.569637208753584, + "learning_rate": 0.003, + "loss": 4.087, + "step": 11467 + }, + { + "epoch": 0.11468, + "grad_norm": 0.5517799530171552, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 11468 + }, + { + "epoch": 0.11469, + "grad_norm": 0.6074959405867231, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 11469 + }, + { + "epoch": 0.1147, + "grad_norm": 0.6098855376813679, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 11470 + }, + { + "epoch": 0.11471, + "grad_norm": 0.6204049954948289, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 11471 + }, + { + "epoch": 0.11472, + "grad_norm": 0.6263198106194131, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 11472 + }, + { + "epoch": 0.11473, + "grad_norm": 0.678838673184852, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 11473 + }, + { + "epoch": 0.11474, + "grad_norm": 0.7927653456883111, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 11474 + }, + { + "epoch": 0.11475, + "grad_norm": 0.9809518620434418, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 11475 + }, + { + "epoch": 0.11476, + "grad_norm": 1.2903461691365623, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 11476 + }, + { + "epoch": 0.11477, + "grad_norm": 0.6913112712839737, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 11477 + }, + { + "epoch": 0.11478, + "grad_norm": 0.6541500342928829, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 11478 + }, + { + "epoch": 0.11479, + "grad_norm": 0.7000564726804843, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 11479 + }, + { + "epoch": 0.1148, + "grad_norm": 0.8708818991587933, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 11480 + }, + { + "epoch": 0.11481, + "grad_norm": 1.0678161831646351, + "learning_rate": 0.003, + "loss": 4.067, + "step": 11481 + }, + { + "epoch": 0.11482, + "grad_norm": 0.9724996724592337, + "learning_rate": 0.003, + "loss": 4.1138, + "step": 11482 + }, + { + "epoch": 0.11483, + "grad_norm": 0.9324828781779055, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 11483 + }, + { + "epoch": 0.11484, + "grad_norm": 0.9503019643609745, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 11484 + }, + { + "epoch": 0.11485, + "grad_norm": 1.0400641794373122, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 11485 + }, + { + "epoch": 0.11486, + "grad_norm": 1.052286895527032, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 11486 + }, + { + "epoch": 0.11487, + "grad_norm": 1.2140198115974443, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 11487 + }, + { + "epoch": 0.11488, + "grad_norm": 0.936990996016995, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 11488 + }, + { + "epoch": 0.11489, + "grad_norm": 1.026410885502423, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 11489 + }, + { + "epoch": 0.1149, + "grad_norm": 0.9947215426823502, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 11490 + }, + { + "epoch": 0.11491, + "grad_norm": 0.9054827913364408, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 11491 + }, + { + "epoch": 0.11492, + "grad_norm": 0.9483496834478657, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 11492 + }, + { + "epoch": 0.11493, + "grad_norm": 0.9246213336036101, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 11493 + }, + { + "epoch": 0.11494, + "grad_norm": 0.9009007669468101, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 11494 + }, + { + "epoch": 0.11495, + "grad_norm": 0.823448429847814, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 11495 + }, + { + "epoch": 0.11496, + "grad_norm": 0.7722589344197481, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 11496 + }, + { + "epoch": 0.11497, + "grad_norm": 0.8731844529480173, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11497 + }, + { + "epoch": 0.11498, + "grad_norm": 1.0649491375982316, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 11498 + }, + { + "epoch": 0.11499, + "grad_norm": 0.99257363481656, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 11499 + }, + { + "epoch": 0.115, + "grad_norm": 1.1228448261634618, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 11500 + }, + { + "epoch": 0.11501, + "grad_norm": 0.914708366743979, + "learning_rate": 0.003, + "loss": 4.1185, + "step": 11501 + }, + { + "epoch": 0.11502, + "grad_norm": 0.7990894752219588, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 11502 + }, + { + "epoch": 0.11503, + "grad_norm": 0.8634859028430474, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 11503 + }, + { + "epoch": 0.11504, + "grad_norm": 0.8209522211301438, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 11504 + }, + { + "epoch": 0.11505, + "grad_norm": 0.896357133679053, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 11505 + }, + { + "epoch": 0.11506, + "grad_norm": 0.9311317378566581, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 11506 + }, + { + "epoch": 0.11507, + "grad_norm": 0.836624753600562, + "learning_rate": 0.003, + "loss": 4.111, + "step": 11507 + }, + { + "epoch": 0.11508, + "grad_norm": 0.7488419342087322, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 11508 + }, + { + "epoch": 0.11509, + "grad_norm": 0.8470336248385015, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 11509 + }, + { + "epoch": 0.1151, + "grad_norm": 0.6993378176385352, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11510 + }, + { + "epoch": 0.11511, + "grad_norm": 0.6636735261002701, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 11511 + }, + { + "epoch": 0.11512, + "grad_norm": 0.7075834385932517, + "learning_rate": 0.003, + "loss": 4.071, + "step": 11512 + }, + { + "epoch": 0.11513, + "grad_norm": 0.8777037605850598, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 11513 + }, + { + "epoch": 0.11514, + "grad_norm": 1.0888790860680126, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 11514 + }, + { + "epoch": 0.11515, + "grad_norm": 0.7811499622133515, + "learning_rate": 0.003, + "loss": 4.119, + "step": 11515 + }, + { + "epoch": 0.11516, + "grad_norm": 0.587781962912552, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11516 + }, + { + "epoch": 0.11517, + "grad_norm": 0.567097188926201, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 11517 + }, + { + "epoch": 0.11518, + "grad_norm": 0.5739870205232486, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 11518 + }, + { + "epoch": 0.11519, + "grad_norm": 0.5813935837226678, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 11519 + }, + { + "epoch": 0.1152, + "grad_norm": 0.5907545971598139, + "learning_rate": 0.003, + "loss": 4.039, + "step": 11520 + }, + { + "epoch": 0.11521, + "grad_norm": 0.5751735671742095, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 11521 + }, + { + "epoch": 0.11522, + "grad_norm": 0.6059075134416301, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11522 + }, + { + "epoch": 0.11523, + "grad_norm": 0.5832577261226649, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11523 + }, + { + "epoch": 0.11524, + "grad_norm": 0.6607645711799944, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 11524 + }, + { + "epoch": 0.11525, + "grad_norm": 0.8334570906671758, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 11525 + }, + { + "epoch": 0.11526, + "grad_norm": 1.0909624802832556, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 11526 + }, + { + "epoch": 0.11527, + "grad_norm": 0.8976589256201832, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 11527 + }, + { + "epoch": 0.11528, + "grad_norm": 0.8247848124617317, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 11528 + }, + { + "epoch": 0.11529, + "grad_norm": 0.7003087941348926, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 11529 + }, + { + "epoch": 0.1153, + "grad_norm": 0.6668416364375418, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 11530 + }, + { + "epoch": 0.11531, + "grad_norm": 0.6635552982535685, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 11531 + }, + { + "epoch": 0.11532, + "grad_norm": 0.6895585494087922, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 11532 + }, + { + "epoch": 0.11533, + "grad_norm": 0.7200153011206304, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 11533 + }, + { + "epoch": 0.11534, + "grad_norm": 0.7002579219392758, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11534 + }, + { + "epoch": 0.11535, + "grad_norm": 0.7115627139627968, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 11535 + }, + { + "epoch": 0.11536, + "grad_norm": 0.6951597815398586, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 11536 + }, + { + "epoch": 0.11537, + "grad_norm": 0.7358087564040365, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 11537 + }, + { + "epoch": 0.11538, + "grad_norm": 0.8336474073285849, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11538 + }, + { + "epoch": 0.11539, + "grad_norm": 0.8213311226682373, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 11539 + }, + { + "epoch": 0.1154, + "grad_norm": 0.8566023662184081, + "learning_rate": 0.003, + "loss": 4.1, + "step": 11540 + }, + { + "epoch": 0.11541, + "grad_norm": 0.967516698557583, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 11541 + }, + { + "epoch": 0.11542, + "grad_norm": 1.1379133822032814, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 11542 + }, + { + "epoch": 0.11543, + "grad_norm": 0.8990012504669521, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 11543 + }, + { + "epoch": 0.11544, + "grad_norm": 0.7155605857504481, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 11544 + }, + { + "epoch": 0.11545, + "grad_norm": 0.6319060568344189, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 11545 + }, + { + "epoch": 0.11546, + "grad_norm": 0.7034897758309688, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 11546 + }, + { + "epoch": 0.11547, + "grad_norm": 0.8461973251976806, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 11547 + }, + { + "epoch": 0.11548, + "grad_norm": 0.908594053225602, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 11548 + }, + { + "epoch": 0.11549, + "grad_norm": 0.8941198601074396, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 11549 + }, + { + "epoch": 0.1155, + "grad_norm": 1.0641433609898754, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 11550 + }, + { + "epoch": 0.11551, + "grad_norm": 1.1226709769273038, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 11551 + }, + { + "epoch": 0.11552, + "grad_norm": 0.8744132912716648, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 11552 + }, + { + "epoch": 0.11553, + "grad_norm": 0.8749775551915074, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11553 + }, + { + "epoch": 0.11554, + "grad_norm": 1.03868006630079, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 11554 + }, + { + "epoch": 0.11555, + "grad_norm": 1.1003305815443707, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 11555 + }, + { + "epoch": 0.11556, + "grad_norm": 0.7187702623784662, + "learning_rate": 0.003, + "loss": 4.057, + "step": 11556 + }, + { + "epoch": 0.11557, + "grad_norm": 0.6407519820442772, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11557 + }, + { + "epoch": 0.11558, + "grad_norm": 0.607065342726708, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11558 + }, + { + "epoch": 0.11559, + "grad_norm": 0.6156699142506642, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 11559 + }, + { + "epoch": 0.1156, + "grad_norm": 0.7095856346271662, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 11560 + }, + { + "epoch": 0.11561, + "grad_norm": 0.8191987480258054, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 11561 + }, + { + "epoch": 0.11562, + "grad_norm": 0.9460987749435492, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 11562 + }, + { + "epoch": 0.11563, + "grad_norm": 1.1061407229901132, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 11563 + }, + { + "epoch": 0.11564, + "grad_norm": 0.9598519414792321, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 11564 + }, + { + "epoch": 0.11565, + "grad_norm": 1.0113948710353295, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11565 + }, + { + "epoch": 0.11566, + "grad_norm": 0.9985959696805135, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 11566 + }, + { + "epoch": 0.11567, + "grad_norm": 0.9293996191943726, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 11567 + }, + { + "epoch": 0.11568, + "grad_norm": 0.8471543203842765, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 11568 + }, + { + "epoch": 0.11569, + "grad_norm": 0.7518687221166575, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 11569 + }, + { + "epoch": 0.1157, + "grad_norm": 0.7235992344577318, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 11570 + }, + { + "epoch": 0.11571, + "grad_norm": 0.690149421887094, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 11571 + }, + { + "epoch": 0.11572, + "grad_norm": 0.6652721769625423, + "learning_rate": 0.003, + "loss": 4.035, + "step": 11572 + }, + { + "epoch": 0.11573, + "grad_norm": 0.6896088693903043, + "learning_rate": 0.003, + "loss": 4.089, + "step": 11573 + }, + { + "epoch": 0.11574, + "grad_norm": 0.6949121480552437, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 11574 + }, + { + "epoch": 0.11575, + "grad_norm": 0.8298077286079635, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 11575 + }, + { + "epoch": 0.11576, + "grad_norm": 0.8883417883490657, + "learning_rate": 0.003, + "loss": 4.049, + "step": 11576 + }, + { + "epoch": 0.11577, + "grad_norm": 0.9051304976104619, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 11577 + }, + { + "epoch": 0.11578, + "grad_norm": 0.8752536710071942, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 11578 + }, + { + "epoch": 0.11579, + "grad_norm": 0.7653910663712051, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 11579 + }, + { + "epoch": 0.1158, + "grad_norm": 0.7465481995480454, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 11580 + }, + { + "epoch": 0.11581, + "grad_norm": 0.9199637978386034, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 11581 + }, + { + "epoch": 0.11582, + "grad_norm": 1.0783217332956068, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 11582 + }, + { + "epoch": 0.11583, + "grad_norm": 1.1107501153614088, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 11583 + }, + { + "epoch": 0.11584, + "grad_norm": 0.8180991703205399, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 11584 + }, + { + "epoch": 0.11585, + "grad_norm": 0.8100778301580475, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 11585 + }, + { + "epoch": 0.11586, + "grad_norm": 0.831553150897628, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 11586 + }, + { + "epoch": 0.11587, + "grad_norm": 0.6817930472661543, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 11587 + }, + { + "epoch": 0.11588, + "grad_norm": 0.5727527524744273, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11588 + }, + { + "epoch": 0.11589, + "grad_norm": 0.6316855955481255, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 11589 + }, + { + "epoch": 0.1159, + "grad_norm": 0.6752325329450864, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 11590 + }, + { + "epoch": 0.11591, + "grad_norm": 0.7232173881070808, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 11591 + }, + { + "epoch": 0.11592, + "grad_norm": 0.7063185542230442, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 11592 + }, + { + "epoch": 0.11593, + "grad_norm": 0.7074532846256573, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 11593 + }, + { + "epoch": 0.11594, + "grad_norm": 0.7188476307054906, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 11594 + }, + { + "epoch": 0.11595, + "grad_norm": 0.7967017363971508, + "learning_rate": 0.003, + "loss": 4.096, + "step": 11595 + }, + { + "epoch": 0.11596, + "grad_norm": 0.7979659641125272, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 11596 + }, + { + "epoch": 0.11597, + "grad_norm": 0.7503354963245181, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 11597 + }, + { + "epoch": 0.11598, + "grad_norm": 0.6968784511387319, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 11598 + }, + { + "epoch": 0.11599, + "grad_norm": 0.7322551139611673, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 11599 + }, + { + "epoch": 0.116, + "grad_norm": 0.7829563677335902, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 11600 + }, + { + "epoch": 0.11601, + "grad_norm": 0.8406288218812696, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 11601 + }, + { + "epoch": 0.11602, + "grad_norm": 0.907358702251944, + "learning_rate": 0.003, + "loss": 4.051, + "step": 11602 + }, + { + "epoch": 0.11603, + "grad_norm": 1.104466090694656, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11603 + }, + { + "epoch": 0.11604, + "grad_norm": 1.1445811229811231, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 11604 + }, + { + "epoch": 0.11605, + "grad_norm": 1.086292276099412, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 11605 + }, + { + "epoch": 0.11606, + "grad_norm": 1.0248341720469938, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 11606 + }, + { + "epoch": 0.11607, + "grad_norm": 0.8936072607044433, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 11607 + }, + { + "epoch": 0.11608, + "grad_norm": 0.6640762927725768, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 11608 + }, + { + "epoch": 0.11609, + "grad_norm": 0.5929874444616173, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 11609 + }, + { + "epoch": 0.1161, + "grad_norm": 0.7676302133161541, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 11610 + }, + { + "epoch": 0.11611, + "grad_norm": 0.9772519395033423, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 11611 + }, + { + "epoch": 0.11612, + "grad_norm": 1.1722821627389872, + "learning_rate": 0.003, + "loss": 4.1121, + "step": 11612 + }, + { + "epoch": 0.11613, + "grad_norm": 0.8317669096635092, + "learning_rate": 0.003, + "loss": 4.065, + "step": 11613 + }, + { + "epoch": 0.11614, + "grad_norm": 0.7171157192926751, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 11614 + }, + { + "epoch": 0.11615, + "grad_norm": 0.6794332467164689, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11615 + }, + { + "epoch": 0.11616, + "grad_norm": 0.6686040048708934, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 11616 + }, + { + "epoch": 0.11617, + "grad_norm": 0.6649123255778612, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 11617 + }, + { + "epoch": 0.11618, + "grad_norm": 0.6327469112612996, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 11618 + }, + { + "epoch": 0.11619, + "grad_norm": 0.6733908474561229, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 11619 + }, + { + "epoch": 0.1162, + "grad_norm": 0.9038689078618168, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 11620 + }, + { + "epoch": 0.11621, + "grad_norm": 0.9877567335502722, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 11621 + }, + { + "epoch": 0.11622, + "grad_norm": 1.0269046635334127, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 11622 + }, + { + "epoch": 0.11623, + "grad_norm": 1.2003887721207918, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 11623 + }, + { + "epoch": 0.11624, + "grad_norm": 0.8284791223906107, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 11624 + }, + { + "epoch": 0.11625, + "grad_norm": 0.7200125040651179, + "learning_rate": 0.003, + "loss": 4.069, + "step": 11625 + }, + { + "epoch": 0.11626, + "grad_norm": 0.7398519699744737, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 11626 + }, + { + "epoch": 0.11627, + "grad_norm": 0.781547249335416, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 11627 + }, + { + "epoch": 0.11628, + "grad_norm": 0.9735491541443749, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 11628 + }, + { + "epoch": 0.11629, + "grad_norm": 1.145587338551523, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 11629 + }, + { + "epoch": 0.1163, + "grad_norm": 0.8653126775605255, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 11630 + }, + { + "epoch": 0.11631, + "grad_norm": 0.7247010628569122, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 11631 + }, + { + "epoch": 0.11632, + "grad_norm": 0.6422645539925438, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 11632 + }, + { + "epoch": 0.11633, + "grad_norm": 0.6621774036007515, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 11633 + }, + { + "epoch": 0.11634, + "grad_norm": 0.79779597288848, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 11634 + }, + { + "epoch": 0.11635, + "grad_norm": 0.8893518897756485, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 11635 + }, + { + "epoch": 0.11636, + "grad_norm": 0.8572953980170954, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11636 + }, + { + "epoch": 0.11637, + "grad_norm": 0.8296041187898088, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 11637 + }, + { + "epoch": 0.11638, + "grad_norm": 0.8157149630498728, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 11638 + }, + { + "epoch": 0.11639, + "grad_norm": 0.8485733672985765, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11639 + }, + { + "epoch": 0.1164, + "grad_norm": 0.8444919120692772, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 11640 + }, + { + "epoch": 0.11641, + "grad_norm": 0.9167579257168221, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11641 + }, + { + "epoch": 0.11642, + "grad_norm": 1.0334298143941645, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 11642 + }, + { + "epoch": 0.11643, + "grad_norm": 0.9996522485793898, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 11643 + }, + { + "epoch": 0.11644, + "grad_norm": 0.9191598889332477, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 11644 + }, + { + "epoch": 0.11645, + "grad_norm": 0.786355606103955, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 11645 + }, + { + "epoch": 0.11646, + "grad_norm": 0.7874834113515161, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 11646 + }, + { + "epoch": 0.11647, + "grad_norm": 0.8462857762931152, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 11647 + }, + { + "epoch": 0.11648, + "grad_norm": 0.7219922126008644, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 11648 + }, + { + "epoch": 0.11649, + "grad_norm": 0.7870218836101154, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 11649 + }, + { + "epoch": 0.1165, + "grad_norm": 0.7831053281586716, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 11650 + }, + { + "epoch": 0.11651, + "grad_norm": 0.8416193416227049, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 11651 + }, + { + "epoch": 0.11652, + "grad_norm": 0.9220794419288686, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 11652 + }, + { + "epoch": 0.11653, + "grad_norm": 0.927703344770682, + "learning_rate": 0.003, + "loss": 4.093, + "step": 11653 + }, + { + "epoch": 0.11654, + "grad_norm": 0.9093528793704585, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 11654 + }, + { + "epoch": 0.11655, + "grad_norm": 0.76695683270814, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 11655 + }, + { + "epoch": 0.11656, + "grad_norm": 0.7569008148443639, + "learning_rate": 0.003, + "loss": 4.075, + "step": 11656 + }, + { + "epoch": 0.11657, + "grad_norm": 0.7569148698989775, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 11657 + }, + { + "epoch": 0.11658, + "grad_norm": 0.8316401753149308, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 11658 + }, + { + "epoch": 0.11659, + "grad_norm": 0.855298459137231, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 11659 + }, + { + "epoch": 0.1166, + "grad_norm": 0.7696793683113001, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 11660 + }, + { + "epoch": 0.11661, + "grad_norm": 0.8529284941515057, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 11661 + }, + { + "epoch": 0.11662, + "grad_norm": 0.7150325438414864, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 11662 + }, + { + "epoch": 0.11663, + "grad_norm": 0.6516355774146884, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 11663 + }, + { + "epoch": 0.11664, + "grad_norm": 0.6410165413648975, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 11664 + }, + { + "epoch": 0.11665, + "grad_norm": 0.6643487625692873, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 11665 + }, + { + "epoch": 0.11666, + "grad_norm": 0.7324064210364144, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 11666 + }, + { + "epoch": 0.11667, + "grad_norm": 0.9646425504933539, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 11667 + }, + { + "epoch": 0.11668, + "grad_norm": 1.3721880860905278, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11668 + }, + { + "epoch": 0.11669, + "grad_norm": 0.6659074294409868, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 11669 + }, + { + "epoch": 0.1167, + "grad_norm": 0.7318779336296473, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 11670 + }, + { + "epoch": 0.11671, + "grad_norm": 0.7227667586352463, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 11671 + }, + { + "epoch": 0.11672, + "grad_norm": 0.7121967652453894, + "learning_rate": 0.003, + "loss": 4.108, + "step": 11672 + }, + { + "epoch": 0.11673, + "grad_norm": 0.7810832836036794, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 11673 + }, + { + "epoch": 0.11674, + "grad_norm": 0.8147932367854734, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11674 + }, + { + "epoch": 0.11675, + "grad_norm": 0.8221780637712801, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 11675 + }, + { + "epoch": 0.11676, + "grad_norm": 0.8322387673293494, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11676 + }, + { + "epoch": 0.11677, + "grad_norm": 1.049561091040258, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 11677 + }, + { + "epoch": 0.11678, + "grad_norm": 1.1285293686759006, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 11678 + }, + { + "epoch": 0.11679, + "grad_norm": 0.7942929439071569, + "learning_rate": 0.003, + "loss": 4.076, + "step": 11679 + }, + { + "epoch": 0.1168, + "grad_norm": 0.8121823955224093, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 11680 + }, + { + "epoch": 0.11681, + "grad_norm": 0.886985617007665, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 11681 + }, + { + "epoch": 0.11682, + "grad_norm": 0.9835906722387846, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11682 + }, + { + "epoch": 0.11683, + "grad_norm": 0.9742855718416906, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 11683 + }, + { + "epoch": 0.11684, + "grad_norm": 0.9403544667282324, + "learning_rate": 0.003, + "loss": 4.088, + "step": 11684 + }, + { + "epoch": 0.11685, + "grad_norm": 0.8843233261450154, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 11685 + }, + { + "epoch": 0.11686, + "grad_norm": 0.9595432183246029, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 11686 + }, + { + "epoch": 0.11687, + "grad_norm": 0.9177948190718449, + "learning_rate": 0.003, + "loss": 4.101, + "step": 11687 + }, + { + "epoch": 0.11688, + "grad_norm": 0.7568265274966034, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11688 + }, + { + "epoch": 0.11689, + "grad_norm": 0.6724191688083208, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 11689 + }, + { + "epoch": 0.1169, + "grad_norm": 0.5319571543918213, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 11690 + }, + { + "epoch": 0.11691, + "grad_norm": 0.6091559698255019, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 11691 + }, + { + "epoch": 0.11692, + "grad_norm": 0.5314136756707564, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11692 + }, + { + "epoch": 0.11693, + "grad_norm": 0.6276209351566823, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 11693 + }, + { + "epoch": 0.11694, + "grad_norm": 0.5809151184858754, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 11694 + }, + { + "epoch": 0.11695, + "grad_norm": 0.5582511698901078, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 11695 + }, + { + "epoch": 0.11696, + "grad_norm": 0.625511431306643, + "learning_rate": 0.003, + "loss": 4.078, + "step": 11696 + }, + { + "epoch": 0.11697, + "grad_norm": 0.7422327835880065, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 11697 + }, + { + "epoch": 0.11698, + "grad_norm": 0.8382422647197152, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 11698 + }, + { + "epoch": 0.11699, + "grad_norm": 0.9000283347934224, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 11699 + }, + { + "epoch": 0.117, + "grad_norm": 1.0768928598652463, + "learning_rate": 0.003, + "loss": 4.081, + "step": 11700 + }, + { + "epoch": 0.11701, + "grad_norm": 1.0652656683545332, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11701 + }, + { + "epoch": 0.11702, + "grad_norm": 1.057544811604008, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 11702 + }, + { + "epoch": 0.11703, + "grad_norm": 0.9043756476621583, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 11703 + }, + { + "epoch": 0.11704, + "grad_norm": 1.0027396176106307, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 11704 + }, + { + "epoch": 0.11705, + "grad_norm": 0.9744740701229088, + "learning_rate": 0.003, + "loss": 4.076, + "step": 11705 + }, + { + "epoch": 0.11706, + "grad_norm": 0.9664960636352933, + "learning_rate": 0.003, + "loss": 4.1082, + "step": 11706 + }, + { + "epoch": 0.11707, + "grad_norm": 0.8875027587459325, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 11707 + }, + { + "epoch": 0.11708, + "grad_norm": 0.9513123997990248, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 11708 + }, + { + "epoch": 0.11709, + "grad_norm": 1.1466303315616597, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 11709 + }, + { + "epoch": 0.1171, + "grad_norm": 0.7135812377242298, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 11710 + }, + { + "epoch": 0.11711, + "grad_norm": 0.7452097554644643, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 11711 + }, + { + "epoch": 0.11712, + "grad_norm": 0.8389929029307317, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 11712 + }, + { + "epoch": 0.11713, + "grad_norm": 0.987386349524161, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 11713 + }, + { + "epoch": 0.11714, + "grad_norm": 0.9815075689891968, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 11714 + }, + { + "epoch": 0.11715, + "grad_norm": 0.9108941418492904, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 11715 + }, + { + "epoch": 0.11716, + "grad_norm": 0.8388527095547572, + "learning_rate": 0.003, + "loss": 4.056, + "step": 11716 + }, + { + "epoch": 0.11717, + "grad_norm": 0.6655470826961603, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 11717 + }, + { + "epoch": 0.11718, + "grad_norm": 0.6149356984381353, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11718 + }, + { + "epoch": 0.11719, + "grad_norm": 0.6553899157977472, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 11719 + }, + { + "epoch": 0.1172, + "grad_norm": 0.6599660519536159, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 11720 + }, + { + "epoch": 0.11721, + "grad_norm": 0.6891664979129074, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11721 + }, + { + "epoch": 0.11722, + "grad_norm": 0.7551605902492077, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 11722 + }, + { + "epoch": 0.11723, + "grad_norm": 0.9437203521693749, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 11723 + }, + { + "epoch": 0.11724, + "grad_norm": 1.063671318367107, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 11724 + }, + { + "epoch": 0.11725, + "grad_norm": 0.885530718983848, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 11725 + }, + { + "epoch": 0.11726, + "grad_norm": 0.7116632977684936, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 11726 + }, + { + "epoch": 0.11727, + "grad_norm": 0.6188393045829603, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 11727 + }, + { + "epoch": 0.11728, + "grad_norm": 0.6111188456564599, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 11728 + }, + { + "epoch": 0.11729, + "grad_norm": 0.6625273579626278, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 11729 + }, + { + "epoch": 0.1173, + "grad_norm": 0.7720788705683885, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11730 + }, + { + "epoch": 0.11731, + "grad_norm": 0.9295753621861833, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 11731 + }, + { + "epoch": 0.11732, + "grad_norm": 1.117967364464593, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 11732 + }, + { + "epoch": 0.11733, + "grad_norm": 0.7599278186135924, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11733 + }, + { + "epoch": 0.11734, + "grad_norm": 0.6147265724899618, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 11734 + }, + { + "epoch": 0.11735, + "grad_norm": 0.6725320357867157, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 11735 + }, + { + "epoch": 0.11736, + "grad_norm": 0.7456835884216022, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 11736 + }, + { + "epoch": 0.11737, + "grad_norm": 0.812797363085294, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 11737 + }, + { + "epoch": 0.11738, + "grad_norm": 0.8129215074388013, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 11738 + }, + { + "epoch": 0.11739, + "grad_norm": 0.9121844049181134, + "learning_rate": 0.003, + "loss": 4.087, + "step": 11739 + }, + { + "epoch": 0.1174, + "grad_norm": 0.9265831704683285, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11740 + }, + { + "epoch": 0.11741, + "grad_norm": 0.9117372442258332, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 11741 + }, + { + "epoch": 0.11742, + "grad_norm": 0.8776990703973094, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 11742 + }, + { + "epoch": 0.11743, + "grad_norm": 0.9618020801881524, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 11743 + }, + { + "epoch": 0.11744, + "grad_norm": 0.9316312393728313, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 11744 + }, + { + "epoch": 0.11745, + "grad_norm": 1.031508767645295, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 11745 + }, + { + "epoch": 0.11746, + "grad_norm": 1.0480620764497686, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 11746 + }, + { + "epoch": 0.11747, + "grad_norm": 0.8812457935053363, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 11747 + }, + { + "epoch": 0.11748, + "grad_norm": 0.8446087681633835, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 11748 + }, + { + "epoch": 0.11749, + "grad_norm": 0.8806053106912328, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 11749 + }, + { + "epoch": 0.1175, + "grad_norm": 1.0652412401055025, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 11750 + }, + { + "epoch": 0.11751, + "grad_norm": 1.0154102072008697, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 11751 + }, + { + "epoch": 0.11752, + "grad_norm": 0.8782679887197985, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 11752 + }, + { + "epoch": 0.11753, + "grad_norm": 0.8768908843163372, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 11753 + }, + { + "epoch": 0.11754, + "grad_norm": 0.955884951313414, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 11754 + }, + { + "epoch": 0.11755, + "grad_norm": 0.9232099819063957, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 11755 + }, + { + "epoch": 0.11756, + "grad_norm": 0.8791836223086985, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 11756 + }, + { + "epoch": 0.11757, + "grad_norm": 1.0426667558176133, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 11757 + }, + { + "epoch": 0.11758, + "grad_norm": 1.1146348535077464, + "learning_rate": 0.003, + "loss": 4.1228, + "step": 11758 + }, + { + "epoch": 0.11759, + "grad_norm": 0.7774305248469459, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 11759 + }, + { + "epoch": 0.1176, + "grad_norm": 0.725058397672122, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 11760 + }, + { + "epoch": 0.11761, + "grad_norm": 0.7524803504936709, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 11761 + }, + { + "epoch": 0.11762, + "grad_norm": 0.6014674016359951, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11762 + }, + { + "epoch": 0.11763, + "grad_norm": 0.6911615119202653, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 11763 + }, + { + "epoch": 0.11764, + "grad_norm": 0.9141987084938712, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 11764 + }, + { + "epoch": 0.11765, + "grad_norm": 1.0528972268151409, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 11765 + }, + { + "epoch": 0.11766, + "grad_norm": 1.0784419150121876, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 11766 + }, + { + "epoch": 0.11767, + "grad_norm": 0.8858386866249308, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11767 + }, + { + "epoch": 0.11768, + "grad_norm": 0.8405360479720302, + "learning_rate": 0.003, + "loss": 4.084, + "step": 11768 + }, + { + "epoch": 0.11769, + "grad_norm": 0.686081864034834, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11769 + }, + { + "epoch": 0.1177, + "grad_norm": 0.5832102646409766, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 11770 + }, + { + "epoch": 0.11771, + "grad_norm": 0.5616671018869388, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 11771 + }, + { + "epoch": 0.11772, + "grad_norm": 0.5590360587542899, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 11772 + }, + { + "epoch": 0.11773, + "grad_norm": 0.5679055088444243, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11773 + }, + { + "epoch": 0.11774, + "grad_norm": 0.4680771136119359, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 11774 + }, + { + "epoch": 0.11775, + "grad_norm": 0.5092729926954808, + "learning_rate": 0.003, + "loss": 4.067, + "step": 11775 + }, + { + "epoch": 0.11776, + "grad_norm": 0.5424922004773088, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 11776 + }, + { + "epoch": 0.11777, + "grad_norm": 0.5936151804472162, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 11777 + }, + { + "epoch": 0.11778, + "grad_norm": 0.6732649637257471, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 11778 + }, + { + "epoch": 0.11779, + "grad_norm": 0.7690416533056713, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11779 + }, + { + "epoch": 0.1178, + "grad_norm": 0.8353140580453683, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 11780 + }, + { + "epoch": 0.11781, + "grad_norm": 0.8935837715211247, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 11781 + }, + { + "epoch": 0.11782, + "grad_norm": 1.0531796991636666, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11782 + }, + { + "epoch": 0.11783, + "grad_norm": 1.079224226296788, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 11783 + }, + { + "epoch": 0.11784, + "grad_norm": 0.8356900460632927, + "learning_rate": 0.003, + "loss": 4.068, + "step": 11784 + }, + { + "epoch": 0.11785, + "grad_norm": 0.8084761278959023, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 11785 + }, + { + "epoch": 0.11786, + "grad_norm": 0.82914704043006, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 11786 + }, + { + "epoch": 0.11787, + "grad_norm": 0.6775644295176407, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11787 + }, + { + "epoch": 0.11788, + "grad_norm": 0.6655348429091607, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 11788 + }, + { + "epoch": 0.11789, + "grad_norm": 0.6407300835634582, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 11789 + }, + { + "epoch": 0.1179, + "grad_norm": 0.7638731149124559, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11790 + }, + { + "epoch": 0.11791, + "grad_norm": 1.0282178932707131, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 11791 + }, + { + "epoch": 0.11792, + "grad_norm": 1.264008956440248, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11792 + }, + { + "epoch": 0.11793, + "grad_norm": 0.6519522242962901, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 11793 + }, + { + "epoch": 0.11794, + "grad_norm": 0.5935865323698845, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 11794 + }, + { + "epoch": 0.11795, + "grad_norm": 0.6666966012390865, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 11795 + }, + { + "epoch": 0.11796, + "grad_norm": 0.8503744688629382, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 11796 + }, + { + "epoch": 0.11797, + "grad_norm": 1.0816103202425806, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 11797 + }, + { + "epoch": 0.11798, + "grad_norm": 0.8486292640258042, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 11798 + }, + { + "epoch": 0.11799, + "grad_norm": 0.774259880796334, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 11799 + }, + { + "epoch": 0.118, + "grad_norm": 0.6827970488744037, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 11800 + }, + { + "epoch": 0.11801, + "grad_norm": 0.680022168428237, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 11801 + }, + { + "epoch": 0.11802, + "grad_norm": 0.7608902817432555, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 11802 + }, + { + "epoch": 0.11803, + "grad_norm": 0.8771018181150345, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 11803 + }, + { + "epoch": 0.11804, + "grad_norm": 1.0895264488416811, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 11804 + }, + { + "epoch": 0.11805, + "grad_norm": 1.1713998207842216, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 11805 + }, + { + "epoch": 0.11806, + "grad_norm": 0.9559247722625149, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 11806 + }, + { + "epoch": 0.11807, + "grad_norm": 1.0176177339184564, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11807 + }, + { + "epoch": 0.11808, + "grad_norm": 0.9828894080361346, + "learning_rate": 0.003, + "loss": 4.071, + "step": 11808 + }, + { + "epoch": 0.11809, + "grad_norm": 0.9078709897541808, + "learning_rate": 0.003, + "loss": 4.101, + "step": 11809 + }, + { + "epoch": 0.1181, + "grad_norm": 0.7298296561198457, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 11810 + }, + { + "epoch": 0.11811, + "grad_norm": 0.7136721860676385, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 11811 + }, + { + "epoch": 0.11812, + "grad_norm": 0.7120421825821814, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 11812 + }, + { + "epoch": 0.11813, + "grad_norm": 0.7436975761264889, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 11813 + }, + { + "epoch": 0.11814, + "grad_norm": 0.6765075476982464, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11814 + }, + { + "epoch": 0.11815, + "grad_norm": 0.7008626110877016, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 11815 + }, + { + "epoch": 0.11816, + "grad_norm": 0.7690086118905748, + "learning_rate": 0.003, + "loss": 4.058, + "step": 11816 + }, + { + "epoch": 0.11817, + "grad_norm": 0.8286856888738373, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 11817 + }, + { + "epoch": 0.11818, + "grad_norm": 0.833350887245866, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11818 + }, + { + "epoch": 0.11819, + "grad_norm": 0.8133365071024339, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 11819 + }, + { + "epoch": 0.1182, + "grad_norm": 0.8950574441093196, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11820 + }, + { + "epoch": 0.11821, + "grad_norm": 0.8314993561563281, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 11821 + }, + { + "epoch": 0.11822, + "grad_norm": 0.879384195427867, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 11822 + }, + { + "epoch": 0.11823, + "grad_norm": 0.8712791151137181, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 11823 + }, + { + "epoch": 0.11824, + "grad_norm": 0.7946335403095336, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 11824 + }, + { + "epoch": 0.11825, + "grad_norm": 0.9251946099724122, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 11825 + }, + { + "epoch": 0.11826, + "grad_norm": 0.9089433872644047, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 11826 + }, + { + "epoch": 0.11827, + "grad_norm": 0.960148991783686, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 11827 + }, + { + "epoch": 0.11828, + "grad_norm": 0.9553461678044473, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 11828 + }, + { + "epoch": 0.11829, + "grad_norm": 0.793056431276092, + "learning_rate": 0.003, + "loss": 4.106, + "step": 11829 + }, + { + "epoch": 0.1183, + "grad_norm": 0.7258879341191068, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 11830 + }, + { + "epoch": 0.11831, + "grad_norm": 0.7398380601017597, + "learning_rate": 0.003, + "loss": 4.086, + "step": 11831 + }, + { + "epoch": 0.11832, + "grad_norm": 0.6754318588653985, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 11832 + }, + { + "epoch": 0.11833, + "grad_norm": 0.6170706589009188, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 11833 + }, + { + "epoch": 0.11834, + "grad_norm": 0.646106463233276, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 11834 + }, + { + "epoch": 0.11835, + "grad_norm": 0.8762103450992815, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 11835 + }, + { + "epoch": 0.11836, + "grad_norm": 1.2776762081665294, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 11836 + }, + { + "epoch": 0.11837, + "grad_norm": 0.8108701163884705, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 11837 + }, + { + "epoch": 0.11838, + "grad_norm": 0.6940338943383062, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 11838 + }, + { + "epoch": 0.11839, + "grad_norm": 0.7901864807977117, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 11839 + }, + { + "epoch": 0.1184, + "grad_norm": 0.7807961097446459, + "learning_rate": 0.003, + "loss": 4.07, + "step": 11840 + }, + { + "epoch": 0.11841, + "grad_norm": 0.9471463996011317, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 11841 + }, + { + "epoch": 0.11842, + "grad_norm": 0.8478233584393488, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 11842 + }, + { + "epoch": 0.11843, + "grad_norm": 0.763512806199942, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 11843 + }, + { + "epoch": 0.11844, + "grad_norm": 0.6452078727657857, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 11844 + }, + { + "epoch": 0.11845, + "grad_norm": 0.6445995030373571, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 11845 + }, + { + "epoch": 0.11846, + "grad_norm": 0.742374167214842, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 11846 + }, + { + "epoch": 0.11847, + "grad_norm": 0.7351815640255105, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11847 + }, + { + "epoch": 0.11848, + "grad_norm": 0.8196365500471058, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 11848 + }, + { + "epoch": 0.11849, + "grad_norm": 0.831588584012136, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 11849 + }, + { + "epoch": 0.1185, + "grad_norm": 0.8433679354730954, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 11850 + }, + { + "epoch": 0.11851, + "grad_norm": 0.9137556915989256, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 11851 + }, + { + "epoch": 0.11852, + "grad_norm": 1.0697839356450594, + "learning_rate": 0.003, + "loss": 4.077, + "step": 11852 + }, + { + "epoch": 0.11853, + "grad_norm": 1.049866632695257, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 11853 + }, + { + "epoch": 0.11854, + "grad_norm": 0.8671901419952202, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 11854 + }, + { + "epoch": 0.11855, + "grad_norm": 0.9208510989321324, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 11855 + }, + { + "epoch": 0.11856, + "grad_norm": 0.8926572862378708, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 11856 + }, + { + "epoch": 0.11857, + "grad_norm": 0.8433740093332531, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 11857 + }, + { + "epoch": 0.11858, + "grad_norm": 0.8254144915988062, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 11858 + }, + { + "epoch": 0.11859, + "grad_norm": 0.7926083472119856, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 11859 + }, + { + "epoch": 0.1186, + "grad_norm": 0.794748225166051, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 11860 + }, + { + "epoch": 0.11861, + "grad_norm": 0.8684563379643798, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 11861 + }, + { + "epoch": 0.11862, + "grad_norm": 0.8851848538484391, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 11862 + }, + { + "epoch": 0.11863, + "grad_norm": 0.73032091671914, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 11863 + }, + { + "epoch": 0.11864, + "grad_norm": 0.6668427993843925, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 11864 + }, + { + "epoch": 0.11865, + "grad_norm": 0.6165145527182839, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 11865 + }, + { + "epoch": 0.11866, + "grad_norm": 0.612055241418331, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 11866 + }, + { + "epoch": 0.11867, + "grad_norm": 0.7059845049393006, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 11867 + }, + { + "epoch": 0.11868, + "grad_norm": 0.7560768778001865, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 11868 + }, + { + "epoch": 0.11869, + "grad_norm": 0.8673186931348383, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 11869 + }, + { + "epoch": 0.1187, + "grad_norm": 1.010879366295258, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 11870 + }, + { + "epoch": 0.11871, + "grad_norm": 1.0209959618047066, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 11871 + }, + { + "epoch": 0.11872, + "grad_norm": 0.9860088323911688, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 11872 + }, + { + "epoch": 0.11873, + "grad_norm": 1.1659158688060458, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 11873 + }, + { + "epoch": 0.11874, + "grad_norm": 0.8094261387221169, + "learning_rate": 0.003, + "loss": 4.042, + "step": 11874 + }, + { + "epoch": 0.11875, + "grad_norm": 0.6335448424270009, + "learning_rate": 0.003, + "loss": 4.08, + "step": 11875 + }, + { + "epoch": 0.11876, + "grad_norm": 0.6773731825309582, + "learning_rate": 0.003, + "loss": 4.052, + "step": 11876 + }, + { + "epoch": 0.11877, + "grad_norm": 0.6984872535116806, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 11877 + }, + { + "epoch": 0.11878, + "grad_norm": 0.7251659690338177, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 11878 + }, + { + "epoch": 0.11879, + "grad_norm": 0.7588124610807487, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 11879 + }, + { + "epoch": 0.1188, + "grad_norm": 0.9426291501868473, + "learning_rate": 0.003, + "loss": 4.084, + "step": 11880 + }, + { + "epoch": 0.11881, + "grad_norm": 0.8549154743151803, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 11881 + }, + { + "epoch": 0.11882, + "grad_norm": 0.7486076080102272, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 11882 + }, + { + "epoch": 0.11883, + "grad_norm": 0.7005995077416897, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 11883 + }, + { + "epoch": 0.11884, + "grad_norm": 0.6614383598281759, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 11884 + }, + { + "epoch": 0.11885, + "grad_norm": 0.7390909664238483, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 11885 + }, + { + "epoch": 0.11886, + "grad_norm": 0.8006965897045121, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 11886 + }, + { + "epoch": 0.11887, + "grad_norm": 0.7146342820208608, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 11887 + }, + { + "epoch": 0.11888, + "grad_norm": 0.6767998942626821, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 11888 + }, + { + "epoch": 0.11889, + "grad_norm": 0.7800125822291902, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 11889 + }, + { + "epoch": 0.1189, + "grad_norm": 0.8777806739707887, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 11890 + }, + { + "epoch": 0.11891, + "grad_norm": 0.9746936424903314, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 11891 + }, + { + "epoch": 0.11892, + "grad_norm": 1.1884031352290723, + "learning_rate": 0.003, + "loss": 4.09, + "step": 11892 + }, + { + "epoch": 0.11893, + "grad_norm": 1.13475999737239, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 11893 + }, + { + "epoch": 0.11894, + "grad_norm": 0.8967699394773493, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 11894 + }, + { + "epoch": 0.11895, + "grad_norm": 0.8007111264651628, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 11895 + }, + { + "epoch": 0.11896, + "grad_norm": 0.7307540515461585, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 11896 + }, + { + "epoch": 0.11897, + "grad_norm": 0.7815929432354917, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11897 + }, + { + "epoch": 0.11898, + "grad_norm": 0.7385538729204432, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 11898 + }, + { + "epoch": 0.11899, + "grad_norm": 0.6650532955074363, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 11899 + }, + { + "epoch": 0.119, + "grad_norm": 0.6876031187179125, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 11900 + }, + { + "epoch": 0.11901, + "grad_norm": 0.6693901471853071, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 11901 + }, + { + "epoch": 0.11902, + "grad_norm": 0.7450683537894556, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 11902 + }, + { + "epoch": 0.11903, + "grad_norm": 0.9583378538144384, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 11903 + }, + { + "epoch": 0.11904, + "grad_norm": 1.2031401004775466, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 11904 + }, + { + "epoch": 0.11905, + "grad_norm": 0.6708910129080695, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 11905 + }, + { + "epoch": 0.11906, + "grad_norm": 0.6088762788761715, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 11906 + }, + { + "epoch": 0.11907, + "grad_norm": 0.6657607666816874, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 11907 + }, + { + "epoch": 0.11908, + "grad_norm": 0.7841564673513367, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 11908 + }, + { + "epoch": 0.11909, + "grad_norm": 0.9021268502553939, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 11909 + }, + { + "epoch": 0.1191, + "grad_norm": 0.916707108404376, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 11910 + }, + { + "epoch": 0.11911, + "grad_norm": 0.8246149905091653, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 11911 + }, + { + "epoch": 0.11912, + "grad_norm": 0.7323629401540885, + "learning_rate": 0.003, + "loss": 4.1161, + "step": 11912 + }, + { + "epoch": 0.11913, + "grad_norm": 0.9061360685992332, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 11913 + }, + { + "epoch": 0.11914, + "grad_norm": 1.0343836872001413, + "learning_rate": 0.003, + "loss": 4.111, + "step": 11914 + }, + { + "epoch": 0.11915, + "grad_norm": 0.997844312316651, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 11915 + }, + { + "epoch": 0.11916, + "grad_norm": 1.1217834060028307, + "learning_rate": 0.003, + "loss": 4.116, + "step": 11916 + }, + { + "epoch": 0.11917, + "grad_norm": 0.9155676472690304, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 11917 + }, + { + "epoch": 0.11918, + "grad_norm": 0.8305774438930578, + "learning_rate": 0.003, + "loss": 4.091, + "step": 11918 + }, + { + "epoch": 0.11919, + "grad_norm": 0.692585947841226, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 11919 + }, + { + "epoch": 0.1192, + "grad_norm": 0.615359817362125, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 11920 + }, + { + "epoch": 0.11921, + "grad_norm": 0.6550229673560121, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 11921 + }, + { + "epoch": 0.11922, + "grad_norm": 0.7481644276266792, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 11922 + }, + { + "epoch": 0.11923, + "grad_norm": 0.9216830062652961, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 11923 + }, + { + "epoch": 0.11924, + "grad_norm": 1.148633737473478, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 11924 + }, + { + "epoch": 0.11925, + "grad_norm": 0.88659831115291, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 11925 + }, + { + "epoch": 0.11926, + "grad_norm": 0.8069585267153818, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 11926 + }, + { + "epoch": 0.11927, + "grad_norm": 0.7347630623585638, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 11927 + }, + { + "epoch": 0.11928, + "grad_norm": 0.7050032718999796, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 11928 + }, + { + "epoch": 0.11929, + "grad_norm": 0.7815451331357121, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 11929 + }, + { + "epoch": 0.1193, + "grad_norm": 0.6849344807101223, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 11930 + }, + { + "epoch": 0.11931, + "grad_norm": 0.679361133468356, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 11931 + }, + { + "epoch": 0.11932, + "grad_norm": 0.8450698013980862, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 11932 + }, + { + "epoch": 0.11933, + "grad_norm": 0.8835274823568191, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 11933 + }, + { + "epoch": 0.11934, + "grad_norm": 0.8690045370066336, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 11934 + }, + { + "epoch": 0.11935, + "grad_norm": 1.0116570673577854, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 11935 + }, + { + "epoch": 0.11936, + "grad_norm": 1.0767742621445415, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 11936 + }, + { + "epoch": 0.11937, + "grad_norm": 0.9916078684299126, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 11937 + }, + { + "epoch": 0.11938, + "grad_norm": 0.9222300875660133, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 11938 + }, + { + "epoch": 0.11939, + "grad_norm": 0.8135030956349809, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 11939 + }, + { + "epoch": 0.1194, + "grad_norm": 0.8370228867878781, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 11940 + }, + { + "epoch": 0.11941, + "grad_norm": 0.8571547503694238, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 11941 + }, + { + "epoch": 0.11942, + "grad_norm": 0.805516408836029, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 11942 + }, + { + "epoch": 0.11943, + "grad_norm": 0.8193503171831941, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 11943 + }, + { + "epoch": 0.11944, + "grad_norm": 0.8006684034359407, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 11944 + }, + { + "epoch": 0.11945, + "grad_norm": 0.76548572031874, + "learning_rate": 0.003, + "loss": 4.052, + "step": 11945 + }, + { + "epoch": 0.11946, + "grad_norm": 0.8012017184250187, + "learning_rate": 0.003, + "loss": 4.067, + "step": 11946 + }, + { + "epoch": 0.11947, + "grad_norm": 0.7268659883443598, + "learning_rate": 0.003, + "loss": 4.095, + "step": 11947 + }, + { + "epoch": 0.11948, + "grad_norm": 0.748838960756178, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 11948 + }, + { + "epoch": 0.11949, + "grad_norm": 0.859028005033597, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 11949 + }, + { + "epoch": 0.1195, + "grad_norm": 0.9373645772108132, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 11950 + }, + { + "epoch": 0.11951, + "grad_norm": 1.2337400168375312, + "learning_rate": 0.003, + "loss": 4.095, + "step": 11951 + }, + { + "epoch": 0.11952, + "grad_norm": 0.9365772417221291, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 11952 + }, + { + "epoch": 0.11953, + "grad_norm": 0.8691607276398206, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 11953 + }, + { + "epoch": 0.11954, + "grad_norm": 0.8045361104724315, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 11954 + }, + { + "epoch": 0.11955, + "grad_norm": 0.822560519294801, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 11955 + }, + { + "epoch": 0.11956, + "grad_norm": 0.9071708859836859, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 11956 + }, + { + "epoch": 0.11957, + "grad_norm": 1.0317809037825372, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 11957 + }, + { + "epoch": 0.11958, + "grad_norm": 1.0457569164754317, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 11958 + }, + { + "epoch": 0.11959, + "grad_norm": 0.9491594661548564, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 11959 + }, + { + "epoch": 0.1196, + "grad_norm": 0.8583938821808423, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 11960 + }, + { + "epoch": 0.11961, + "grad_norm": 0.7415940680604313, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 11961 + }, + { + "epoch": 0.11962, + "grad_norm": 0.6353780517582217, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 11962 + }, + { + "epoch": 0.11963, + "grad_norm": 0.7983793537208222, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 11963 + }, + { + "epoch": 0.11964, + "grad_norm": 0.7886334581246542, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 11964 + }, + { + "epoch": 0.11965, + "grad_norm": 0.718014453801322, + "learning_rate": 0.003, + "loss": 4.086, + "step": 11965 + }, + { + "epoch": 0.11966, + "grad_norm": 0.7093182418337178, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 11966 + }, + { + "epoch": 0.11967, + "grad_norm": 0.6216164077811807, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 11967 + }, + { + "epoch": 0.11968, + "grad_norm": 0.5674808291344278, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 11968 + }, + { + "epoch": 0.11969, + "grad_norm": 0.5220969467595893, + "learning_rate": 0.003, + "loss": 4.087, + "step": 11969 + }, + { + "epoch": 0.1197, + "grad_norm": 0.5201727120164347, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 11970 + }, + { + "epoch": 0.11971, + "grad_norm": 0.45658338264133513, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 11971 + }, + { + "epoch": 0.11972, + "grad_norm": 0.5541459167779041, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 11972 + }, + { + "epoch": 0.11973, + "grad_norm": 0.6364115851727812, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 11973 + }, + { + "epoch": 0.11974, + "grad_norm": 0.7923773553003368, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 11974 + }, + { + "epoch": 0.11975, + "grad_norm": 1.119949647941313, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 11975 + }, + { + "epoch": 0.11976, + "grad_norm": 1.23207415112035, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 11976 + }, + { + "epoch": 0.11977, + "grad_norm": 0.6590221214615162, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 11977 + }, + { + "epoch": 0.11978, + "grad_norm": 0.6948614605624547, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 11978 + }, + { + "epoch": 0.11979, + "grad_norm": 0.8847266907483391, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 11979 + }, + { + "epoch": 0.1198, + "grad_norm": 0.9312694274233432, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 11980 + }, + { + "epoch": 0.11981, + "grad_norm": 0.9461444011302217, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 11981 + }, + { + "epoch": 0.11982, + "grad_norm": 0.8758913587065924, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 11982 + }, + { + "epoch": 0.11983, + "grad_norm": 0.8760617492377928, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 11983 + }, + { + "epoch": 0.11984, + "grad_norm": 0.7243109034421162, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 11984 + }, + { + "epoch": 0.11985, + "grad_norm": 0.7004600396377721, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 11985 + }, + { + "epoch": 0.11986, + "grad_norm": 0.7221015934897439, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 11986 + }, + { + "epoch": 0.11987, + "grad_norm": 0.7871829593065788, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 11987 + }, + { + "epoch": 0.11988, + "grad_norm": 0.7075770816654835, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 11988 + }, + { + "epoch": 0.11989, + "grad_norm": 0.7606136914808911, + "learning_rate": 0.003, + "loss": 4.082, + "step": 11989 + }, + { + "epoch": 0.1199, + "grad_norm": 0.9010988151799043, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 11990 + }, + { + "epoch": 0.11991, + "grad_norm": 0.9786531877527461, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 11991 + }, + { + "epoch": 0.11992, + "grad_norm": 1.0463234459105692, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 11992 + }, + { + "epoch": 0.11993, + "grad_norm": 0.9405513356826912, + "learning_rate": 0.003, + "loss": 4.065, + "step": 11993 + }, + { + "epoch": 0.11994, + "grad_norm": 0.9646495068141073, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 11994 + }, + { + "epoch": 0.11995, + "grad_norm": 1.0309215622098242, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 11995 + }, + { + "epoch": 0.11996, + "grad_norm": 0.9241477306836697, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 11996 + }, + { + "epoch": 0.11997, + "grad_norm": 0.8097824344911, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 11997 + }, + { + "epoch": 0.11998, + "grad_norm": 0.7067736238117723, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 11998 + }, + { + "epoch": 0.11999, + "grad_norm": 0.6538639483861481, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 11999 + }, + { + "epoch": 0.12, + "grad_norm": 0.7659243077400497, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 12000 + }, + { + "epoch": 0.12001, + "grad_norm": 0.8023521521090117, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 12001 + }, + { + "epoch": 0.12002, + "grad_norm": 0.961242206121727, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 12002 + }, + { + "epoch": 0.12003, + "grad_norm": 1.2526177653248893, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 12003 + }, + { + "epoch": 0.12004, + "grad_norm": 0.8410304975750107, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12004 + }, + { + "epoch": 0.12005, + "grad_norm": 0.7279790520142565, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 12005 + }, + { + "epoch": 0.12006, + "grad_norm": 0.7285212920146146, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 12006 + }, + { + "epoch": 0.12007, + "grad_norm": 0.6647624461558206, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12007 + }, + { + "epoch": 0.12008, + "grad_norm": 0.6419513250348899, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 12008 + }, + { + "epoch": 0.12009, + "grad_norm": 0.642549189417902, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 12009 + }, + { + "epoch": 0.1201, + "grad_norm": 0.7023091498881583, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 12010 + }, + { + "epoch": 0.12011, + "grad_norm": 0.7171302422861198, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 12011 + }, + { + "epoch": 0.12012, + "grad_norm": 0.7942440278284121, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 12012 + }, + { + "epoch": 0.12013, + "grad_norm": 0.8896477823603075, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 12013 + }, + { + "epoch": 0.12014, + "grad_norm": 0.8837668069948614, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 12014 + }, + { + "epoch": 0.12015, + "grad_norm": 0.8435689467729217, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 12015 + }, + { + "epoch": 0.12016, + "grad_norm": 0.9212366189235262, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 12016 + }, + { + "epoch": 0.12017, + "grad_norm": 0.9053821425733813, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 12017 + }, + { + "epoch": 0.12018, + "grad_norm": 1.070929549236012, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 12018 + }, + { + "epoch": 0.12019, + "grad_norm": 1.1632855793109629, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 12019 + }, + { + "epoch": 0.1202, + "grad_norm": 0.9689388366189522, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 12020 + }, + { + "epoch": 0.12021, + "grad_norm": 0.8805816300549908, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 12021 + }, + { + "epoch": 0.12022, + "grad_norm": 0.9393700862182391, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 12022 + }, + { + "epoch": 0.12023, + "grad_norm": 0.9408800937079208, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 12023 + }, + { + "epoch": 0.12024, + "grad_norm": 0.8780239980937175, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12024 + }, + { + "epoch": 0.12025, + "grad_norm": 0.8283146623762387, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12025 + }, + { + "epoch": 0.12026, + "grad_norm": 0.8358219955732257, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 12026 + }, + { + "epoch": 0.12027, + "grad_norm": 0.6763683484972164, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 12027 + }, + { + "epoch": 0.12028, + "grad_norm": 0.7241408294923761, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 12028 + }, + { + "epoch": 0.12029, + "grad_norm": 0.9977567222444976, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 12029 + }, + { + "epoch": 0.1203, + "grad_norm": 1.4343501765745075, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 12030 + }, + { + "epoch": 0.12031, + "grad_norm": 0.5934750778806325, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12031 + }, + { + "epoch": 0.12032, + "grad_norm": 0.7317713488446722, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 12032 + }, + { + "epoch": 0.12033, + "grad_norm": 0.8453119025974559, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 12033 + }, + { + "epoch": 0.12034, + "grad_norm": 0.8886814508473693, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 12034 + }, + { + "epoch": 0.12035, + "grad_norm": 0.9104362807341917, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 12035 + }, + { + "epoch": 0.12036, + "grad_norm": 0.9600326587977933, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 12036 + }, + { + "epoch": 0.12037, + "grad_norm": 0.8723262804067118, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 12037 + }, + { + "epoch": 0.12038, + "grad_norm": 0.8081761707745708, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 12038 + }, + { + "epoch": 0.12039, + "grad_norm": 0.7495659446732845, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 12039 + }, + { + "epoch": 0.1204, + "grad_norm": 0.7120487632255719, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 12040 + }, + { + "epoch": 0.12041, + "grad_norm": 0.8530661194106129, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 12041 + }, + { + "epoch": 0.12042, + "grad_norm": 0.882956070651322, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 12042 + }, + { + "epoch": 0.12043, + "grad_norm": 0.8017395404248646, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 12043 + }, + { + "epoch": 0.12044, + "grad_norm": 0.6723683416837811, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 12044 + }, + { + "epoch": 0.12045, + "grad_norm": 0.6584653483199643, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 12045 + }, + { + "epoch": 0.12046, + "grad_norm": 0.7035025023856035, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 12046 + }, + { + "epoch": 0.12047, + "grad_norm": 0.8530059671037054, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 12047 + }, + { + "epoch": 0.12048, + "grad_norm": 1.0610477177598243, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12048 + }, + { + "epoch": 0.12049, + "grad_norm": 0.925017658633742, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 12049 + }, + { + "epoch": 0.1205, + "grad_norm": 0.9545499424695932, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 12050 + }, + { + "epoch": 0.12051, + "grad_norm": 0.9724560908706467, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 12051 + }, + { + "epoch": 0.12052, + "grad_norm": 1.0799923024937357, + "learning_rate": 0.003, + "loss": 4.106, + "step": 12052 + }, + { + "epoch": 0.12053, + "grad_norm": 0.9852278779342245, + "learning_rate": 0.003, + "loss": 4.097, + "step": 12053 + }, + { + "epoch": 0.12054, + "grad_norm": 1.1082986645884205, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 12054 + }, + { + "epoch": 0.12055, + "grad_norm": 0.9782150774884669, + "learning_rate": 0.003, + "loss": 4.11, + "step": 12055 + }, + { + "epoch": 0.12056, + "grad_norm": 0.981583538098505, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 12056 + }, + { + "epoch": 0.12057, + "grad_norm": 0.9000013529274676, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 12057 + }, + { + "epoch": 0.12058, + "grad_norm": 0.8248975117610136, + "learning_rate": 0.003, + "loss": 4.1152, + "step": 12058 + }, + { + "epoch": 0.12059, + "grad_norm": 0.7316550518767464, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 12059 + }, + { + "epoch": 0.1206, + "grad_norm": 0.8694276237701707, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 12060 + }, + { + "epoch": 0.12061, + "grad_norm": 0.938263264905831, + "learning_rate": 0.003, + "loss": 4.1137, + "step": 12061 + }, + { + "epoch": 0.12062, + "grad_norm": 1.0585572997712702, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12062 + }, + { + "epoch": 0.12063, + "grad_norm": 1.0263333261586491, + "learning_rate": 0.003, + "loss": 4.1, + "step": 12063 + }, + { + "epoch": 0.12064, + "grad_norm": 0.9744024691980951, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 12064 + }, + { + "epoch": 0.12065, + "grad_norm": 0.9723547264958703, + "learning_rate": 0.003, + "loss": 4.083, + "step": 12065 + }, + { + "epoch": 0.12066, + "grad_norm": 0.8621056775718198, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 12066 + }, + { + "epoch": 0.12067, + "grad_norm": 0.760732961962908, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 12067 + }, + { + "epoch": 0.12068, + "grad_norm": 0.7224923416076042, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 12068 + }, + { + "epoch": 0.12069, + "grad_norm": 0.6985051056644312, + "learning_rate": 0.003, + "loss": 4.075, + "step": 12069 + }, + { + "epoch": 0.1207, + "grad_norm": 0.6892685380871093, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 12070 + }, + { + "epoch": 0.12071, + "grad_norm": 0.7753502411874906, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 12071 + }, + { + "epoch": 0.12072, + "grad_norm": 1.0273084993720307, + "learning_rate": 0.003, + "loss": 4.1245, + "step": 12072 + }, + { + "epoch": 0.12073, + "grad_norm": 1.1702943961349295, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 12073 + }, + { + "epoch": 0.12074, + "grad_norm": 0.7291161680358292, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 12074 + }, + { + "epoch": 0.12075, + "grad_norm": 0.6377229522581281, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 12075 + }, + { + "epoch": 0.12076, + "grad_norm": 0.655318568436041, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12076 + }, + { + "epoch": 0.12077, + "grad_norm": 0.5461348060545833, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 12077 + }, + { + "epoch": 0.12078, + "grad_norm": 0.5313314121879531, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 12078 + }, + { + "epoch": 0.12079, + "grad_norm": 0.45366667880293565, + "learning_rate": 0.003, + "loss": 4.062, + "step": 12079 + }, + { + "epoch": 0.1208, + "grad_norm": 0.4978555488964222, + "learning_rate": 0.003, + "loss": 4.079, + "step": 12080 + }, + { + "epoch": 0.12081, + "grad_norm": 0.5435159491187604, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12081 + }, + { + "epoch": 0.12082, + "grad_norm": 0.7004071495504851, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 12082 + }, + { + "epoch": 0.12083, + "grad_norm": 0.8819757494392108, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 12083 + }, + { + "epoch": 0.12084, + "grad_norm": 0.9984371636379986, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 12084 + }, + { + "epoch": 0.12085, + "grad_norm": 0.9377941167196149, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 12085 + }, + { + "epoch": 0.12086, + "grad_norm": 0.777402896704748, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 12086 + }, + { + "epoch": 0.12087, + "grad_norm": 0.6768328289763865, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 12087 + }, + { + "epoch": 0.12088, + "grad_norm": 0.6550784747702699, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 12088 + }, + { + "epoch": 0.12089, + "grad_norm": 0.6799046805949871, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 12089 + }, + { + "epoch": 0.1209, + "grad_norm": 0.6888254527740507, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 12090 + }, + { + "epoch": 0.12091, + "grad_norm": 0.6698631300665974, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 12091 + }, + { + "epoch": 0.12092, + "grad_norm": 0.6758036757087778, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 12092 + }, + { + "epoch": 0.12093, + "grad_norm": 0.7516931246977546, + "learning_rate": 0.003, + "loss": 4.085, + "step": 12093 + }, + { + "epoch": 0.12094, + "grad_norm": 0.8660913984037959, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 12094 + }, + { + "epoch": 0.12095, + "grad_norm": 1.148225611386786, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 12095 + }, + { + "epoch": 0.12096, + "grad_norm": 0.9430544649654374, + "learning_rate": 0.003, + "loss": 4.07, + "step": 12096 + }, + { + "epoch": 0.12097, + "grad_norm": 0.8004448350274224, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 12097 + }, + { + "epoch": 0.12098, + "grad_norm": 0.8437588482083355, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 12098 + }, + { + "epoch": 0.12099, + "grad_norm": 0.9654506646429969, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 12099 + }, + { + "epoch": 0.121, + "grad_norm": 0.8052346106708665, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 12100 + }, + { + "epoch": 0.12101, + "grad_norm": 0.7761954135951914, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 12101 + }, + { + "epoch": 0.12102, + "grad_norm": 0.7865471443949508, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12102 + }, + { + "epoch": 0.12103, + "grad_norm": 0.908020254613268, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 12103 + }, + { + "epoch": 0.12104, + "grad_norm": 0.9714484820971058, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 12104 + }, + { + "epoch": 0.12105, + "grad_norm": 1.1516958837405888, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 12105 + }, + { + "epoch": 0.12106, + "grad_norm": 0.956438837762759, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 12106 + }, + { + "epoch": 0.12107, + "grad_norm": 0.9605465015934317, + "learning_rate": 0.003, + "loss": 4.113, + "step": 12107 + }, + { + "epoch": 0.12108, + "grad_norm": 1.0015242903697923, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 12108 + }, + { + "epoch": 0.12109, + "grad_norm": 0.9656350030878798, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 12109 + }, + { + "epoch": 0.1211, + "grad_norm": 1.0208554246207817, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 12110 + }, + { + "epoch": 0.12111, + "grad_norm": 1.045212071755535, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 12111 + }, + { + "epoch": 0.12112, + "grad_norm": 0.9609127200304951, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 12112 + }, + { + "epoch": 0.12113, + "grad_norm": 0.9315381484230759, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 12113 + }, + { + "epoch": 0.12114, + "grad_norm": 1.1086423634112808, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 12114 + }, + { + "epoch": 0.12115, + "grad_norm": 0.9523300917629679, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 12115 + }, + { + "epoch": 0.12116, + "grad_norm": 0.8687541089117045, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 12116 + }, + { + "epoch": 0.12117, + "grad_norm": 0.7873255227446385, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 12117 + }, + { + "epoch": 0.12118, + "grad_norm": 0.7821982607193855, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 12118 + }, + { + "epoch": 0.12119, + "grad_norm": 0.7616462691624264, + "learning_rate": 0.003, + "loss": 4.106, + "step": 12119 + }, + { + "epoch": 0.1212, + "grad_norm": 0.707120879825376, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 12120 + }, + { + "epoch": 0.12121, + "grad_norm": 0.8449356180095963, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 12121 + }, + { + "epoch": 0.12122, + "grad_norm": 0.891037945284292, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 12122 + }, + { + "epoch": 0.12123, + "grad_norm": 0.8461260410105871, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 12123 + }, + { + "epoch": 0.12124, + "grad_norm": 0.9045354603549911, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 12124 + }, + { + "epoch": 0.12125, + "grad_norm": 1.0010016238193329, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 12125 + }, + { + "epoch": 0.12126, + "grad_norm": 0.9632668167060271, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 12126 + }, + { + "epoch": 0.12127, + "grad_norm": 0.9500807246486168, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12127 + }, + { + "epoch": 0.12128, + "grad_norm": 0.8874953984881597, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 12128 + }, + { + "epoch": 0.12129, + "grad_norm": 0.7891377567378397, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12129 + }, + { + "epoch": 0.1213, + "grad_norm": 0.7152733415442877, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 12130 + }, + { + "epoch": 0.12131, + "grad_norm": 0.7280311154787182, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 12131 + }, + { + "epoch": 0.12132, + "grad_norm": 0.708291815992429, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 12132 + }, + { + "epoch": 0.12133, + "grad_norm": 0.7402534639010128, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 12133 + }, + { + "epoch": 0.12134, + "grad_norm": 0.8119931330116694, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 12134 + }, + { + "epoch": 0.12135, + "grad_norm": 0.8350450819192635, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 12135 + }, + { + "epoch": 0.12136, + "grad_norm": 0.8039762209981959, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 12136 + }, + { + "epoch": 0.12137, + "grad_norm": 0.7253700956457722, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 12137 + }, + { + "epoch": 0.12138, + "grad_norm": 0.821368229958523, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 12138 + }, + { + "epoch": 0.12139, + "grad_norm": 0.8890056249264902, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 12139 + }, + { + "epoch": 0.1214, + "grad_norm": 1.0673718378818036, + "learning_rate": 0.003, + "loss": 4.1068, + "step": 12140 + }, + { + "epoch": 0.12141, + "grad_norm": 0.998773771713253, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 12141 + }, + { + "epoch": 0.12142, + "grad_norm": 0.8343867553391711, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 12142 + }, + { + "epoch": 0.12143, + "grad_norm": 0.7439287321630871, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 12143 + }, + { + "epoch": 0.12144, + "grad_norm": 0.7548192958000438, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 12144 + }, + { + "epoch": 0.12145, + "grad_norm": 0.6863287660828014, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 12145 + }, + { + "epoch": 0.12146, + "grad_norm": 0.6374615352659703, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 12146 + }, + { + "epoch": 0.12147, + "grad_norm": 0.6626425129250582, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 12147 + }, + { + "epoch": 0.12148, + "grad_norm": 0.664469899083491, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 12148 + }, + { + "epoch": 0.12149, + "grad_norm": 0.6726604782386362, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 12149 + }, + { + "epoch": 0.1215, + "grad_norm": 0.6071104446064951, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 12150 + }, + { + "epoch": 0.12151, + "grad_norm": 0.759680928265347, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 12151 + }, + { + "epoch": 0.12152, + "grad_norm": 0.9315784295233887, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 12152 + }, + { + "epoch": 0.12153, + "grad_norm": 1.320521685808312, + "learning_rate": 0.003, + "loss": 4.1163, + "step": 12153 + }, + { + "epoch": 0.12154, + "grad_norm": 0.6453639037401027, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 12154 + }, + { + "epoch": 0.12155, + "grad_norm": 0.6706571195781358, + "learning_rate": 0.003, + "loss": 4.051, + "step": 12155 + }, + { + "epoch": 0.12156, + "grad_norm": 0.7951848965645847, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12156 + }, + { + "epoch": 0.12157, + "grad_norm": 0.7596900007881088, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 12157 + }, + { + "epoch": 0.12158, + "grad_norm": 0.8127025968699181, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 12158 + }, + { + "epoch": 0.12159, + "grad_norm": 0.876589766771459, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 12159 + }, + { + "epoch": 0.1216, + "grad_norm": 0.8972856167148897, + "learning_rate": 0.003, + "loss": 4.066, + "step": 12160 + }, + { + "epoch": 0.12161, + "grad_norm": 0.8563068013271339, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 12161 + }, + { + "epoch": 0.12162, + "grad_norm": 0.8306703766568939, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12162 + }, + { + "epoch": 0.12163, + "grad_norm": 0.9815681209976873, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 12163 + }, + { + "epoch": 0.12164, + "grad_norm": 1.0912966250633331, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 12164 + }, + { + "epoch": 0.12165, + "grad_norm": 0.9444453508980822, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 12165 + }, + { + "epoch": 0.12166, + "grad_norm": 0.9043036529174772, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 12166 + }, + { + "epoch": 0.12167, + "grad_norm": 0.8864176334512155, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 12167 + }, + { + "epoch": 0.12168, + "grad_norm": 0.9185772588555268, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12168 + }, + { + "epoch": 0.12169, + "grad_norm": 0.9369480001448344, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 12169 + }, + { + "epoch": 0.1217, + "grad_norm": 1.095455332190787, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 12170 + }, + { + "epoch": 0.12171, + "grad_norm": 0.9453296225202272, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 12171 + }, + { + "epoch": 0.12172, + "grad_norm": 0.8178161188650642, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 12172 + }, + { + "epoch": 0.12173, + "grad_norm": 0.6656216309214364, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 12173 + }, + { + "epoch": 0.12174, + "grad_norm": 0.6143330151167342, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 12174 + }, + { + "epoch": 0.12175, + "grad_norm": 0.6095956765502458, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 12175 + }, + { + "epoch": 0.12176, + "grad_norm": 0.6274349985290663, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 12176 + }, + { + "epoch": 0.12177, + "grad_norm": 0.6963270153737929, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 12177 + }, + { + "epoch": 0.12178, + "grad_norm": 0.8053955223339382, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 12178 + }, + { + "epoch": 0.12179, + "grad_norm": 0.8560733484824161, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 12179 + }, + { + "epoch": 0.1218, + "grad_norm": 0.7746834097286917, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 12180 + }, + { + "epoch": 0.12181, + "grad_norm": 0.7575071571844539, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 12181 + }, + { + "epoch": 0.12182, + "grad_norm": 0.8423095942998952, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 12182 + }, + { + "epoch": 0.12183, + "grad_norm": 0.8305437413626926, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 12183 + }, + { + "epoch": 0.12184, + "grad_norm": 0.8213877652517387, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 12184 + }, + { + "epoch": 0.12185, + "grad_norm": 0.7800946606127453, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 12185 + }, + { + "epoch": 0.12186, + "grad_norm": 0.8897274458424853, + "learning_rate": 0.003, + "loss": 4.067, + "step": 12186 + }, + { + "epoch": 0.12187, + "grad_norm": 1.0276624626599788, + "learning_rate": 0.003, + "loss": 4.092, + "step": 12187 + }, + { + "epoch": 0.12188, + "grad_norm": 1.0309198946336533, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 12188 + }, + { + "epoch": 0.12189, + "grad_norm": 0.9790137072694165, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 12189 + }, + { + "epoch": 0.1219, + "grad_norm": 0.9688236140763872, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 12190 + }, + { + "epoch": 0.12191, + "grad_norm": 1.0222420660886349, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 12191 + }, + { + "epoch": 0.12192, + "grad_norm": 1.04032773609806, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 12192 + }, + { + "epoch": 0.12193, + "grad_norm": 0.9148037001203069, + "learning_rate": 0.003, + "loss": 4.1111, + "step": 12193 + }, + { + "epoch": 0.12194, + "grad_norm": 0.8461984518265281, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 12194 + }, + { + "epoch": 0.12195, + "grad_norm": 0.8962201511783212, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 12195 + }, + { + "epoch": 0.12196, + "grad_norm": 0.8998155876981508, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 12196 + }, + { + "epoch": 0.12197, + "grad_norm": 0.869197527370743, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 12197 + }, + { + "epoch": 0.12198, + "grad_norm": 0.9925011333423569, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 12198 + }, + { + "epoch": 0.12199, + "grad_norm": 0.9924066788111143, + "learning_rate": 0.003, + "loss": 4.093, + "step": 12199 + }, + { + "epoch": 0.122, + "grad_norm": 0.9780810833776715, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 12200 + }, + { + "epoch": 0.12201, + "grad_norm": 0.7749765569011773, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 12201 + }, + { + "epoch": 0.12202, + "grad_norm": 0.68963493036336, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 12202 + }, + { + "epoch": 0.12203, + "grad_norm": 0.6944315750387414, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 12203 + }, + { + "epoch": 0.12204, + "grad_norm": 0.7092490852739447, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 12204 + }, + { + "epoch": 0.12205, + "grad_norm": 0.7824985651770687, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 12205 + }, + { + "epoch": 0.12206, + "grad_norm": 0.7841193956816908, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 12206 + }, + { + "epoch": 0.12207, + "grad_norm": 0.7727636401343367, + "learning_rate": 0.003, + "loss": 4.121, + "step": 12207 + }, + { + "epoch": 0.12208, + "grad_norm": 0.8821542042505839, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 12208 + }, + { + "epoch": 0.12209, + "grad_norm": 1.1296209326467996, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 12209 + }, + { + "epoch": 0.1221, + "grad_norm": 1.0041093442524542, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 12210 + }, + { + "epoch": 0.12211, + "grad_norm": 0.8895041277507798, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 12211 + }, + { + "epoch": 0.12212, + "grad_norm": 0.7247035510410651, + "learning_rate": 0.003, + "loss": 4.078, + "step": 12212 + }, + { + "epoch": 0.12213, + "grad_norm": 0.6265246520373837, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 12213 + }, + { + "epoch": 0.12214, + "grad_norm": 0.5284681468382831, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 12214 + }, + { + "epoch": 0.12215, + "grad_norm": 0.5469626070392162, + "learning_rate": 0.003, + "loss": 4.097, + "step": 12215 + }, + { + "epoch": 0.12216, + "grad_norm": 0.6047355431829017, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12216 + }, + { + "epoch": 0.12217, + "grad_norm": 0.5873750180070837, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 12217 + }, + { + "epoch": 0.12218, + "grad_norm": 0.6158510675727711, + "learning_rate": 0.003, + "loss": 4.051, + "step": 12218 + }, + { + "epoch": 0.12219, + "grad_norm": 0.6121083851344632, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 12219 + }, + { + "epoch": 0.1222, + "grad_norm": 0.7609522642713573, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 12220 + }, + { + "epoch": 0.12221, + "grad_norm": 0.8809276711303994, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 12221 + }, + { + "epoch": 0.12222, + "grad_norm": 0.9475649039258731, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 12222 + }, + { + "epoch": 0.12223, + "grad_norm": 0.9629002752009357, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 12223 + }, + { + "epoch": 0.12224, + "grad_norm": 1.008015518858347, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 12224 + }, + { + "epoch": 0.12225, + "grad_norm": 0.9237570720650717, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 12225 + }, + { + "epoch": 0.12226, + "grad_norm": 0.8799849671986355, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 12226 + }, + { + "epoch": 0.12227, + "grad_norm": 0.9346772009432872, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 12227 + }, + { + "epoch": 0.12228, + "grad_norm": 0.8533561203210697, + "learning_rate": 0.003, + "loss": 4.051, + "step": 12228 + }, + { + "epoch": 0.12229, + "grad_norm": 0.9299445302671507, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 12229 + }, + { + "epoch": 0.1223, + "grad_norm": 0.970061316848315, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 12230 + }, + { + "epoch": 0.12231, + "grad_norm": 1.04935587821548, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 12231 + }, + { + "epoch": 0.12232, + "grad_norm": 0.977851356364189, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 12232 + }, + { + "epoch": 0.12233, + "grad_norm": 0.8433149915900962, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 12233 + }, + { + "epoch": 0.12234, + "grad_norm": 0.8496072567309231, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 12234 + }, + { + "epoch": 0.12235, + "grad_norm": 0.889913163004263, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 12235 + }, + { + "epoch": 0.12236, + "grad_norm": 0.9829956720219396, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 12236 + }, + { + "epoch": 0.12237, + "grad_norm": 0.9758935947825471, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 12237 + }, + { + "epoch": 0.12238, + "grad_norm": 0.905293659895765, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 12238 + }, + { + "epoch": 0.12239, + "grad_norm": 0.880979930850334, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 12239 + }, + { + "epoch": 0.1224, + "grad_norm": 0.832007631237703, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 12240 + }, + { + "epoch": 0.12241, + "grad_norm": 0.9034049863110921, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 12241 + }, + { + "epoch": 0.12242, + "grad_norm": 0.976237493191658, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 12242 + }, + { + "epoch": 0.12243, + "grad_norm": 1.1732103912413159, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 12243 + }, + { + "epoch": 0.12244, + "grad_norm": 1.020065784383985, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 12244 + }, + { + "epoch": 0.12245, + "grad_norm": 0.9639699286831159, + "learning_rate": 0.003, + "loss": 4.1168, + "step": 12245 + }, + { + "epoch": 0.12246, + "grad_norm": 0.9814805407910625, + "learning_rate": 0.003, + "loss": 4.094, + "step": 12246 + }, + { + "epoch": 0.12247, + "grad_norm": 0.8533786196620828, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12247 + }, + { + "epoch": 0.12248, + "grad_norm": 0.6662766502862788, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 12248 + }, + { + "epoch": 0.12249, + "grad_norm": 0.6702892402590316, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 12249 + }, + { + "epoch": 0.1225, + "grad_norm": 0.6533489392576404, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 12250 + }, + { + "epoch": 0.12251, + "grad_norm": 0.5709588901774177, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 12251 + }, + { + "epoch": 0.12252, + "grad_norm": 0.5718032323858817, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12252 + }, + { + "epoch": 0.12253, + "grad_norm": 0.5424113872571699, + "learning_rate": 0.003, + "loss": 4.076, + "step": 12253 + }, + { + "epoch": 0.12254, + "grad_norm": 0.5122148366369595, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 12254 + }, + { + "epoch": 0.12255, + "grad_norm": 0.6207724489282378, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 12255 + }, + { + "epoch": 0.12256, + "grad_norm": 0.8036473546761361, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 12256 + }, + { + "epoch": 0.12257, + "grad_norm": 1.0622283319885992, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 12257 + }, + { + "epoch": 0.12258, + "grad_norm": 1.0380816145963498, + "learning_rate": 0.003, + "loss": 4.065, + "step": 12258 + }, + { + "epoch": 0.12259, + "grad_norm": 0.9805063613712668, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 12259 + }, + { + "epoch": 0.1226, + "grad_norm": 0.8470946725098341, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 12260 + }, + { + "epoch": 0.12261, + "grad_norm": 0.7167763323497804, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 12261 + }, + { + "epoch": 0.12262, + "grad_norm": 0.8492307595053198, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 12262 + }, + { + "epoch": 0.12263, + "grad_norm": 0.7470430691505191, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 12263 + }, + { + "epoch": 0.12264, + "grad_norm": 0.7437026810187541, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 12264 + }, + { + "epoch": 0.12265, + "grad_norm": 0.6921338875604943, + "learning_rate": 0.003, + "loss": 4.062, + "step": 12265 + }, + { + "epoch": 0.12266, + "grad_norm": 0.6340720803713298, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 12266 + }, + { + "epoch": 0.12267, + "grad_norm": 0.5996048202296703, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 12267 + }, + { + "epoch": 0.12268, + "grad_norm": 0.5253278971864735, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 12268 + }, + { + "epoch": 0.12269, + "grad_norm": 0.5059015116943797, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 12269 + }, + { + "epoch": 0.1227, + "grad_norm": 0.5611070010602227, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 12270 + }, + { + "epoch": 0.12271, + "grad_norm": 0.6010429068065455, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 12271 + }, + { + "epoch": 0.12272, + "grad_norm": 0.6560178371037455, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 12272 + }, + { + "epoch": 0.12273, + "grad_norm": 0.7535031170180342, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 12273 + }, + { + "epoch": 0.12274, + "grad_norm": 0.9517936135796402, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 12274 + }, + { + "epoch": 0.12275, + "grad_norm": 1.178351373783213, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 12275 + }, + { + "epoch": 0.12276, + "grad_norm": 0.8912268880412101, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12276 + }, + { + "epoch": 0.12277, + "grad_norm": 0.8855062963775535, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 12277 + }, + { + "epoch": 0.12278, + "grad_norm": 0.8988888499927662, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 12278 + }, + { + "epoch": 0.12279, + "grad_norm": 0.9681405602313218, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 12279 + }, + { + "epoch": 0.1228, + "grad_norm": 0.840459247453639, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 12280 + }, + { + "epoch": 0.12281, + "grad_norm": 0.7862761387462439, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 12281 + }, + { + "epoch": 0.12282, + "grad_norm": 0.7910672908173746, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 12282 + }, + { + "epoch": 0.12283, + "grad_norm": 0.7996847191182868, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 12283 + }, + { + "epoch": 0.12284, + "grad_norm": 0.9761078001765382, + "learning_rate": 0.003, + "loss": 4.057, + "step": 12284 + }, + { + "epoch": 0.12285, + "grad_norm": 1.2122969362612241, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 12285 + }, + { + "epoch": 0.12286, + "grad_norm": 0.7982955277029816, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 12286 + }, + { + "epoch": 0.12287, + "grad_norm": 0.6832723853119029, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12287 + }, + { + "epoch": 0.12288, + "grad_norm": 0.7410385916665433, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 12288 + }, + { + "epoch": 0.12289, + "grad_norm": 0.7795829113013923, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 12289 + }, + { + "epoch": 0.1229, + "grad_norm": 0.8203151883451628, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 12290 + }, + { + "epoch": 0.12291, + "grad_norm": 1.0129421338927522, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 12291 + }, + { + "epoch": 0.12292, + "grad_norm": 1.061571835074904, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 12292 + }, + { + "epoch": 0.12293, + "grad_norm": 0.8374218946860755, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 12293 + }, + { + "epoch": 0.12294, + "grad_norm": 0.7543087370375461, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 12294 + }, + { + "epoch": 0.12295, + "grad_norm": 0.7706914881903262, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 12295 + }, + { + "epoch": 0.12296, + "grad_norm": 0.7607184633400262, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 12296 + }, + { + "epoch": 0.12297, + "grad_norm": 0.7724401795263441, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 12297 + }, + { + "epoch": 0.12298, + "grad_norm": 0.8638283512198204, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12298 + }, + { + "epoch": 0.12299, + "grad_norm": 0.8982120323560557, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12299 + }, + { + "epoch": 0.123, + "grad_norm": 0.9124691457151956, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 12300 + }, + { + "epoch": 0.12301, + "grad_norm": 1.0861149071905536, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 12301 + }, + { + "epoch": 0.12302, + "grad_norm": 1.056574819962284, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 12302 + }, + { + "epoch": 0.12303, + "grad_norm": 0.8962846526441469, + "learning_rate": 0.003, + "loss": 4.1494, + "step": 12303 + }, + { + "epoch": 0.12304, + "grad_norm": 0.7551672668971874, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 12304 + }, + { + "epoch": 0.12305, + "grad_norm": 0.7687559626159957, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 12305 + }, + { + "epoch": 0.12306, + "grad_norm": 0.7832740456242118, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 12306 + }, + { + "epoch": 0.12307, + "grad_norm": 0.8334537225717343, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 12307 + }, + { + "epoch": 0.12308, + "grad_norm": 1.0396736321564775, + "learning_rate": 0.003, + "loss": 4.101, + "step": 12308 + }, + { + "epoch": 0.12309, + "grad_norm": 1.0988737205635373, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 12309 + }, + { + "epoch": 0.1231, + "grad_norm": 0.7415438867578784, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 12310 + }, + { + "epoch": 0.12311, + "grad_norm": 0.6274126363565536, + "learning_rate": 0.003, + "loss": 4.06, + "step": 12311 + }, + { + "epoch": 0.12312, + "grad_norm": 0.6460470961906579, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 12312 + }, + { + "epoch": 0.12313, + "grad_norm": 0.7743078883377551, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 12313 + }, + { + "epoch": 0.12314, + "grad_norm": 0.9427175393301743, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12314 + }, + { + "epoch": 0.12315, + "grad_norm": 0.8781349943437341, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 12315 + }, + { + "epoch": 0.12316, + "grad_norm": 0.8208307368125973, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 12316 + }, + { + "epoch": 0.12317, + "grad_norm": 0.7706107052937355, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 12317 + }, + { + "epoch": 0.12318, + "grad_norm": 0.7306685595290431, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 12318 + }, + { + "epoch": 0.12319, + "grad_norm": 0.7692113098316952, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 12319 + }, + { + "epoch": 0.1232, + "grad_norm": 0.8912250345171907, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 12320 + }, + { + "epoch": 0.12321, + "grad_norm": 0.9856624656988558, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 12321 + }, + { + "epoch": 0.12322, + "grad_norm": 1.005052658751967, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 12322 + }, + { + "epoch": 0.12323, + "grad_norm": 0.9148997566925725, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 12323 + }, + { + "epoch": 0.12324, + "grad_norm": 0.8558840802530098, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12324 + }, + { + "epoch": 0.12325, + "grad_norm": 0.7961488539800927, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 12325 + }, + { + "epoch": 0.12326, + "grad_norm": 0.7793292980068781, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 12326 + }, + { + "epoch": 0.12327, + "grad_norm": 0.7290581138354479, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 12327 + }, + { + "epoch": 0.12328, + "grad_norm": 0.6926715810449556, + "learning_rate": 0.003, + "loss": 4.045, + "step": 12328 + }, + { + "epoch": 0.12329, + "grad_norm": 0.775214172914529, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 12329 + }, + { + "epoch": 0.1233, + "grad_norm": 0.82900822759267, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 12330 + }, + { + "epoch": 0.12331, + "grad_norm": 0.8842733850561869, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 12331 + }, + { + "epoch": 0.12332, + "grad_norm": 0.9751583110833055, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 12332 + }, + { + "epoch": 0.12333, + "grad_norm": 1.1071006881382284, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 12333 + }, + { + "epoch": 0.12334, + "grad_norm": 1.0117747800099712, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 12334 + }, + { + "epoch": 0.12335, + "grad_norm": 0.9163473104991442, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12335 + }, + { + "epoch": 0.12336, + "grad_norm": 0.7932919852703744, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 12336 + }, + { + "epoch": 0.12337, + "grad_norm": 0.5938066929793493, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 12337 + }, + { + "epoch": 0.12338, + "grad_norm": 0.6757743849715577, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 12338 + }, + { + "epoch": 0.12339, + "grad_norm": 0.91201916345825, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 12339 + }, + { + "epoch": 0.1234, + "grad_norm": 1.1617820868185735, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 12340 + }, + { + "epoch": 0.12341, + "grad_norm": 0.7103223518308457, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 12341 + }, + { + "epoch": 0.12342, + "grad_norm": 0.6337513778485245, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 12342 + }, + { + "epoch": 0.12343, + "grad_norm": 0.7079606290293606, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 12343 + }, + { + "epoch": 0.12344, + "grad_norm": 0.6689056464669664, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 12344 + }, + { + "epoch": 0.12345, + "grad_norm": 0.5707386249666849, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 12345 + }, + { + "epoch": 0.12346, + "grad_norm": 0.6191142900461684, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 12346 + }, + { + "epoch": 0.12347, + "grad_norm": 0.599638430699415, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 12347 + }, + { + "epoch": 0.12348, + "grad_norm": 0.5975280171501979, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 12348 + }, + { + "epoch": 0.12349, + "grad_norm": 0.6641026027063925, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 12349 + }, + { + "epoch": 0.1235, + "grad_norm": 0.748212361076718, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 12350 + }, + { + "epoch": 0.12351, + "grad_norm": 0.8750142390126173, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 12351 + }, + { + "epoch": 0.12352, + "grad_norm": 1.059951964622264, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 12352 + }, + { + "epoch": 0.12353, + "grad_norm": 1.118541436018302, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 12353 + }, + { + "epoch": 0.12354, + "grad_norm": 0.8240209127136764, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 12354 + }, + { + "epoch": 0.12355, + "grad_norm": 0.7811652917801903, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 12355 + }, + { + "epoch": 0.12356, + "grad_norm": 0.8184349715564275, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 12356 + }, + { + "epoch": 0.12357, + "grad_norm": 0.8288548630561049, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 12357 + }, + { + "epoch": 0.12358, + "grad_norm": 0.6820615218677171, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 12358 + }, + { + "epoch": 0.12359, + "grad_norm": 0.6348546447588544, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 12359 + }, + { + "epoch": 0.1236, + "grad_norm": 0.5927829921374093, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 12360 + }, + { + "epoch": 0.12361, + "grad_norm": 0.6877751343256268, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 12361 + }, + { + "epoch": 0.12362, + "grad_norm": 0.6465142056870291, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 12362 + }, + { + "epoch": 0.12363, + "grad_norm": 0.6658993255518995, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 12363 + }, + { + "epoch": 0.12364, + "grad_norm": 0.812717613930872, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 12364 + }, + { + "epoch": 0.12365, + "grad_norm": 0.9841896762626663, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 12365 + }, + { + "epoch": 0.12366, + "grad_norm": 1.0444743383641244, + "learning_rate": 0.003, + "loss": 4.1276, + "step": 12366 + }, + { + "epoch": 0.12367, + "grad_norm": 0.8735959659864412, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 12367 + }, + { + "epoch": 0.12368, + "grad_norm": 0.8104436938471729, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 12368 + }, + { + "epoch": 0.12369, + "grad_norm": 0.7647719667314723, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 12369 + }, + { + "epoch": 0.1237, + "grad_norm": 0.9798723311447599, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 12370 + }, + { + "epoch": 0.12371, + "grad_norm": 1.1073068164129845, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 12371 + }, + { + "epoch": 0.12372, + "grad_norm": 0.8783686640733369, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 12372 + }, + { + "epoch": 0.12373, + "grad_norm": 0.8571504331738046, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12373 + }, + { + "epoch": 0.12374, + "grad_norm": 0.8824814783901349, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 12374 + }, + { + "epoch": 0.12375, + "grad_norm": 0.8370906418565935, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 12375 + }, + { + "epoch": 0.12376, + "grad_norm": 0.8566745651264175, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 12376 + }, + { + "epoch": 0.12377, + "grad_norm": 0.7896995443932111, + "learning_rate": 0.003, + "loss": 4.042, + "step": 12377 + }, + { + "epoch": 0.12378, + "grad_norm": 0.7716242348864555, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 12378 + }, + { + "epoch": 0.12379, + "grad_norm": 0.8896154225927658, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 12379 + }, + { + "epoch": 0.1238, + "grad_norm": 1.0655680616463061, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 12380 + }, + { + "epoch": 0.12381, + "grad_norm": 1.1344858251808465, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 12381 + }, + { + "epoch": 0.12382, + "grad_norm": 0.7336688413835789, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 12382 + }, + { + "epoch": 0.12383, + "grad_norm": 0.6335176185088268, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 12383 + }, + { + "epoch": 0.12384, + "grad_norm": 0.6164379207701107, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 12384 + }, + { + "epoch": 0.12385, + "grad_norm": 0.682851429818355, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 12385 + }, + { + "epoch": 0.12386, + "grad_norm": 0.7538417672700986, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 12386 + }, + { + "epoch": 0.12387, + "grad_norm": 0.7767506727160641, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 12387 + }, + { + "epoch": 0.12388, + "grad_norm": 0.725322805837034, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 12388 + }, + { + "epoch": 0.12389, + "grad_norm": 0.7855361772914995, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 12389 + }, + { + "epoch": 0.1239, + "grad_norm": 0.8377198942167394, + "learning_rate": 0.003, + "loss": 4.1172, + "step": 12390 + }, + { + "epoch": 0.12391, + "grad_norm": 1.0220559200837096, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 12391 + }, + { + "epoch": 0.12392, + "grad_norm": 1.1679405125779883, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 12392 + }, + { + "epoch": 0.12393, + "grad_norm": 0.8079046078128874, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12393 + }, + { + "epoch": 0.12394, + "grad_norm": 0.807912420130648, + "learning_rate": 0.003, + "loss": 4.076, + "step": 12394 + }, + { + "epoch": 0.12395, + "grad_norm": 0.8156046926666899, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 12395 + }, + { + "epoch": 0.12396, + "grad_norm": 0.6424939616056728, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 12396 + }, + { + "epoch": 0.12397, + "grad_norm": 0.6161565824363795, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 12397 + }, + { + "epoch": 0.12398, + "grad_norm": 0.6868072633878525, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 12398 + }, + { + "epoch": 0.12399, + "grad_norm": 0.8381894625150602, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 12399 + }, + { + "epoch": 0.124, + "grad_norm": 1.094648190282617, + "learning_rate": 0.003, + "loss": 4.128, + "step": 12400 + }, + { + "epoch": 0.12401, + "grad_norm": 1.0716932323490935, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12401 + }, + { + "epoch": 0.12402, + "grad_norm": 0.8443736440934634, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 12402 + }, + { + "epoch": 0.12403, + "grad_norm": 0.8174631840031019, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12403 + }, + { + "epoch": 0.12404, + "grad_norm": 0.8302272019753588, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 12404 + }, + { + "epoch": 0.12405, + "grad_norm": 0.8356565848410257, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 12405 + }, + { + "epoch": 0.12406, + "grad_norm": 0.8091300206792589, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 12406 + }, + { + "epoch": 0.12407, + "grad_norm": 0.7332184789133536, + "learning_rate": 0.003, + "loss": 4.07, + "step": 12407 + }, + { + "epoch": 0.12408, + "grad_norm": 0.8687526884068933, + "learning_rate": 0.003, + "loss": 4.093, + "step": 12408 + }, + { + "epoch": 0.12409, + "grad_norm": 0.8506045260272557, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 12409 + }, + { + "epoch": 0.1241, + "grad_norm": 0.7742830890220069, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 12410 + }, + { + "epoch": 0.12411, + "grad_norm": 0.7645377692364871, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 12411 + }, + { + "epoch": 0.12412, + "grad_norm": 0.6900258135012273, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 12412 + }, + { + "epoch": 0.12413, + "grad_norm": 0.7008170255398872, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 12413 + }, + { + "epoch": 0.12414, + "grad_norm": 0.7807544703948569, + "learning_rate": 0.003, + "loss": 4.061, + "step": 12414 + }, + { + "epoch": 0.12415, + "grad_norm": 1.0418165862715427, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 12415 + }, + { + "epoch": 0.12416, + "grad_norm": 1.2588624025355457, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 12416 + }, + { + "epoch": 0.12417, + "grad_norm": 1.0174602374348387, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 12417 + }, + { + "epoch": 0.12418, + "grad_norm": 1.1048010081413318, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 12418 + }, + { + "epoch": 0.12419, + "grad_norm": 0.8166182229591952, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 12419 + }, + { + "epoch": 0.1242, + "grad_norm": 0.9415925328154592, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 12420 + }, + { + "epoch": 0.12421, + "grad_norm": 0.8690733941514077, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 12421 + }, + { + "epoch": 0.12422, + "grad_norm": 0.830745473123471, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 12422 + }, + { + "epoch": 0.12423, + "grad_norm": 0.8342523814524774, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 12423 + }, + { + "epoch": 0.12424, + "grad_norm": 0.894813671165597, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 12424 + }, + { + "epoch": 0.12425, + "grad_norm": 0.9884607649566219, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 12425 + }, + { + "epoch": 0.12426, + "grad_norm": 1.1435522369886537, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12426 + }, + { + "epoch": 0.12427, + "grad_norm": 1.0095395273350793, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 12427 + }, + { + "epoch": 0.12428, + "grad_norm": 0.8441201562767902, + "learning_rate": 0.003, + "loss": 4.103, + "step": 12428 + }, + { + "epoch": 0.12429, + "grad_norm": 0.6643197463278329, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 12429 + }, + { + "epoch": 0.1243, + "grad_norm": 0.7675558389841266, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 12430 + }, + { + "epoch": 0.12431, + "grad_norm": 0.9729795890323124, + "learning_rate": 0.003, + "loss": 4.076, + "step": 12431 + }, + { + "epoch": 0.12432, + "grad_norm": 0.9891625907463222, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 12432 + }, + { + "epoch": 0.12433, + "grad_norm": 0.867863487443049, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 12433 + }, + { + "epoch": 0.12434, + "grad_norm": 0.8096004679100748, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 12434 + }, + { + "epoch": 0.12435, + "grad_norm": 0.7696790010013758, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 12435 + }, + { + "epoch": 0.12436, + "grad_norm": 0.7074429258609628, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 12436 + }, + { + "epoch": 0.12437, + "grad_norm": 0.6928512254596106, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 12437 + }, + { + "epoch": 0.12438, + "grad_norm": 0.7276723133589919, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 12438 + }, + { + "epoch": 0.12439, + "grad_norm": 0.8097392985970517, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 12439 + }, + { + "epoch": 0.1244, + "grad_norm": 0.9078311600338613, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 12440 + }, + { + "epoch": 0.12441, + "grad_norm": 0.9656091039492581, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 12441 + }, + { + "epoch": 0.12442, + "grad_norm": 0.8382607157716104, + "learning_rate": 0.003, + "loss": 4.083, + "step": 12442 + }, + { + "epoch": 0.12443, + "grad_norm": 0.7195463112482667, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 12443 + }, + { + "epoch": 0.12444, + "grad_norm": 0.692287995881062, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 12444 + }, + { + "epoch": 0.12445, + "grad_norm": 0.7260972533808546, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 12445 + }, + { + "epoch": 0.12446, + "grad_norm": 0.7545307041135739, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 12446 + }, + { + "epoch": 0.12447, + "grad_norm": 0.8399120440566367, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 12447 + }, + { + "epoch": 0.12448, + "grad_norm": 0.82477064475789, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 12448 + }, + { + "epoch": 0.12449, + "grad_norm": 0.879099511063207, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 12449 + }, + { + "epoch": 0.1245, + "grad_norm": 0.9756698583699438, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 12450 + }, + { + "epoch": 0.12451, + "grad_norm": 1.055809397868483, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 12451 + }, + { + "epoch": 0.12452, + "grad_norm": 0.9273043848413589, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 12452 + }, + { + "epoch": 0.12453, + "grad_norm": 0.8424893235278818, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 12453 + }, + { + "epoch": 0.12454, + "grad_norm": 0.7769934753116866, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 12454 + }, + { + "epoch": 0.12455, + "grad_norm": 0.8422414337643478, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 12455 + }, + { + "epoch": 0.12456, + "grad_norm": 1.0644890222125198, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 12456 + }, + { + "epoch": 0.12457, + "grad_norm": 1.034725852802373, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 12457 + }, + { + "epoch": 0.12458, + "grad_norm": 1.0201030281238366, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 12458 + }, + { + "epoch": 0.12459, + "grad_norm": 0.9606410872706322, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 12459 + }, + { + "epoch": 0.1246, + "grad_norm": 0.9802952571985856, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 12460 + }, + { + "epoch": 0.12461, + "grad_norm": 1.0145179379970155, + "learning_rate": 0.003, + "loss": 4.084, + "step": 12461 + }, + { + "epoch": 0.12462, + "grad_norm": 0.8941296602201455, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 12462 + }, + { + "epoch": 0.12463, + "grad_norm": 0.7880644625841127, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 12463 + }, + { + "epoch": 0.12464, + "grad_norm": 0.7478314581834201, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 12464 + }, + { + "epoch": 0.12465, + "grad_norm": 0.6874206299133098, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 12465 + }, + { + "epoch": 0.12466, + "grad_norm": 0.5532656428700546, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 12466 + }, + { + "epoch": 0.12467, + "grad_norm": 0.5085094451819194, + "learning_rate": 0.003, + "loss": 4.081, + "step": 12467 + }, + { + "epoch": 0.12468, + "grad_norm": 0.4833011801969543, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12468 + }, + { + "epoch": 0.12469, + "grad_norm": 0.5658074187218443, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 12469 + }, + { + "epoch": 0.1247, + "grad_norm": 0.6584925064901498, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 12470 + }, + { + "epoch": 0.12471, + "grad_norm": 0.8347468519445372, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 12471 + }, + { + "epoch": 0.12472, + "grad_norm": 1.147252471970649, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 12472 + }, + { + "epoch": 0.12473, + "grad_norm": 0.9151661598565768, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 12473 + }, + { + "epoch": 0.12474, + "grad_norm": 0.7783521582318006, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 12474 + }, + { + "epoch": 0.12475, + "grad_norm": 0.754762458139977, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 12475 + }, + { + "epoch": 0.12476, + "grad_norm": 0.8076264580293819, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 12476 + }, + { + "epoch": 0.12477, + "grad_norm": 0.8440102098928844, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 12477 + }, + { + "epoch": 0.12478, + "grad_norm": 0.8292745717859727, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 12478 + }, + { + "epoch": 0.12479, + "grad_norm": 0.7712475170483982, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 12479 + }, + { + "epoch": 0.1248, + "grad_norm": 0.6625408481360877, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 12480 + }, + { + "epoch": 0.12481, + "grad_norm": 0.6502393782701698, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 12481 + }, + { + "epoch": 0.12482, + "grad_norm": 0.6758788180464269, + "learning_rate": 0.003, + "loss": 4.08, + "step": 12482 + }, + { + "epoch": 0.12483, + "grad_norm": 0.6727320857627304, + "learning_rate": 0.003, + "loss": 4.084, + "step": 12483 + }, + { + "epoch": 0.12484, + "grad_norm": 0.6356059088846429, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 12484 + }, + { + "epoch": 0.12485, + "grad_norm": 0.5870391387072846, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 12485 + }, + { + "epoch": 0.12486, + "grad_norm": 0.7368756368425331, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 12486 + }, + { + "epoch": 0.12487, + "grad_norm": 0.9396079302973953, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 12487 + }, + { + "epoch": 0.12488, + "grad_norm": 1.1292542302681363, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 12488 + }, + { + "epoch": 0.12489, + "grad_norm": 0.8528878129899282, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 12489 + }, + { + "epoch": 0.1249, + "grad_norm": 0.7516004369114074, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 12490 + }, + { + "epoch": 0.12491, + "grad_norm": 0.9147555580331406, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 12491 + }, + { + "epoch": 0.12492, + "grad_norm": 1.265380678620011, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 12492 + }, + { + "epoch": 0.12493, + "grad_norm": 0.9085831832477737, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 12493 + }, + { + "epoch": 0.12494, + "grad_norm": 0.8933451452856955, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 12494 + }, + { + "epoch": 0.12495, + "grad_norm": 0.9486602997841618, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 12495 + }, + { + "epoch": 0.12496, + "grad_norm": 0.9671534076597736, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 12496 + }, + { + "epoch": 0.12497, + "grad_norm": 0.9748030910513085, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 12497 + }, + { + "epoch": 0.12498, + "grad_norm": 0.9304947010935671, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 12498 + }, + { + "epoch": 0.12499, + "grad_norm": 0.926582461696646, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 12499 + }, + { + "epoch": 0.125, + "grad_norm": 0.9627114037689704, + "learning_rate": 0.003, + "loss": 4.1296, + "step": 12500 + }, + { + "epoch": 0.12501, + "grad_norm": 1.0604940883484884, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 12501 + }, + { + "epoch": 0.12502, + "grad_norm": 0.9577059818976567, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 12502 + }, + { + "epoch": 0.12503, + "grad_norm": 0.884475209540014, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 12503 + }, + { + "epoch": 0.12504, + "grad_norm": 0.8173440312329352, + "learning_rate": 0.003, + "loss": 4.089, + "step": 12504 + }, + { + "epoch": 0.12505, + "grad_norm": 0.7489029149484316, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 12505 + }, + { + "epoch": 0.12506, + "grad_norm": 0.7572504979291013, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 12506 + }, + { + "epoch": 0.12507, + "grad_norm": 0.8399998909902504, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 12507 + }, + { + "epoch": 0.12508, + "grad_norm": 0.9337566512163873, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12508 + }, + { + "epoch": 0.12509, + "grad_norm": 0.8861296871193007, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12509 + }, + { + "epoch": 0.1251, + "grad_norm": 0.8711578061763322, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 12510 + }, + { + "epoch": 0.12511, + "grad_norm": 0.810476642592666, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 12511 + }, + { + "epoch": 0.12512, + "grad_norm": 0.7685750219209562, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 12512 + }, + { + "epoch": 0.12513, + "grad_norm": 0.8276538036476391, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 12513 + }, + { + "epoch": 0.12514, + "grad_norm": 0.8908374164057034, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 12514 + }, + { + "epoch": 0.12515, + "grad_norm": 1.0910727360858632, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 12515 + }, + { + "epoch": 0.12516, + "grad_norm": 1.0212161466319103, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 12516 + }, + { + "epoch": 0.12517, + "grad_norm": 1.066691309468899, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 12517 + }, + { + "epoch": 0.12518, + "grad_norm": 0.9988527766381885, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 12518 + }, + { + "epoch": 0.12519, + "grad_norm": 1.0140692171238843, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 12519 + }, + { + "epoch": 0.1252, + "grad_norm": 0.9828000839884308, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 12520 + }, + { + "epoch": 0.12521, + "grad_norm": 0.9327551598434594, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 12521 + }, + { + "epoch": 0.12522, + "grad_norm": 0.807818758831363, + "learning_rate": 0.003, + "loss": 4.1256, + "step": 12522 + }, + { + "epoch": 0.12523, + "grad_norm": 0.7805048294306602, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12523 + }, + { + "epoch": 0.12524, + "grad_norm": 0.6780475265822701, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 12524 + }, + { + "epoch": 0.12525, + "grad_norm": 0.672416846874558, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 12525 + }, + { + "epoch": 0.12526, + "grad_norm": 0.6660977882609734, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 12526 + }, + { + "epoch": 0.12527, + "grad_norm": 0.6541819706521673, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 12527 + }, + { + "epoch": 0.12528, + "grad_norm": 0.67528315323153, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 12528 + }, + { + "epoch": 0.12529, + "grad_norm": 0.6203124045169987, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 12529 + }, + { + "epoch": 0.1253, + "grad_norm": 0.6662733723107425, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 12530 + }, + { + "epoch": 0.12531, + "grad_norm": 0.6766311073551332, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12531 + }, + { + "epoch": 0.12532, + "grad_norm": 0.7659473563470474, + "learning_rate": 0.003, + "loss": 4.059, + "step": 12532 + }, + { + "epoch": 0.12533, + "grad_norm": 0.8952162523682594, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 12533 + }, + { + "epoch": 0.12534, + "grad_norm": 1.1705063949964587, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 12534 + }, + { + "epoch": 0.12535, + "grad_norm": 0.8209608074328265, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 12535 + }, + { + "epoch": 0.12536, + "grad_norm": 0.6216744131373718, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 12536 + }, + { + "epoch": 0.12537, + "grad_norm": 0.5813816881218197, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 12537 + }, + { + "epoch": 0.12538, + "grad_norm": 0.7213851704102082, + "learning_rate": 0.003, + "loss": 4.071, + "step": 12538 + }, + { + "epoch": 0.12539, + "grad_norm": 0.9239622219843724, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 12539 + }, + { + "epoch": 0.1254, + "grad_norm": 1.0591751024024054, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 12540 + }, + { + "epoch": 0.12541, + "grad_norm": 1.0899608081175878, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 12541 + }, + { + "epoch": 0.12542, + "grad_norm": 0.9360062255258262, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 12542 + }, + { + "epoch": 0.12543, + "grad_norm": 0.981563508663698, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 12543 + }, + { + "epoch": 0.12544, + "grad_norm": 0.9401216606961313, + "learning_rate": 0.003, + "loss": 4.1135, + "step": 12544 + }, + { + "epoch": 0.12545, + "grad_norm": 0.8967062592092603, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 12545 + }, + { + "epoch": 0.12546, + "grad_norm": 0.8775015205434612, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 12546 + }, + { + "epoch": 0.12547, + "grad_norm": 0.9385758344702522, + "learning_rate": 0.003, + "loss": 4.108, + "step": 12547 + }, + { + "epoch": 0.12548, + "grad_norm": 1.0170124611390352, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 12548 + }, + { + "epoch": 0.12549, + "grad_norm": 1.0282448767977495, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 12549 + }, + { + "epoch": 0.1255, + "grad_norm": 0.9355240438214039, + "learning_rate": 0.003, + "loss": 4.077, + "step": 12550 + }, + { + "epoch": 0.12551, + "grad_norm": 0.8492515385424486, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 12551 + }, + { + "epoch": 0.12552, + "grad_norm": 0.8328292067679657, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 12552 + }, + { + "epoch": 0.12553, + "grad_norm": 0.8018285159763675, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 12553 + }, + { + "epoch": 0.12554, + "grad_norm": 0.7826087493872959, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 12554 + }, + { + "epoch": 0.12555, + "grad_norm": 0.753072073280961, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 12555 + }, + { + "epoch": 0.12556, + "grad_norm": 0.6605457000607075, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 12556 + }, + { + "epoch": 0.12557, + "grad_norm": 0.6119107954645638, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12557 + }, + { + "epoch": 0.12558, + "grad_norm": 0.6351510959937766, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 12558 + }, + { + "epoch": 0.12559, + "grad_norm": 0.6776216370299248, + "learning_rate": 0.003, + "loss": 4.051, + "step": 12559 + }, + { + "epoch": 0.1256, + "grad_norm": 0.7259179291843775, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 12560 + }, + { + "epoch": 0.12561, + "grad_norm": 0.9873701878185189, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 12561 + }, + { + "epoch": 0.12562, + "grad_norm": 1.1549459750928384, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 12562 + }, + { + "epoch": 0.12563, + "grad_norm": 0.7812249219085289, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12563 + }, + { + "epoch": 0.12564, + "grad_norm": 0.6703423951202678, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 12564 + }, + { + "epoch": 0.12565, + "grad_norm": 0.8357079073240602, + "learning_rate": 0.003, + "loss": 4.087, + "step": 12565 + }, + { + "epoch": 0.12566, + "grad_norm": 0.9552000107530532, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 12566 + }, + { + "epoch": 0.12567, + "grad_norm": 0.9355841374475491, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 12567 + }, + { + "epoch": 0.12568, + "grad_norm": 0.8066419116118289, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 12568 + }, + { + "epoch": 0.12569, + "grad_norm": 0.7235825525019571, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 12569 + }, + { + "epoch": 0.1257, + "grad_norm": 0.7580888885201048, + "learning_rate": 0.003, + "loss": 4.1195, + "step": 12570 + }, + { + "epoch": 0.12571, + "grad_norm": 0.8107129972338145, + "learning_rate": 0.003, + "loss": 4.087, + "step": 12571 + }, + { + "epoch": 0.12572, + "grad_norm": 0.930245214850763, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 12572 + }, + { + "epoch": 0.12573, + "grad_norm": 1.2684625702329895, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12573 + }, + { + "epoch": 0.12574, + "grad_norm": 0.8440354075216758, + "learning_rate": 0.003, + "loss": 4.089, + "step": 12574 + }, + { + "epoch": 0.12575, + "grad_norm": 0.6315786225056687, + "learning_rate": 0.003, + "loss": 4.112, + "step": 12575 + }, + { + "epoch": 0.12576, + "grad_norm": 0.7041702276896806, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 12576 + }, + { + "epoch": 0.12577, + "grad_norm": 0.7647414725186429, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 12577 + }, + { + "epoch": 0.12578, + "grad_norm": 0.8551047957220735, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 12578 + }, + { + "epoch": 0.12579, + "grad_norm": 0.866404723267827, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12579 + }, + { + "epoch": 0.1258, + "grad_norm": 0.8809057071597594, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 12580 + }, + { + "epoch": 0.12581, + "grad_norm": 0.9915394714466932, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 12581 + }, + { + "epoch": 0.12582, + "grad_norm": 1.1197125321519874, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 12582 + }, + { + "epoch": 0.12583, + "grad_norm": 1.0929628884873095, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 12583 + }, + { + "epoch": 0.12584, + "grad_norm": 0.8525918323487954, + "learning_rate": 0.003, + "loss": 4.062, + "step": 12584 + }, + { + "epoch": 0.12585, + "grad_norm": 0.7374588766012489, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 12585 + }, + { + "epoch": 0.12586, + "grad_norm": 0.7388595932129336, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 12586 + }, + { + "epoch": 0.12587, + "grad_norm": 0.7639845260876715, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 12587 + }, + { + "epoch": 0.12588, + "grad_norm": 0.9487270636440439, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12588 + }, + { + "epoch": 0.12589, + "grad_norm": 1.0956858061928865, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 12589 + }, + { + "epoch": 0.1259, + "grad_norm": 0.9630797615431528, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 12590 + }, + { + "epoch": 0.12591, + "grad_norm": 0.9493875531469873, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12591 + }, + { + "epoch": 0.12592, + "grad_norm": 0.8389151739871537, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12592 + }, + { + "epoch": 0.12593, + "grad_norm": 0.7908775788009506, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 12593 + }, + { + "epoch": 0.12594, + "grad_norm": 0.7313439476606562, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12594 + }, + { + "epoch": 0.12595, + "grad_norm": 0.8468426952635264, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 12595 + }, + { + "epoch": 0.12596, + "grad_norm": 0.821865089793791, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 12596 + }, + { + "epoch": 0.12597, + "grad_norm": 0.9392946494153627, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 12597 + }, + { + "epoch": 0.12598, + "grad_norm": 0.9931903669329258, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 12598 + }, + { + "epoch": 0.12599, + "grad_norm": 0.9262748105070246, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 12599 + }, + { + "epoch": 0.126, + "grad_norm": 0.7207751396428811, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 12600 + }, + { + "epoch": 0.12601, + "grad_norm": 0.6901411251358255, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 12601 + }, + { + "epoch": 0.12602, + "grad_norm": 0.6198097139667673, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12602 + }, + { + "epoch": 0.12603, + "grad_norm": 0.603433732393143, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 12603 + }, + { + "epoch": 0.12604, + "grad_norm": 0.7273903809849699, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 12604 + }, + { + "epoch": 0.12605, + "grad_norm": 0.827027777018654, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12605 + }, + { + "epoch": 0.12606, + "grad_norm": 0.8312039632390354, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 12606 + }, + { + "epoch": 0.12607, + "grad_norm": 0.8300227267934658, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 12607 + }, + { + "epoch": 0.12608, + "grad_norm": 0.8334629282650833, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 12608 + }, + { + "epoch": 0.12609, + "grad_norm": 0.8244190523319734, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12609 + }, + { + "epoch": 0.1261, + "grad_norm": 0.7931709246643678, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 12610 + }, + { + "epoch": 0.12611, + "grad_norm": 0.8401160267227625, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 12611 + }, + { + "epoch": 0.12612, + "grad_norm": 0.9085808627649283, + "learning_rate": 0.003, + "loss": 4.069, + "step": 12612 + }, + { + "epoch": 0.12613, + "grad_norm": 0.9173499827636392, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 12613 + }, + { + "epoch": 0.12614, + "grad_norm": 1.0897131711310648, + "learning_rate": 0.003, + "loss": 4.096, + "step": 12614 + }, + { + "epoch": 0.12615, + "grad_norm": 0.9364496572065395, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 12615 + }, + { + "epoch": 0.12616, + "grad_norm": 0.9667681960767411, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 12616 + }, + { + "epoch": 0.12617, + "grad_norm": 1.0303928702195748, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 12617 + }, + { + "epoch": 0.12618, + "grad_norm": 0.9436246602241444, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 12618 + }, + { + "epoch": 0.12619, + "grad_norm": 0.8890093706854717, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12619 + }, + { + "epoch": 0.1262, + "grad_norm": 0.9333633955100752, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 12620 + }, + { + "epoch": 0.12621, + "grad_norm": 0.9322594488051535, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 12621 + }, + { + "epoch": 0.12622, + "grad_norm": 0.9033426657643931, + "learning_rate": 0.003, + "loss": 4.085, + "step": 12622 + }, + { + "epoch": 0.12623, + "grad_norm": 0.8078614711868852, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 12623 + }, + { + "epoch": 0.12624, + "grad_norm": 0.7553219433703763, + "learning_rate": 0.003, + "loss": 4.07, + "step": 12624 + }, + { + "epoch": 0.12625, + "grad_norm": 0.6536387376296591, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 12625 + }, + { + "epoch": 0.12626, + "grad_norm": 0.6509050382838308, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 12626 + }, + { + "epoch": 0.12627, + "grad_norm": 0.5894424757023002, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12627 + }, + { + "epoch": 0.12628, + "grad_norm": 0.5190451966239275, + "learning_rate": 0.003, + "loss": 4.046, + "step": 12628 + }, + { + "epoch": 0.12629, + "grad_norm": 0.5063596654288052, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 12629 + }, + { + "epoch": 0.1263, + "grad_norm": 0.5915460424015676, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 12630 + }, + { + "epoch": 0.12631, + "grad_norm": 0.7716495330152952, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 12631 + }, + { + "epoch": 0.12632, + "grad_norm": 1.0127677704631437, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 12632 + }, + { + "epoch": 0.12633, + "grad_norm": 1.155960196618749, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 12633 + }, + { + "epoch": 0.12634, + "grad_norm": 0.6767776447411626, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 12634 + }, + { + "epoch": 0.12635, + "grad_norm": 0.6486610587694518, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 12635 + }, + { + "epoch": 0.12636, + "grad_norm": 0.9413168441927956, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12636 + }, + { + "epoch": 0.12637, + "grad_norm": 1.1497099524548084, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 12637 + }, + { + "epoch": 0.12638, + "grad_norm": 0.841578834111071, + "learning_rate": 0.003, + "loss": 4.081, + "step": 12638 + }, + { + "epoch": 0.12639, + "grad_norm": 0.8511860535824317, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 12639 + }, + { + "epoch": 0.1264, + "grad_norm": 0.9587860814417453, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 12640 + }, + { + "epoch": 0.12641, + "grad_norm": 1.0399472699458243, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 12641 + }, + { + "epoch": 0.12642, + "grad_norm": 0.904514659473879, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12642 + }, + { + "epoch": 0.12643, + "grad_norm": 0.8968843408912103, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 12643 + }, + { + "epoch": 0.12644, + "grad_norm": 0.9694460812122874, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 12644 + }, + { + "epoch": 0.12645, + "grad_norm": 1.1479583275761367, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 12645 + }, + { + "epoch": 0.12646, + "grad_norm": 0.9028210380164542, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 12646 + }, + { + "epoch": 0.12647, + "grad_norm": 0.8392746175735948, + "learning_rate": 0.003, + "loss": 4.068, + "step": 12647 + }, + { + "epoch": 0.12648, + "grad_norm": 0.9050343448598624, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 12648 + }, + { + "epoch": 0.12649, + "grad_norm": 1.0979731835477473, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 12649 + }, + { + "epoch": 0.1265, + "grad_norm": 0.9970749819263638, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 12650 + }, + { + "epoch": 0.12651, + "grad_norm": 1.0300097240371, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 12651 + }, + { + "epoch": 0.12652, + "grad_norm": 0.8094128941912325, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 12652 + }, + { + "epoch": 0.12653, + "grad_norm": 0.8399627373921515, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 12653 + }, + { + "epoch": 0.12654, + "grad_norm": 0.9677214469576224, + "learning_rate": 0.003, + "loss": 4.079, + "step": 12654 + }, + { + "epoch": 0.12655, + "grad_norm": 0.8876434643959279, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 12655 + }, + { + "epoch": 0.12656, + "grad_norm": 0.7688755791017492, + "learning_rate": 0.003, + "loss": 4.1134, + "step": 12656 + }, + { + "epoch": 0.12657, + "grad_norm": 0.7627679301180477, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 12657 + }, + { + "epoch": 0.12658, + "grad_norm": 0.7358996935502059, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 12658 + }, + { + "epoch": 0.12659, + "grad_norm": 0.777638381282783, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 12659 + }, + { + "epoch": 0.1266, + "grad_norm": 0.7263923280451818, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 12660 + }, + { + "epoch": 0.12661, + "grad_norm": 0.7179583472088668, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 12661 + }, + { + "epoch": 0.12662, + "grad_norm": 0.6501556131988983, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 12662 + }, + { + "epoch": 0.12663, + "grad_norm": 0.6406902271699736, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 12663 + }, + { + "epoch": 0.12664, + "grad_norm": 0.6080111377564112, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 12664 + }, + { + "epoch": 0.12665, + "grad_norm": 0.6036312883406694, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 12665 + }, + { + "epoch": 0.12666, + "grad_norm": 0.5768466033785397, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 12666 + }, + { + "epoch": 0.12667, + "grad_norm": 0.6576162442410906, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 12667 + }, + { + "epoch": 0.12668, + "grad_norm": 0.8097843106878837, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 12668 + }, + { + "epoch": 0.12669, + "grad_norm": 0.9462883400550094, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 12669 + }, + { + "epoch": 0.1267, + "grad_norm": 0.9608897301195578, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 12670 + }, + { + "epoch": 0.12671, + "grad_norm": 1.0340855127401694, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 12671 + }, + { + "epoch": 0.12672, + "grad_norm": 1.1531000426119538, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 12672 + }, + { + "epoch": 0.12673, + "grad_norm": 0.8261766238933164, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 12673 + }, + { + "epoch": 0.12674, + "grad_norm": 0.6920260802570728, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 12674 + }, + { + "epoch": 0.12675, + "grad_norm": 0.6335380850494294, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 12675 + }, + { + "epoch": 0.12676, + "grad_norm": 0.7209542624800619, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 12676 + }, + { + "epoch": 0.12677, + "grad_norm": 0.87268806862565, + "learning_rate": 0.003, + "loss": 4.049, + "step": 12677 + }, + { + "epoch": 0.12678, + "grad_norm": 0.8840380040948691, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 12678 + }, + { + "epoch": 0.12679, + "grad_norm": 0.9747808622851675, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 12679 + }, + { + "epoch": 0.1268, + "grad_norm": 1.063767639110145, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12680 + }, + { + "epoch": 0.12681, + "grad_norm": 0.9301110174392209, + "learning_rate": 0.003, + "loss": 4.091, + "step": 12681 + }, + { + "epoch": 0.12682, + "grad_norm": 0.9316617602133014, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 12682 + }, + { + "epoch": 0.12683, + "grad_norm": 0.8621769611167813, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 12683 + }, + { + "epoch": 0.12684, + "grad_norm": 0.8619352180499619, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 12684 + }, + { + "epoch": 0.12685, + "grad_norm": 0.7818292664604228, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 12685 + }, + { + "epoch": 0.12686, + "grad_norm": 0.831157545940667, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 12686 + }, + { + "epoch": 0.12687, + "grad_norm": 0.9587347990085802, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 12687 + }, + { + "epoch": 0.12688, + "grad_norm": 1.0579182796949145, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 12688 + }, + { + "epoch": 0.12689, + "grad_norm": 0.9786164686866942, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12689 + }, + { + "epoch": 0.1269, + "grad_norm": 0.863318584753949, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 12690 + }, + { + "epoch": 0.12691, + "grad_norm": 0.8304504158648132, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 12691 + }, + { + "epoch": 0.12692, + "grad_norm": 0.7904162445080669, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 12692 + }, + { + "epoch": 0.12693, + "grad_norm": 0.7862023906149539, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 12693 + }, + { + "epoch": 0.12694, + "grad_norm": 0.8898702567841869, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 12694 + }, + { + "epoch": 0.12695, + "grad_norm": 0.9731601117378544, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 12695 + }, + { + "epoch": 0.12696, + "grad_norm": 0.9838657840018509, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12696 + }, + { + "epoch": 0.12697, + "grad_norm": 0.8410138578490889, + "learning_rate": 0.003, + "loss": 4.044, + "step": 12697 + }, + { + "epoch": 0.12698, + "grad_norm": 0.7824779930609662, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 12698 + }, + { + "epoch": 0.12699, + "grad_norm": 0.7325758562041346, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 12699 + }, + { + "epoch": 0.127, + "grad_norm": 0.7476038265054501, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 12700 + }, + { + "epoch": 0.12701, + "grad_norm": 0.6474234218140162, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 12701 + }, + { + "epoch": 0.12702, + "grad_norm": 0.5994708210940758, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 12702 + }, + { + "epoch": 0.12703, + "grad_norm": 0.5947004248947164, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 12703 + }, + { + "epoch": 0.12704, + "grad_norm": 0.5822643700856132, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 12704 + }, + { + "epoch": 0.12705, + "grad_norm": 0.6375257904131024, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12705 + }, + { + "epoch": 0.12706, + "grad_norm": 0.8232573084375202, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 12706 + }, + { + "epoch": 0.12707, + "grad_norm": 1.0490864377683808, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12707 + }, + { + "epoch": 0.12708, + "grad_norm": 0.9646458821564334, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 12708 + }, + { + "epoch": 0.12709, + "grad_norm": 0.9237631362442041, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 12709 + }, + { + "epoch": 0.1271, + "grad_norm": 1.1018179181554677, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12710 + }, + { + "epoch": 0.12711, + "grad_norm": 1.0497471397953404, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 12711 + }, + { + "epoch": 0.12712, + "grad_norm": 0.9068318922074851, + "learning_rate": 0.003, + "loss": 4.066, + "step": 12712 + }, + { + "epoch": 0.12713, + "grad_norm": 0.8587292207217677, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 12713 + }, + { + "epoch": 0.12714, + "grad_norm": 0.7722901403512779, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 12714 + }, + { + "epoch": 0.12715, + "grad_norm": 0.6293681040156622, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 12715 + }, + { + "epoch": 0.12716, + "grad_norm": 0.6214351278871658, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 12716 + }, + { + "epoch": 0.12717, + "grad_norm": 0.6163124174025716, + "learning_rate": 0.003, + "loss": 4.083, + "step": 12717 + }, + { + "epoch": 0.12718, + "grad_norm": 0.7295122022742531, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12718 + }, + { + "epoch": 0.12719, + "grad_norm": 0.7489060106943374, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 12719 + }, + { + "epoch": 0.1272, + "grad_norm": 0.8774515707694688, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 12720 + }, + { + "epoch": 0.12721, + "grad_norm": 1.107975365365161, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 12721 + }, + { + "epoch": 0.12722, + "grad_norm": 1.0814064376376955, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 12722 + }, + { + "epoch": 0.12723, + "grad_norm": 0.7852302860828565, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 12723 + }, + { + "epoch": 0.12724, + "grad_norm": 0.7165640244151582, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 12724 + }, + { + "epoch": 0.12725, + "grad_norm": 0.6870728656197633, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 12725 + }, + { + "epoch": 0.12726, + "grad_norm": 0.6875586946041499, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 12726 + }, + { + "epoch": 0.12727, + "grad_norm": 0.6534996603381649, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 12727 + }, + { + "epoch": 0.12728, + "grad_norm": 0.7220811580946503, + "learning_rate": 0.003, + "loss": 4.078, + "step": 12728 + }, + { + "epoch": 0.12729, + "grad_norm": 0.8525327543524897, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 12729 + }, + { + "epoch": 0.1273, + "grad_norm": 1.0524089976707018, + "learning_rate": 0.003, + "loss": 4.073, + "step": 12730 + }, + { + "epoch": 0.12731, + "grad_norm": 1.253081012855374, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 12731 + }, + { + "epoch": 0.12732, + "grad_norm": 0.7417151562374673, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 12732 + }, + { + "epoch": 0.12733, + "grad_norm": 0.7611871014784876, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 12733 + }, + { + "epoch": 0.12734, + "grad_norm": 0.8215105248523885, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 12734 + }, + { + "epoch": 0.12735, + "grad_norm": 0.8238174222975156, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 12735 + }, + { + "epoch": 0.12736, + "grad_norm": 0.8048380654708065, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 12736 + }, + { + "epoch": 0.12737, + "grad_norm": 0.71805881185348, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12737 + }, + { + "epoch": 0.12738, + "grad_norm": 0.803432731777592, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 12738 + }, + { + "epoch": 0.12739, + "grad_norm": 0.7892230046494437, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 12739 + }, + { + "epoch": 0.1274, + "grad_norm": 0.8432138331722246, + "learning_rate": 0.003, + "loss": 4.067, + "step": 12740 + }, + { + "epoch": 0.12741, + "grad_norm": 0.8419812098722906, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 12741 + }, + { + "epoch": 0.12742, + "grad_norm": 0.9038260065352892, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 12742 + }, + { + "epoch": 0.12743, + "grad_norm": 0.9751762842945431, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 12743 + }, + { + "epoch": 0.12744, + "grad_norm": 0.9720053831555276, + "learning_rate": 0.003, + "loss": 4.066, + "step": 12744 + }, + { + "epoch": 0.12745, + "grad_norm": 0.9544614546080166, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 12745 + }, + { + "epoch": 0.12746, + "grad_norm": 0.9481144845024555, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 12746 + }, + { + "epoch": 0.12747, + "grad_norm": 0.9534086282439148, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 12747 + }, + { + "epoch": 0.12748, + "grad_norm": 0.7921609079785911, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 12748 + }, + { + "epoch": 0.12749, + "grad_norm": 0.7341232190175854, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 12749 + }, + { + "epoch": 0.1275, + "grad_norm": 0.7934210242478213, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 12750 + }, + { + "epoch": 0.12751, + "grad_norm": 0.8361610882033421, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 12751 + }, + { + "epoch": 0.12752, + "grad_norm": 0.8533153979514542, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 12752 + }, + { + "epoch": 0.12753, + "grad_norm": 0.8429252720795538, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 12753 + }, + { + "epoch": 0.12754, + "grad_norm": 0.8936232536103126, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 12754 + }, + { + "epoch": 0.12755, + "grad_norm": 0.9624691229469716, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 12755 + }, + { + "epoch": 0.12756, + "grad_norm": 1.0371851035955135, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 12756 + }, + { + "epoch": 0.12757, + "grad_norm": 1.045476061843089, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 12757 + }, + { + "epoch": 0.12758, + "grad_norm": 1.07435159385966, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12758 + }, + { + "epoch": 0.12759, + "grad_norm": 1.081798623831364, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 12759 + }, + { + "epoch": 0.1276, + "grad_norm": 0.9703479152295932, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 12760 + }, + { + "epoch": 0.12761, + "grad_norm": 1.0968494337077377, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 12761 + }, + { + "epoch": 0.12762, + "grad_norm": 0.9243545413233614, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 12762 + }, + { + "epoch": 0.12763, + "grad_norm": 0.9520404423737605, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 12763 + }, + { + "epoch": 0.12764, + "grad_norm": 0.9195906991550372, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 12764 + }, + { + "epoch": 0.12765, + "grad_norm": 0.9882817414585594, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 12765 + }, + { + "epoch": 0.12766, + "grad_norm": 1.0954239881165995, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 12766 + }, + { + "epoch": 0.12767, + "grad_norm": 0.9642797317457092, + "learning_rate": 0.003, + "loss": 4.089, + "step": 12767 + }, + { + "epoch": 0.12768, + "grad_norm": 0.9065007479346235, + "learning_rate": 0.003, + "loss": 4.1076, + "step": 12768 + }, + { + "epoch": 0.12769, + "grad_norm": 0.8194405556326955, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 12769 + }, + { + "epoch": 0.1277, + "grad_norm": 0.6593942198315941, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 12770 + }, + { + "epoch": 0.12771, + "grad_norm": 0.6359885378281872, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 12771 + }, + { + "epoch": 0.12772, + "grad_norm": 0.5624414834893711, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12772 + }, + { + "epoch": 0.12773, + "grad_norm": 0.5534065103167038, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 12773 + }, + { + "epoch": 0.12774, + "grad_norm": 0.5263871455787446, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 12774 + }, + { + "epoch": 0.12775, + "grad_norm": 0.5001264134923619, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 12775 + }, + { + "epoch": 0.12776, + "grad_norm": 0.4965004417569365, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 12776 + }, + { + "epoch": 0.12777, + "grad_norm": 0.6427303533809592, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 12777 + }, + { + "epoch": 0.12778, + "grad_norm": 0.827424161741527, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 12778 + }, + { + "epoch": 0.12779, + "grad_norm": 0.9720714915404918, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 12779 + }, + { + "epoch": 0.1278, + "grad_norm": 1.2273812453673385, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 12780 + }, + { + "epoch": 0.12781, + "grad_norm": 0.7903980992873841, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 12781 + }, + { + "epoch": 0.12782, + "grad_norm": 0.7933372670231379, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 12782 + }, + { + "epoch": 0.12783, + "grad_norm": 0.9056791732628652, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 12783 + }, + { + "epoch": 0.12784, + "grad_norm": 0.9379469704928732, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12784 + }, + { + "epoch": 0.12785, + "grad_norm": 0.8312325160204573, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 12785 + }, + { + "epoch": 0.12786, + "grad_norm": 0.811516624713473, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12786 + }, + { + "epoch": 0.12787, + "grad_norm": 0.8151714149466239, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 12787 + }, + { + "epoch": 0.12788, + "grad_norm": 0.9047057444695532, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 12788 + }, + { + "epoch": 0.12789, + "grad_norm": 1.0056745263984743, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 12789 + }, + { + "epoch": 0.1279, + "grad_norm": 1.007074405841303, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 12790 + }, + { + "epoch": 0.12791, + "grad_norm": 0.9098293337743794, + "learning_rate": 0.003, + "loss": 4.086, + "step": 12791 + }, + { + "epoch": 0.12792, + "grad_norm": 0.8581978897830528, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 12792 + }, + { + "epoch": 0.12793, + "grad_norm": 0.7387993654938598, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 12793 + }, + { + "epoch": 0.12794, + "grad_norm": 0.8717184426974447, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12794 + }, + { + "epoch": 0.12795, + "grad_norm": 0.9609144342761544, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 12795 + }, + { + "epoch": 0.12796, + "grad_norm": 0.9585935249627756, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 12796 + }, + { + "epoch": 0.12797, + "grad_norm": 0.8826724398226735, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 12797 + }, + { + "epoch": 0.12798, + "grad_norm": 0.7823401183104872, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 12798 + }, + { + "epoch": 0.12799, + "grad_norm": 0.768598197070406, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 12799 + }, + { + "epoch": 0.128, + "grad_norm": 0.7938414804643384, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 12800 + }, + { + "epoch": 0.12801, + "grad_norm": 0.9525671530334016, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 12801 + }, + { + "epoch": 0.12802, + "grad_norm": 1.13163526424453, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 12802 + }, + { + "epoch": 0.12803, + "grad_norm": 1.0032091422069624, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 12803 + }, + { + "epoch": 0.12804, + "grad_norm": 0.9508845934766839, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 12804 + }, + { + "epoch": 0.12805, + "grad_norm": 0.7860804655030508, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 12805 + }, + { + "epoch": 0.12806, + "grad_norm": 0.8292491651003593, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 12806 + }, + { + "epoch": 0.12807, + "grad_norm": 0.7773321265433382, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 12807 + }, + { + "epoch": 0.12808, + "grad_norm": 0.8264676701429482, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 12808 + }, + { + "epoch": 0.12809, + "grad_norm": 0.9778809971197301, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 12809 + }, + { + "epoch": 0.1281, + "grad_norm": 1.051517774945295, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12810 + }, + { + "epoch": 0.12811, + "grad_norm": 0.8186785474711578, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 12811 + }, + { + "epoch": 0.12812, + "grad_norm": 0.7508226536859004, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 12812 + }, + { + "epoch": 0.12813, + "grad_norm": 0.6934791634966413, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 12813 + }, + { + "epoch": 0.12814, + "grad_norm": 0.7893377048543236, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 12814 + }, + { + "epoch": 0.12815, + "grad_norm": 0.8338115760468434, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 12815 + }, + { + "epoch": 0.12816, + "grad_norm": 0.8479740993884198, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 12816 + }, + { + "epoch": 0.12817, + "grad_norm": 0.9431496997904292, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 12817 + }, + { + "epoch": 0.12818, + "grad_norm": 0.8920237331550439, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 12818 + }, + { + "epoch": 0.12819, + "grad_norm": 0.7156508642588407, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 12819 + }, + { + "epoch": 0.1282, + "grad_norm": 0.6066254421966263, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 12820 + }, + { + "epoch": 0.12821, + "grad_norm": 0.6351940607150885, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 12821 + }, + { + "epoch": 0.12822, + "grad_norm": 0.7034877260393382, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 12822 + }, + { + "epoch": 0.12823, + "grad_norm": 0.6347898413970317, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 12823 + }, + { + "epoch": 0.12824, + "grad_norm": 0.6241083299387901, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 12824 + }, + { + "epoch": 0.12825, + "grad_norm": 0.6756292702600858, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 12825 + }, + { + "epoch": 0.12826, + "grad_norm": 0.8197715713923135, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 12826 + }, + { + "epoch": 0.12827, + "grad_norm": 0.9710675544326205, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 12827 + }, + { + "epoch": 0.12828, + "grad_norm": 1.1480053353882405, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 12828 + }, + { + "epoch": 0.12829, + "grad_norm": 0.7644574256098923, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 12829 + }, + { + "epoch": 0.1283, + "grad_norm": 0.6388582020514061, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12830 + }, + { + "epoch": 0.12831, + "grad_norm": 0.635757694719699, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 12831 + }, + { + "epoch": 0.12832, + "grad_norm": 0.5750405720226927, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 12832 + }, + { + "epoch": 0.12833, + "grad_norm": 0.6023010902518902, + "learning_rate": 0.003, + "loss": 4.1065, + "step": 12833 + }, + { + "epoch": 0.12834, + "grad_norm": 0.6011194316530417, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 12834 + }, + { + "epoch": 0.12835, + "grad_norm": 0.638985093365683, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 12835 + }, + { + "epoch": 0.12836, + "grad_norm": 0.777352893414973, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 12836 + }, + { + "epoch": 0.12837, + "grad_norm": 0.8026043171894651, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 12837 + }, + { + "epoch": 0.12838, + "grad_norm": 0.7360736772156009, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 12838 + }, + { + "epoch": 0.12839, + "grad_norm": 0.9227694794787703, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 12839 + }, + { + "epoch": 0.1284, + "grad_norm": 1.0628301651771763, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12840 + }, + { + "epoch": 0.12841, + "grad_norm": 0.9214693039919348, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 12841 + }, + { + "epoch": 0.12842, + "grad_norm": 0.9447539388483064, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 12842 + }, + { + "epoch": 0.12843, + "grad_norm": 0.9482242552067807, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 12843 + }, + { + "epoch": 0.12844, + "grad_norm": 0.8759196059051805, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 12844 + }, + { + "epoch": 0.12845, + "grad_norm": 0.8253334642549143, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 12845 + }, + { + "epoch": 0.12846, + "grad_norm": 0.9249682151513382, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 12846 + }, + { + "epoch": 0.12847, + "grad_norm": 0.965513311820476, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 12847 + }, + { + "epoch": 0.12848, + "grad_norm": 0.9461560906880512, + "learning_rate": 0.003, + "loss": 4.095, + "step": 12848 + }, + { + "epoch": 0.12849, + "grad_norm": 1.0196975758139848, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 12849 + }, + { + "epoch": 0.1285, + "grad_norm": 1.3554759866006791, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 12850 + }, + { + "epoch": 0.12851, + "grad_norm": 0.8908069491735001, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 12851 + }, + { + "epoch": 0.12852, + "grad_norm": 0.8475096520827629, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 12852 + }, + { + "epoch": 0.12853, + "grad_norm": 0.8234717310938925, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 12853 + }, + { + "epoch": 0.12854, + "grad_norm": 0.7358780400837106, + "learning_rate": 0.003, + "loss": 4.101, + "step": 12854 + }, + { + "epoch": 0.12855, + "grad_norm": 0.743786573155452, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 12855 + }, + { + "epoch": 0.12856, + "grad_norm": 0.7386790569167223, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 12856 + }, + { + "epoch": 0.12857, + "grad_norm": 0.7129780781483097, + "learning_rate": 0.003, + "loss": 4.057, + "step": 12857 + }, + { + "epoch": 0.12858, + "grad_norm": 0.7341165024646048, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 12858 + }, + { + "epoch": 0.12859, + "grad_norm": 0.8805065977562269, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 12859 + }, + { + "epoch": 0.1286, + "grad_norm": 1.067801959326891, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 12860 + }, + { + "epoch": 0.12861, + "grad_norm": 1.1242566935397247, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 12861 + }, + { + "epoch": 0.12862, + "grad_norm": 0.9426431915569224, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 12862 + }, + { + "epoch": 0.12863, + "grad_norm": 0.8841812950302461, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 12863 + }, + { + "epoch": 0.12864, + "grad_norm": 0.9420783722109335, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 12864 + }, + { + "epoch": 0.12865, + "grad_norm": 0.9631534265418081, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 12865 + }, + { + "epoch": 0.12866, + "grad_norm": 0.8454715636945396, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 12866 + }, + { + "epoch": 0.12867, + "grad_norm": 0.9503572689770518, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 12867 + }, + { + "epoch": 0.12868, + "grad_norm": 0.9047058078312504, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 12868 + }, + { + "epoch": 0.12869, + "grad_norm": 0.7069135144597288, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 12869 + }, + { + "epoch": 0.1287, + "grad_norm": 0.7834958846660894, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 12870 + }, + { + "epoch": 0.12871, + "grad_norm": 0.8358942142934565, + "learning_rate": 0.003, + "loss": 4.1196, + "step": 12871 + }, + { + "epoch": 0.12872, + "grad_norm": 1.027921339490834, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 12872 + }, + { + "epoch": 0.12873, + "grad_norm": 1.1544453029236919, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 12873 + }, + { + "epoch": 0.12874, + "grad_norm": 0.7813340075521686, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 12874 + }, + { + "epoch": 0.12875, + "grad_norm": 0.6801009675717281, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 12875 + }, + { + "epoch": 0.12876, + "grad_norm": 0.5836689953332413, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 12876 + }, + { + "epoch": 0.12877, + "grad_norm": 0.6791272029631606, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12877 + }, + { + "epoch": 0.12878, + "grad_norm": 0.7048056118590715, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 12878 + }, + { + "epoch": 0.12879, + "grad_norm": 0.8486441486568741, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 12879 + }, + { + "epoch": 0.1288, + "grad_norm": 1.062333230871585, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 12880 + }, + { + "epoch": 0.12881, + "grad_norm": 1.1208324918485693, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 12881 + }, + { + "epoch": 0.12882, + "grad_norm": 0.7273735247448398, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 12882 + }, + { + "epoch": 0.12883, + "grad_norm": 0.6281737882036764, + "learning_rate": 0.003, + "loss": 4.075, + "step": 12883 + }, + { + "epoch": 0.12884, + "grad_norm": 0.7477414465891695, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 12884 + }, + { + "epoch": 0.12885, + "grad_norm": 0.7890015805595267, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 12885 + }, + { + "epoch": 0.12886, + "grad_norm": 0.7046100214749329, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 12886 + }, + { + "epoch": 0.12887, + "grad_norm": 0.7418100470213351, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 12887 + }, + { + "epoch": 0.12888, + "grad_norm": 0.8826632618108597, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 12888 + }, + { + "epoch": 0.12889, + "grad_norm": 1.0399927608543118, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 12889 + }, + { + "epoch": 0.1289, + "grad_norm": 0.9699092371767827, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 12890 + }, + { + "epoch": 0.12891, + "grad_norm": 0.8861813940068809, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 12891 + }, + { + "epoch": 0.12892, + "grad_norm": 0.933256377383019, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12892 + }, + { + "epoch": 0.12893, + "grad_norm": 0.8541594423021338, + "learning_rate": 0.003, + "loss": 4.1217, + "step": 12893 + }, + { + "epoch": 0.12894, + "grad_norm": 0.9481820783573804, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 12894 + }, + { + "epoch": 0.12895, + "grad_norm": 0.8663082137737302, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 12895 + }, + { + "epoch": 0.12896, + "grad_norm": 0.8678060328249163, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 12896 + }, + { + "epoch": 0.12897, + "grad_norm": 0.9375534317661987, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 12897 + }, + { + "epoch": 0.12898, + "grad_norm": 1.046933704977625, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 12898 + }, + { + "epoch": 0.12899, + "grad_norm": 0.9438937247867584, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 12899 + }, + { + "epoch": 0.129, + "grad_norm": 0.8095108174709523, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 12900 + }, + { + "epoch": 0.12901, + "grad_norm": 0.8002431242863978, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 12901 + }, + { + "epoch": 0.12902, + "grad_norm": 0.8138692043697183, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 12902 + }, + { + "epoch": 0.12903, + "grad_norm": 0.8796085016606462, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 12903 + }, + { + "epoch": 0.12904, + "grad_norm": 0.9236062888538203, + "learning_rate": 0.003, + "loss": 4.046, + "step": 12904 + }, + { + "epoch": 0.12905, + "grad_norm": 0.8413774387686741, + "learning_rate": 0.003, + "loss": 4.068, + "step": 12905 + }, + { + "epoch": 0.12906, + "grad_norm": 0.9456752557198059, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12906 + }, + { + "epoch": 0.12907, + "grad_norm": 0.9542059686757122, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 12907 + }, + { + "epoch": 0.12908, + "grad_norm": 0.8969952436447906, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 12908 + }, + { + "epoch": 0.12909, + "grad_norm": 0.8660752120290028, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 12909 + }, + { + "epoch": 0.1291, + "grad_norm": 0.7206792488668173, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 12910 + }, + { + "epoch": 0.12911, + "grad_norm": 0.5894989401822226, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 12911 + }, + { + "epoch": 0.12912, + "grad_norm": 0.6632744636774794, + "learning_rate": 0.003, + "loss": 4.045, + "step": 12912 + }, + { + "epoch": 0.12913, + "grad_norm": 0.7989619794857723, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 12913 + }, + { + "epoch": 0.12914, + "grad_norm": 1.1599328636317552, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 12914 + }, + { + "epoch": 0.12915, + "grad_norm": 1.1144397017046104, + "learning_rate": 0.003, + "loss": 4.055, + "step": 12915 + }, + { + "epoch": 0.12916, + "grad_norm": 0.7532191660574785, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 12916 + }, + { + "epoch": 0.12917, + "grad_norm": 0.5736646240015582, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 12917 + }, + { + "epoch": 0.12918, + "grad_norm": 0.6677992780705447, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 12918 + }, + { + "epoch": 0.12919, + "grad_norm": 0.8075768288501416, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 12919 + }, + { + "epoch": 0.1292, + "grad_norm": 0.9430240478196712, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 12920 + }, + { + "epoch": 0.12921, + "grad_norm": 0.8289275041982015, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 12921 + }, + { + "epoch": 0.12922, + "grad_norm": 0.761314826384689, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 12922 + }, + { + "epoch": 0.12923, + "grad_norm": 0.8013300239881165, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 12923 + }, + { + "epoch": 0.12924, + "grad_norm": 0.7535540996983051, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 12924 + }, + { + "epoch": 0.12925, + "grad_norm": 0.7337997334087438, + "learning_rate": 0.003, + "loss": 4.071, + "step": 12925 + }, + { + "epoch": 0.12926, + "grad_norm": 0.6758098417641483, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 12926 + }, + { + "epoch": 0.12927, + "grad_norm": 0.6646773609405928, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 12927 + }, + { + "epoch": 0.12928, + "grad_norm": 0.7758042765328156, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 12928 + }, + { + "epoch": 0.12929, + "grad_norm": 0.9014493018144143, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 12929 + }, + { + "epoch": 0.1293, + "grad_norm": 0.9069207676053608, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 12930 + }, + { + "epoch": 0.12931, + "grad_norm": 0.8762834510823424, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 12931 + }, + { + "epoch": 0.12932, + "grad_norm": 0.9091437078713948, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12932 + }, + { + "epoch": 0.12933, + "grad_norm": 0.8633790914666007, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 12933 + }, + { + "epoch": 0.12934, + "grad_norm": 0.8055331303781942, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 12934 + }, + { + "epoch": 0.12935, + "grad_norm": 0.8179019459764888, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 12935 + }, + { + "epoch": 0.12936, + "grad_norm": 0.7862144587775611, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 12936 + }, + { + "epoch": 0.12937, + "grad_norm": 0.8099260534544216, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 12937 + }, + { + "epoch": 0.12938, + "grad_norm": 0.8650704844461392, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 12938 + }, + { + "epoch": 0.12939, + "grad_norm": 0.8769552795770255, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 12939 + }, + { + "epoch": 0.1294, + "grad_norm": 1.1069886038868555, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 12940 + }, + { + "epoch": 0.12941, + "grad_norm": 0.8768949926516877, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 12941 + }, + { + "epoch": 0.12942, + "grad_norm": 0.7580148325910523, + "learning_rate": 0.003, + "loss": 4.06, + "step": 12942 + }, + { + "epoch": 0.12943, + "grad_norm": 0.766777102968796, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 12943 + }, + { + "epoch": 0.12944, + "grad_norm": 0.7366808448795482, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 12944 + }, + { + "epoch": 0.12945, + "grad_norm": 0.680775920463159, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 12945 + }, + { + "epoch": 0.12946, + "grad_norm": 0.8120383621936467, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 12946 + }, + { + "epoch": 0.12947, + "grad_norm": 1.0223654037160204, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12947 + }, + { + "epoch": 0.12948, + "grad_norm": 1.169831218843661, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 12948 + }, + { + "epoch": 0.12949, + "grad_norm": 0.8200417608046442, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12949 + }, + { + "epoch": 0.1295, + "grad_norm": 0.7819459739999236, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 12950 + }, + { + "epoch": 0.12951, + "grad_norm": 0.8241898175736854, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 12951 + }, + { + "epoch": 0.12952, + "grad_norm": 0.7630068419542384, + "learning_rate": 0.003, + "loss": 4.062, + "step": 12952 + }, + { + "epoch": 0.12953, + "grad_norm": 0.7052265617346959, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 12953 + }, + { + "epoch": 0.12954, + "grad_norm": 0.6481834232325121, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 12954 + }, + { + "epoch": 0.12955, + "grad_norm": 0.5027166237042523, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 12955 + }, + { + "epoch": 0.12956, + "grad_norm": 0.6150068430572803, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 12956 + }, + { + "epoch": 0.12957, + "grad_norm": 0.7659736446842879, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 12957 + }, + { + "epoch": 0.12958, + "grad_norm": 0.9386359161422575, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 12958 + }, + { + "epoch": 0.12959, + "grad_norm": 1.0537360487182426, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 12959 + }, + { + "epoch": 0.1296, + "grad_norm": 0.861816070614856, + "learning_rate": 0.003, + "loss": 4.065, + "step": 12960 + }, + { + "epoch": 0.12961, + "grad_norm": 0.7717231801413187, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 12961 + }, + { + "epoch": 0.12962, + "grad_norm": 0.6888662908134324, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 12962 + }, + { + "epoch": 0.12963, + "grad_norm": 0.6496168495428045, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 12963 + }, + { + "epoch": 0.12964, + "grad_norm": 0.7498354228466765, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 12964 + }, + { + "epoch": 0.12965, + "grad_norm": 0.8796226972801704, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 12965 + }, + { + "epoch": 0.12966, + "grad_norm": 0.9162752314847318, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 12966 + }, + { + "epoch": 0.12967, + "grad_norm": 0.949790235599875, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 12967 + }, + { + "epoch": 0.12968, + "grad_norm": 0.9381701683390344, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 12968 + }, + { + "epoch": 0.12969, + "grad_norm": 1.0926731768455018, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 12969 + }, + { + "epoch": 0.1297, + "grad_norm": 0.8945847611525154, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 12970 + }, + { + "epoch": 0.12971, + "grad_norm": 0.8190152391461676, + "learning_rate": 0.003, + "loss": 4.068, + "step": 12971 + }, + { + "epoch": 0.12972, + "grad_norm": 0.7558570658616437, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 12972 + }, + { + "epoch": 0.12973, + "grad_norm": 0.8469493343029718, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 12973 + }, + { + "epoch": 0.12974, + "grad_norm": 1.0091381673217301, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 12974 + }, + { + "epoch": 0.12975, + "grad_norm": 1.1879356593057941, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 12975 + }, + { + "epoch": 0.12976, + "grad_norm": 1.0384012763491532, + "learning_rate": 0.003, + "loss": 4.1009, + "step": 12976 + }, + { + "epoch": 0.12977, + "grad_norm": 1.0360118231586506, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 12977 + }, + { + "epoch": 0.12978, + "grad_norm": 1.0249444072596572, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 12978 + }, + { + "epoch": 0.12979, + "grad_norm": 1.091963358540554, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 12979 + }, + { + "epoch": 0.1298, + "grad_norm": 0.889879935839374, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 12980 + }, + { + "epoch": 0.12981, + "grad_norm": 0.860354677563818, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 12981 + }, + { + "epoch": 0.12982, + "grad_norm": 0.9381865988928322, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 12982 + }, + { + "epoch": 0.12983, + "grad_norm": 1.0938494344300176, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 12983 + }, + { + "epoch": 0.12984, + "grad_norm": 0.9402280239919303, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 12984 + }, + { + "epoch": 0.12985, + "grad_norm": 0.8944998677862985, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 12985 + }, + { + "epoch": 0.12986, + "grad_norm": 0.8105911122210153, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 12986 + }, + { + "epoch": 0.12987, + "grad_norm": 0.9400223837798752, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 12987 + }, + { + "epoch": 0.12988, + "grad_norm": 1.1004084188466887, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 12988 + }, + { + "epoch": 0.12989, + "grad_norm": 0.9366609708033388, + "learning_rate": 0.003, + "loss": 4.084, + "step": 12989 + }, + { + "epoch": 0.1299, + "grad_norm": 1.0336862582703732, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 12990 + }, + { + "epoch": 0.12991, + "grad_norm": 0.9608165821353573, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 12991 + }, + { + "epoch": 0.12992, + "grad_norm": 0.9952602100669985, + "learning_rate": 0.003, + "loss": 4.1035, + "step": 12992 + }, + { + "epoch": 0.12993, + "grad_norm": 1.0227492748641744, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 12993 + }, + { + "epoch": 0.12994, + "grad_norm": 0.7880943350209298, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 12994 + }, + { + "epoch": 0.12995, + "grad_norm": 0.790431399092981, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 12995 + }, + { + "epoch": 0.12996, + "grad_norm": 0.9321800086828976, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 12996 + }, + { + "epoch": 0.12997, + "grad_norm": 0.8615839377992883, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 12997 + }, + { + "epoch": 0.12998, + "grad_norm": 0.9067442775911201, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 12998 + }, + { + "epoch": 0.12999, + "grad_norm": 0.8373932866515066, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 12999 + }, + { + "epoch": 0.13, + "grad_norm": 0.8540315642567873, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13000 + }, + { + "epoch": 0.13001, + "grad_norm": 0.9359698174485682, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 13001 + }, + { + "epoch": 0.13002, + "grad_norm": 0.9592830153337547, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 13002 + }, + { + "epoch": 0.13003, + "grad_norm": 1.046448617999635, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 13003 + }, + { + "epoch": 0.13004, + "grad_norm": 1.1041883637967569, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 13004 + }, + { + "epoch": 0.13005, + "grad_norm": 1.0369515453918619, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 13005 + }, + { + "epoch": 0.13006, + "grad_norm": 0.9979735094558032, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 13006 + }, + { + "epoch": 0.13007, + "grad_norm": 1.0611639538018411, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 13007 + }, + { + "epoch": 0.13008, + "grad_norm": 0.8201878202049163, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 13008 + }, + { + "epoch": 0.13009, + "grad_norm": 0.7118108357889752, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 13009 + }, + { + "epoch": 0.1301, + "grad_norm": 0.6629440697703289, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 13010 + }, + { + "epoch": 0.13011, + "grad_norm": 0.6007149610370014, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13011 + }, + { + "epoch": 0.13012, + "grad_norm": 0.6921546080483328, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 13012 + }, + { + "epoch": 0.13013, + "grad_norm": 0.7107746587669299, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 13013 + }, + { + "epoch": 0.13014, + "grad_norm": 0.6984294373117751, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13014 + }, + { + "epoch": 0.13015, + "grad_norm": 0.6789059948176281, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 13015 + }, + { + "epoch": 0.13016, + "grad_norm": 0.6844927291604583, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 13016 + }, + { + "epoch": 0.13017, + "grad_norm": 0.6431965019570215, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 13017 + }, + { + "epoch": 0.13018, + "grad_norm": 0.6011039426413707, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 13018 + }, + { + "epoch": 0.13019, + "grad_norm": 0.5103307281775522, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 13019 + }, + { + "epoch": 0.1302, + "grad_norm": 0.5353798392896162, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 13020 + }, + { + "epoch": 0.13021, + "grad_norm": 0.548977835450416, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 13021 + }, + { + "epoch": 0.13022, + "grad_norm": 0.5161434070433679, + "learning_rate": 0.003, + "loss": 4.078, + "step": 13022 + }, + { + "epoch": 0.13023, + "grad_norm": 0.5269826526927525, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 13023 + }, + { + "epoch": 0.13024, + "grad_norm": 0.6259089187899785, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 13024 + }, + { + "epoch": 0.13025, + "grad_norm": 0.8800244285647391, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 13025 + }, + { + "epoch": 0.13026, + "grad_norm": 1.2491259496586147, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 13026 + }, + { + "epoch": 0.13027, + "grad_norm": 0.9315497999331711, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 13027 + }, + { + "epoch": 0.13028, + "grad_norm": 0.8417572856152467, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 13028 + }, + { + "epoch": 0.13029, + "grad_norm": 0.8742979611520736, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13029 + }, + { + "epoch": 0.1303, + "grad_norm": 0.7711651552812404, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 13030 + }, + { + "epoch": 0.13031, + "grad_norm": 0.803693913087942, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 13031 + }, + { + "epoch": 0.13032, + "grad_norm": 0.9716408104367728, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 13032 + }, + { + "epoch": 0.13033, + "grad_norm": 1.066738321773958, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13033 + }, + { + "epoch": 0.13034, + "grad_norm": 1.064298193459379, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 13034 + }, + { + "epoch": 0.13035, + "grad_norm": 0.8019257374761538, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13035 + }, + { + "epoch": 0.13036, + "grad_norm": 0.845083701172066, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 13036 + }, + { + "epoch": 0.13037, + "grad_norm": 0.8738824292069386, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 13037 + }, + { + "epoch": 0.13038, + "grad_norm": 0.861807617539825, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 13038 + }, + { + "epoch": 0.13039, + "grad_norm": 0.984947554147437, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 13039 + }, + { + "epoch": 0.1304, + "grad_norm": 1.0049046163421724, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 13040 + }, + { + "epoch": 0.13041, + "grad_norm": 1.0052885994190504, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 13041 + }, + { + "epoch": 0.13042, + "grad_norm": 0.9684938282207431, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 13042 + }, + { + "epoch": 0.13043, + "grad_norm": 0.9184514064066112, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 13043 + }, + { + "epoch": 0.13044, + "grad_norm": 0.8827308505679203, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 13044 + }, + { + "epoch": 0.13045, + "grad_norm": 0.7717593571762313, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 13045 + }, + { + "epoch": 0.13046, + "grad_norm": 0.8679082897595768, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 13046 + }, + { + "epoch": 0.13047, + "grad_norm": 0.8309256501054642, + "learning_rate": 0.003, + "loss": 4.085, + "step": 13047 + }, + { + "epoch": 0.13048, + "grad_norm": 0.8377738423423138, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 13048 + }, + { + "epoch": 0.13049, + "grad_norm": 0.7119157355427475, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 13049 + }, + { + "epoch": 0.1305, + "grad_norm": 0.6521149593515434, + "learning_rate": 0.003, + "loss": 4.045, + "step": 13050 + }, + { + "epoch": 0.13051, + "grad_norm": 0.6187860253881744, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 13051 + }, + { + "epoch": 0.13052, + "grad_norm": 0.721820839150363, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 13052 + }, + { + "epoch": 0.13053, + "grad_norm": 0.881763567897137, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 13053 + }, + { + "epoch": 0.13054, + "grad_norm": 0.9884496746256645, + "learning_rate": 0.003, + "loss": 4.093, + "step": 13054 + }, + { + "epoch": 0.13055, + "grad_norm": 1.145426405822364, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13055 + }, + { + "epoch": 0.13056, + "grad_norm": 0.9911722617006568, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13056 + }, + { + "epoch": 0.13057, + "grad_norm": 1.0815519581620563, + "learning_rate": 0.003, + "loss": 4.096, + "step": 13057 + }, + { + "epoch": 0.13058, + "grad_norm": 1.1161554753926237, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 13058 + }, + { + "epoch": 0.13059, + "grad_norm": 1.0353923070365907, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 13059 + }, + { + "epoch": 0.1306, + "grad_norm": 1.0491545620726093, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 13060 + }, + { + "epoch": 0.13061, + "grad_norm": 0.9176883461055413, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 13061 + }, + { + "epoch": 0.13062, + "grad_norm": 0.9654262517848932, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 13062 + }, + { + "epoch": 0.13063, + "grad_norm": 0.9733430114778608, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 13063 + }, + { + "epoch": 0.13064, + "grad_norm": 0.9159542932584501, + "learning_rate": 0.003, + "loss": 4.1171, + "step": 13064 + }, + { + "epoch": 0.13065, + "grad_norm": 0.9137437128996506, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 13065 + }, + { + "epoch": 0.13066, + "grad_norm": 0.9965392933215164, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 13066 + }, + { + "epoch": 0.13067, + "grad_norm": 1.0799242888190321, + "learning_rate": 0.003, + "loss": 4.1365, + "step": 13067 + }, + { + "epoch": 0.13068, + "grad_norm": 0.8908478738348087, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 13068 + }, + { + "epoch": 0.13069, + "grad_norm": 0.8505816538765397, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 13069 + }, + { + "epoch": 0.1307, + "grad_norm": 0.7138405962694484, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 13070 + }, + { + "epoch": 0.13071, + "grad_norm": 0.6344437386784378, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 13071 + }, + { + "epoch": 0.13072, + "grad_norm": 0.5760631033472555, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 13072 + }, + { + "epoch": 0.13073, + "grad_norm": 0.5379709264467936, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 13073 + }, + { + "epoch": 0.13074, + "grad_norm": 0.578691080934062, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13074 + }, + { + "epoch": 0.13075, + "grad_norm": 0.609909360116078, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 13075 + }, + { + "epoch": 0.13076, + "grad_norm": 0.6066412195555408, + "learning_rate": 0.003, + "loss": 4.079, + "step": 13076 + }, + { + "epoch": 0.13077, + "grad_norm": 0.6342014969921537, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 13077 + }, + { + "epoch": 0.13078, + "grad_norm": 0.7264191472838323, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 13078 + }, + { + "epoch": 0.13079, + "grad_norm": 0.6417007823997498, + "learning_rate": 0.003, + "loss": 4.038, + "step": 13079 + }, + { + "epoch": 0.1308, + "grad_norm": 0.6143304071375301, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 13080 + }, + { + "epoch": 0.13081, + "grad_norm": 0.6149109507383749, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13081 + }, + { + "epoch": 0.13082, + "grad_norm": 0.5095949171740436, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 13082 + }, + { + "epoch": 0.13083, + "grad_norm": 0.62434362937003, + "learning_rate": 0.003, + "loss": 4.054, + "step": 13083 + }, + { + "epoch": 0.13084, + "grad_norm": 0.8406162652846959, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 13084 + }, + { + "epoch": 0.13085, + "grad_norm": 1.2752997785470583, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 13085 + }, + { + "epoch": 0.13086, + "grad_norm": 0.8614518385858537, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 13086 + }, + { + "epoch": 0.13087, + "grad_norm": 0.6625540584171613, + "learning_rate": 0.003, + "loss": 4.051, + "step": 13087 + }, + { + "epoch": 0.13088, + "grad_norm": 0.6474896194729026, + "learning_rate": 0.003, + "loss": 4.072, + "step": 13088 + }, + { + "epoch": 0.13089, + "grad_norm": 0.7318139577371409, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 13089 + }, + { + "epoch": 0.1309, + "grad_norm": 0.8177182020866904, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 13090 + }, + { + "epoch": 0.13091, + "grad_norm": 0.7604785102669028, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 13091 + }, + { + "epoch": 0.13092, + "grad_norm": 0.6706188706026742, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13092 + }, + { + "epoch": 0.13093, + "grad_norm": 0.7083463370726262, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 13093 + }, + { + "epoch": 0.13094, + "grad_norm": 0.8246775184543369, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 13094 + }, + { + "epoch": 0.13095, + "grad_norm": 0.8959539616378489, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 13095 + }, + { + "epoch": 0.13096, + "grad_norm": 1.1426664720440958, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 13096 + }, + { + "epoch": 0.13097, + "grad_norm": 0.9314302334293945, + "learning_rate": 0.003, + "loss": 4.092, + "step": 13097 + }, + { + "epoch": 0.13098, + "grad_norm": 0.8097032024710651, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13098 + }, + { + "epoch": 0.13099, + "grad_norm": 0.9162781554443832, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 13099 + }, + { + "epoch": 0.131, + "grad_norm": 0.9343947726975026, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 13100 + }, + { + "epoch": 0.13101, + "grad_norm": 0.7942321020277784, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 13101 + }, + { + "epoch": 0.13102, + "grad_norm": 0.9429946702699512, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 13102 + }, + { + "epoch": 0.13103, + "grad_norm": 0.943910967721569, + "learning_rate": 0.003, + "loss": 4.095, + "step": 13103 + }, + { + "epoch": 0.13104, + "grad_norm": 0.99596788147382, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 13104 + }, + { + "epoch": 0.13105, + "grad_norm": 1.122752514585132, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 13105 + }, + { + "epoch": 0.13106, + "grad_norm": 0.9153266233651645, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 13106 + }, + { + "epoch": 0.13107, + "grad_norm": 0.8791341139465975, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 13107 + }, + { + "epoch": 0.13108, + "grad_norm": 0.8672799144422796, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13108 + }, + { + "epoch": 0.13109, + "grad_norm": 0.7203451203721, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 13109 + }, + { + "epoch": 0.1311, + "grad_norm": 0.6602568170161984, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13110 + }, + { + "epoch": 0.13111, + "grad_norm": 0.7527357657881135, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 13111 + }, + { + "epoch": 0.13112, + "grad_norm": 1.1102494497800772, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 13112 + }, + { + "epoch": 0.13113, + "grad_norm": 1.0245672109883843, + "learning_rate": 0.003, + "loss": 4.1131, + "step": 13113 + }, + { + "epoch": 0.13114, + "grad_norm": 1.0114187315722039, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 13114 + }, + { + "epoch": 0.13115, + "grad_norm": 1.1803051995778815, + "learning_rate": 0.003, + "loss": 4.1, + "step": 13115 + }, + { + "epoch": 0.13116, + "grad_norm": 0.7990502832944206, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 13116 + }, + { + "epoch": 0.13117, + "grad_norm": 0.7845891788833669, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 13117 + }, + { + "epoch": 0.13118, + "grad_norm": 0.895614630018304, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 13118 + }, + { + "epoch": 0.13119, + "grad_norm": 0.9073412439270881, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 13119 + }, + { + "epoch": 0.1312, + "grad_norm": 0.8443048639033641, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 13120 + }, + { + "epoch": 0.13121, + "grad_norm": 0.841496267158033, + "learning_rate": 0.003, + "loss": 4.073, + "step": 13121 + }, + { + "epoch": 0.13122, + "grad_norm": 0.9764370092772551, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 13122 + }, + { + "epoch": 0.13123, + "grad_norm": 0.9963997793009255, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 13123 + }, + { + "epoch": 0.13124, + "grad_norm": 0.9514972446333433, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 13124 + }, + { + "epoch": 0.13125, + "grad_norm": 0.8162910350678603, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 13125 + }, + { + "epoch": 0.13126, + "grad_norm": 0.8469684407231409, + "learning_rate": 0.003, + "loss": 4.087, + "step": 13126 + }, + { + "epoch": 0.13127, + "grad_norm": 0.9282328198645152, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 13127 + }, + { + "epoch": 0.13128, + "grad_norm": 1.0681435375555972, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 13128 + }, + { + "epoch": 0.13129, + "grad_norm": 1.0618651926043272, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 13129 + }, + { + "epoch": 0.1313, + "grad_norm": 0.9039759339251511, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 13130 + }, + { + "epoch": 0.13131, + "grad_norm": 0.8369774349939699, + "learning_rate": 0.003, + "loss": 4.09, + "step": 13131 + }, + { + "epoch": 0.13132, + "grad_norm": 0.745261133516445, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 13132 + }, + { + "epoch": 0.13133, + "grad_norm": 0.75806078975999, + "learning_rate": 0.003, + "loss": 4.1089, + "step": 13133 + }, + { + "epoch": 0.13134, + "grad_norm": 0.8801813665835895, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 13134 + }, + { + "epoch": 0.13135, + "grad_norm": 0.969489240654843, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 13135 + }, + { + "epoch": 0.13136, + "grad_norm": 1.1686724945289257, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 13136 + }, + { + "epoch": 0.13137, + "grad_norm": 0.9697056041396325, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 13137 + }, + { + "epoch": 0.13138, + "grad_norm": 1.0511729522703308, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 13138 + }, + { + "epoch": 0.13139, + "grad_norm": 0.9425962853681444, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 13139 + }, + { + "epoch": 0.1314, + "grad_norm": 0.856047400876727, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 13140 + }, + { + "epoch": 0.13141, + "grad_norm": 0.86737845061535, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 13141 + }, + { + "epoch": 0.13142, + "grad_norm": 0.7888503334575826, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 13142 + }, + { + "epoch": 0.13143, + "grad_norm": 0.7754088984735394, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 13143 + }, + { + "epoch": 0.13144, + "grad_norm": 0.6275481544428124, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 13144 + }, + { + "epoch": 0.13145, + "grad_norm": 0.5847896020574547, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 13145 + }, + { + "epoch": 0.13146, + "grad_norm": 0.5082445841466955, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 13146 + }, + { + "epoch": 0.13147, + "grad_norm": 0.5455089289145286, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 13147 + }, + { + "epoch": 0.13148, + "grad_norm": 0.5665523262959993, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 13148 + }, + { + "epoch": 0.13149, + "grad_norm": 0.7012564519777836, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 13149 + }, + { + "epoch": 0.1315, + "grad_norm": 0.9812983457873712, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 13150 + }, + { + "epoch": 0.13151, + "grad_norm": 1.3901380597883277, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 13151 + }, + { + "epoch": 0.13152, + "grad_norm": 0.49944231895829955, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 13152 + }, + { + "epoch": 0.13153, + "grad_norm": 0.8671324059203369, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 13153 + }, + { + "epoch": 0.13154, + "grad_norm": 1.1300493237968723, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 13154 + }, + { + "epoch": 0.13155, + "grad_norm": 0.7987944954842804, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 13155 + }, + { + "epoch": 0.13156, + "grad_norm": 0.6761337520479488, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 13156 + }, + { + "epoch": 0.13157, + "grad_norm": 0.6596813875344903, + "learning_rate": 0.003, + "loss": 4.1075, + "step": 13157 + }, + { + "epoch": 0.13158, + "grad_norm": 0.5263151854628152, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 13158 + }, + { + "epoch": 0.13159, + "grad_norm": 0.5402213954657354, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 13159 + }, + { + "epoch": 0.1316, + "grad_norm": 0.6789202818058099, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 13160 + }, + { + "epoch": 0.13161, + "grad_norm": 0.6385281448634009, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 13161 + }, + { + "epoch": 0.13162, + "grad_norm": 0.601959584583655, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 13162 + }, + { + "epoch": 0.13163, + "grad_norm": 0.6117058188699801, + "learning_rate": 0.003, + "loss": 4.104, + "step": 13163 + }, + { + "epoch": 0.13164, + "grad_norm": 0.725438513292309, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 13164 + }, + { + "epoch": 0.13165, + "grad_norm": 0.867140457954247, + "learning_rate": 0.003, + "loss": 4.077, + "step": 13165 + }, + { + "epoch": 0.13166, + "grad_norm": 1.006111367922786, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13166 + }, + { + "epoch": 0.13167, + "grad_norm": 1.187665915597781, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 13167 + }, + { + "epoch": 0.13168, + "grad_norm": 0.9338917443860894, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 13168 + }, + { + "epoch": 0.13169, + "grad_norm": 0.9536661443703157, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 13169 + }, + { + "epoch": 0.1317, + "grad_norm": 0.9511285463417876, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 13170 + }, + { + "epoch": 0.13171, + "grad_norm": 0.9164751396731089, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 13171 + }, + { + "epoch": 0.13172, + "grad_norm": 0.9934512041007826, + "learning_rate": 0.003, + "loss": 4.071, + "step": 13172 + }, + { + "epoch": 0.13173, + "grad_norm": 1.053517198022562, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 13173 + }, + { + "epoch": 0.13174, + "grad_norm": 1.0092202746747236, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 13174 + }, + { + "epoch": 0.13175, + "grad_norm": 1.2267742892670814, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 13175 + }, + { + "epoch": 0.13176, + "grad_norm": 0.9811606246202388, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13176 + }, + { + "epoch": 0.13177, + "grad_norm": 0.8342770824928509, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 13177 + }, + { + "epoch": 0.13178, + "grad_norm": 0.8419320414717685, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 13178 + }, + { + "epoch": 0.13179, + "grad_norm": 0.8048464154508961, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 13179 + }, + { + "epoch": 0.1318, + "grad_norm": 0.757893781766694, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13180 + }, + { + "epoch": 0.13181, + "grad_norm": 0.6878957941523194, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 13181 + }, + { + "epoch": 0.13182, + "grad_norm": 0.668821590616849, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 13182 + }, + { + "epoch": 0.13183, + "grad_norm": 0.9340885385093695, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 13183 + }, + { + "epoch": 0.13184, + "grad_norm": 1.1274754922823589, + "learning_rate": 0.003, + "loss": 4.085, + "step": 13184 + }, + { + "epoch": 0.13185, + "grad_norm": 0.9331556957925303, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 13185 + }, + { + "epoch": 0.13186, + "grad_norm": 0.8216673544501654, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 13186 + }, + { + "epoch": 0.13187, + "grad_norm": 0.7140573227719084, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 13187 + }, + { + "epoch": 0.13188, + "grad_norm": 0.6069945344851568, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 13188 + }, + { + "epoch": 0.13189, + "grad_norm": 0.6018208352122972, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 13189 + }, + { + "epoch": 0.1319, + "grad_norm": 0.563879967962961, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 13190 + }, + { + "epoch": 0.13191, + "grad_norm": 0.6599430768938669, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 13191 + }, + { + "epoch": 0.13192, + "grad_norm": 0.8184650611546166, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 13192 + }, + { + "epoch": 0.13193, + "grad_norm": 0.881523759877944, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 13193 + }, + { + "epoch": 0.13194, + "grad_norm": 0.9461369100877264, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 13194 + }, + { + "epoch": 0.13195, + "grad_norm": 1.153724315123479, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 13195 + }, + { + "epoch": 0.13196, + "grad_norm": 1.033498270848711, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 13196 + }, + { + "epoch": 0.13197, + "grad_norm": 0.9548637052073567, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13197 + }, + { + "epoch": 0.13198, + "grad_norm": 0.8518694067242146, + "learning_rate": 0.003, + "loss": 4.075, + "step": 13198 + }, + { + "epoch": 0.13199, + "grad_norm": 0.9343486281432397, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 13199 + }, + { + "epoch": 0.132, + "grad_norm": 0.9524390913324494, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 13200 + }, + { + "epoch": 0.13201, + "grad_norm": 0.913308437512326, + "learning_rate": 0.003, + "loss": 4.057, + "step": 13201 + }, + { + "epoch": 0.13202, + "grad_norm": 0.8916982587930954, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13202 + }, + { + "epoch": 0.13203, + "grad_norm": 0.8518831238121933, + "learning_rate": 0.003, + "loss": 4.089, + "step": 13203 + }, + { + "epoch": 0.13204, + "grad_norm": 0.87098341177306, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 13204 + }, + { + "epoch": 0.13205, + "grad_norm": 0.7907988168737735, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 13205 + }, + { + "epoch": 0.13206, + "grad_norm": 0.6765463916777932, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 13206 + }, + { + "epoch": 0.13207, + "grad_norm": 0.6624752649612393, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 13207 + }, + { + "epoch": 0.13208, + "grad_norm": 0.6076925646094622, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 13208 + }, + { + "epoch": 0.13209, + "grad_norm": 0.7220498401414271, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 13209 + }, + { + "epoch": 0.1321, + "grad_norm": 0.8643535197737705, + "learning_rate": 0.003, + "loss": 4.115, + "step": 13210 + }, + { + "epoch": 0.13211, + "grad_norm": 1.1158427207401402, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 13211 + }, + { + "epoch": 0.13212, + "grad_norm": 1.211885219651439, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 13212 + }, + { + "epoch": 0.13213, + "grad_norm": 0.9472241994569061, + "learning_rate": 0.003, + "loss": 4.099, + "step": 13213 + }, + { + "epoch": 0.13214, + "grad_norm": 0.9278604297079831, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 13214 + }, + { + "epoch": 0.13215, + "grad_norm": 1.051499165609013, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 13215 + }, + { + "epoch": 0.13216, + "grad_norm": 0.9914093115313234, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 13216 + }, + { + "epoch": 0.13217, + "grad_norm": 1.081604203808225, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 13217 + }, + { + "epoch": 0.13218, + "grad_norm": 0.9413066234986573, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 13218 + }, + { + "epoch": 0.13219, + "grad_norm": 0.8790787588417616, + "learning_rate": 0.003, + "loss": 4.139, + "step": 13219 + }, + { + "epoch": 0.1322, + "grad_norm": 0.88143566929359, + "learning_rate": 0.003, + "loss": 4.1247, + "step": 13220 + }, + { + "epoch": 0.13221, + "grad_norm": 0.7886865446284567, + "learning_rate": 0.003, + "loss": 4.09, + "step": 13221 + }, + { + "epoch": 0.13222, + "grad_norm": 0.8703866688049373, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 13222 + }, + { + "epoch": 0.13223, + "grad_norm": 1.0610641583952305, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 13223 + }, + { + "epoch": 0.13224, + "grad_norm": 0.8877308142908692, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 13224 + }, + { + "epoch": 0.13225, + "grad_norm": 0.7923502126794982, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 13225 + }, + { + "epoch": 0.13226, + "grad_norm": 0.8037426063643605, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 13226 + }, + { + "epoch": 0.13227, + "grad_norm": 0.8298313561068729, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 13227 + }, + { + "epoch": 0.13228, + "grad_norm": 0.8313912515009332, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 13228 + }, + { + "epoch": 0.13229, + "grad_norm": 0.7641603605460101, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 13229 + }, + { + "epoch": 0.1323, + "grad_norm": 0.7234519928662755, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 13230 + }, + { + "epoch": 0.13231, + "grad_norm": 0.7013429757158008, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 13231 + }, + { + "epoch": 0.13232, + "grad_norm": 0.7272956344706222, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 13232 + }, + { + "epoch": 0.13233, + "grad_norm": 0.7018840386890066, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 13233 + }, + { + "epoch": 0.13234, + "grad_norm": 0.6483590280546878, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 13234 + }, + { + "epoch": 0.13235, + "grad_norm": 0.5960213433595248, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13235 + }, + { + "epoch": 0.13236, + "grad_norm": 0.6362690194768988, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 13236 + }, + { + "epoch": 0.13237, + "grad_norm": 0.7016524575189277, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13237 + }, + { + "epoch": 0.13238, + "grad_norm": 0.8818072070251921, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 13238 + }, + { + "epoch": 0.13239, + "grad_norm": 1.1771151101524453, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 13239 + }, + { + "epoch": 0.1324, + "grad_norm": 0.7848927790510807, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 13240 + }, + { + "epoch": 0.13241, + "grad_norm": 0.7294357892230204, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 13241 + }, + { + "epoch": 0.13242, + "grad_norm": 0.7922721302088672, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 13242 + }, + { + "epoch": 0.13243, + "grad_norm": 0.8779073015391228, + "learning_rate": 0.003, + "loss": 4.065, + "step": 13243 + }, + { + "epoch": 0.13244, + "grad_norm": 0.9028766978779963, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 13244 + }, + { + "epoch": 0.13245, + "grad_norm": 0.8846836016770767, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 13245 + }, + { + "epoch": 0.13246, + "grad_norm": 0.9073640025981549, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 13246 + }, + { + "epoch": 0.13247, + "grad_norm": 1.0184152912935793, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 13247 + }, + { + "epoch": 0.13248, + "grad_norm": 1.1201902379000934, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 13248 + }, + { + "epoch": 0.13249, + "grad_norm": 0.8199137014662327, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 13249 + }, + { + "epoch": 0.1325, + "grad_norm": 0.6982941709290479, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13250 + }, + { + "epoch": 0.13251, + "grad_norm": 0.6265302384897115, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 13251 + }, + { + "epoch": 0.13252, + "grad_norm": 0.731938249235553, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13252 + }, + { + "epoch": 0.13253, + "grad_norm": 0.6513251744498714, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 13253 + }, + { + "epoch": 0.13254, + "grad_norm": 0.713972326989097, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 13254 + }, + { + "epoch": 0.13255, + "grad_norm": 0.728018823629036, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 13255 + }, + { + "epoch": 0.13256, + "grad_norm": 0.7843870536109315, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 13256 + }, + { + "epoch": 0.13257, + "grad_norm": 0.8055451385935462, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 13257 + }, + { + "epoch": 0.13258, + "grad_norm": 0.7562579848398723, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 13258 + }, + { + "epoch": 0.13259, + "grad_norm": 0.8669239597629805, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 13259 + }, + { + "epoch": 0.1326, + "grad_norm": 0.9138324374423776, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 13260 + }, + { + "epoch": 0.13261, + "grad_norm": 1.0301598630577251, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13261 + }, + { + "epoch": 0.13262, + "grad_norm": 1.0191619611685017, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 13262 + }, + { + "epoch": 0.13263, + "grad_norm": 1.190057684517433, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13263 + }, + { + "epoch": 0.13264, + "grad_norm": 0.8701662649204158, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13264 + }, + { + "epoch": 0.13265, + "grad_norm": 0.8490125306455042, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 13265 + }, + { + "epoch": 0.13266, + "grad_norm": 0.8607121453115351, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 13266 + }, + { + "epoch": 0.13267, + "grad_norm": 0.9724870173082061, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 13267 + }, + { + "epoch": 0.13268, + "grad_norm": 1.0567976063712055, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 13268 + }, + { + "epoch": 0.13269, + "grad_norm": 1.1196162465438617, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 13269 + }, + { + "epoch": 0.1327, + "grad_norm": 0.7152977888864376, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 13270 + }, + { + "epoch": 0.13271, + "grad_norm": 0.5952106093514813, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 13271 + }, + { + "epoch": 0.13272, + "grad_norm": 0.7761989531262784, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 13272 + }, + { + "epoch": 0.13273, + "grad_norm": 0.9037516366698034, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13273 + }, + { + "epoch": 0.13274, + "grad_norm": 1.1117710770327596, + "learning_rate": 0.003, + "loss": 4.1123, + "step": 13274 + }, + { + "epoch": 0.13275, + "grad_norm": 0.8593479491958381, + "learning_rate": 0.003, + "loss": 4.092, + "step": 13275 + }, + { + "epoch": 0.13276, + "grad_norm": 0.7862787517919937, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 13276 + }, + { + "epoch": 0.13277, + "grad_norm": 0.7995918193250531, + "learning_rate": 0.003, + "loss": 4.088, + "step": 13277 + }, + { + "epoch": 0.13278, + "grad_norm": 0.8826709339067829, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 13278 + }, + { + "epoch": 0.13279, + "grad_norm": 0.9899054575114248, + "learning_rate": 0.003, + "loss": 4.105, + "step": 13279 + }, + { + "epoch": 0.1328, + "grad_norm": 1.034194784788381, + "learning_rate": 0.003, + "loss": 4.063, + "step": 13280 + }, + { + "epoch": 0.13281, + "grad_norm": 0.9311590661929713, + "learning_rate": 0.003, + "loss": 4.088, + "step": 13281 + }, + { + "epoch": 0.13282, + "grad_norm": 0.9933747884472336, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 13282 + }, + { + "epoch": 0.13283, + "grad_norm": 0.9179990199647776, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 13283 + }, + { + "epoch": 0.13284, + "grad_norm": 0.9088146382820541, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 13284 + }, + { + "epoch": 0.13285, + "grad_norm": 0.841846082693096, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 13285 + }, + { + "epoch": 0.13286, + "grad_norm": 0.7560675081483034, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 13286 + }, + { + "epoch": 0.13287, + "grad_norm": 0.7787318757030279, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 13287 + }, + { + "epoch": 0.13288, + "grad_norm": 0.832811008255041, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 13288 + }, + { + "epoch": 0.13289, + "grad_norm": 0.9210899944282458, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 13289 + }, + { + "epoch": 0.1329, + "grad_norm": 1.0727385091488706, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 13290 + }, + { + "epoch": 0.13291, + "grad_norm": 1.0679495390104992, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 13291 + }, + { + "epoch": 0.13292, + "grad_norm": 0.8306751001853409, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 13292 + }, + { + "epoch": 0.13293, + "grad_norm": 0.7722162493002047, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 13293 + }, + { + "epoch": 0.13294, + "grad_norm": 0.8366701719709925, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13294 + }, + { + "epoch": 0.13295, + "grad_norm": 0.765906423235061, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 13295 + }, + { + "epoch": 0.13296, + "grad_norm": 0.7109085834456828, + "learning_rate": 0.003, + "loss": 4.113, + "step": 13296 + }, + { + "epoch": 0.13297, + "grad_norm": 0.6491217065070526, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 13297 + }, + { + "epoch": 0.13298, + "grad_norm": 0.6042714833628209, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 13298 + }, + { + "epoch": 0.13299, + "grad_norm": 0.583957760179777, + "learning_rate": 0.003, + "loss": 4.1199, + "step": 13299 + }, + { + "epoch": 0.133, + "grad_norm": 0.6678133310698912, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13300 + }, + { + "epoch": 0.13301, + "grad_norm": 0.7783720570215592, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 13301 + }, + { + "epoch": 0.13302, + "grad_norm": 0.7787634468990381, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 13302 + }, + { + "epoch": 0.13303, + "grad_norm": 0.7032459797829061, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 13303 + }, + { + "epoch": 0.13304, + "grad_norm": 0.8343625749860474, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 13304 + }, + { + "epoch": 0.13305, + "grad_norm": 0.9545945039442477, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 13305 + }, + { + "epoch": 0.13306, + "grad_norm": 1.0194914472166352, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13306 + }, + { + "epoch": 0.13307, + "grad_norm": 1.0114463071538164, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 13307 + }, + { + "epoch": 0.13308, + "grad_norm": 0.9032839932518736, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 13308 + }, + { + "epoch": 0.13309, + "grad_norm": 0.8370451164267789, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 13309 + }, + { + "epoch": 0.1331, + "grad_norm": 0.711444674343713, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 13310 + }, + { + "epoch": 0.13311, + "grad_norm": 0.6965854414550621, + "learning_rate": 0.003, + "loss": 4.051, + "step": 13311 + }, + { + "epoch": 0.13312, + "grad_norm": 0.7510502291152806, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 13312 + }, + { + "epoch": 0.13313, + "grad_norm": 0.7811519956382372, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 13313 + }, + { + "epoch": 0.13314, + "grad_norm": 0.9225126210582554, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 13314 + }, + { + "epoch": 0.13315, + "grad_norm": 1.019599447545897, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 13315 + }, + { + "epoch": 0.13316, + "grad_norm": 0.9755339455442736, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 13316 + }, + { + "epoch": 0.13317, + "grad_norm": 1.0768744991340515, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 13317 + }, + { + "epoch": 0.13318, + "grad_norm": 0.9731692669405603, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 13318 + }, + { + "epoch": 0.13319, + "grad_norm": 0.9778596312954543, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 13319 + }, + { + "epoch": 0.1332, + "grad_norm": 0.8251283506942848, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 13320 + }, + { + "epoch": 0.13321, + "grad_norm": 0.7626372142889066, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 13321 + }, + { + "epoch": 0.13322, + "grad_norm": 0.6777162098362033, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 13322 + }, + { + "epoch": 0.13323, + "grad_norm": 0.7758716723902352, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 13323 + }, + { + "epoch": 0.13324, + "grad_norm": 1.027747368040209, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 13324 + }, + { + "epoch": 0.13325, + "grad_norm": 1.0318317724565758, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 13325 + }, + { + "epoch": 0.13326, + "grad_norm": 0.9787207770477119, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 13326 + }, + { + "epoch": 0.13327, + "grad_norm": 1.1574350505554019, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 13327 + }, + { + "epoch": 0.13328, + "grad_norm": 0.8247833451234012, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13328 + }, + { + "epoch": 0.13329, + "grad_norm": 0.7027475649475333, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 13329 + }, + { + "epoch": 0.1333, + "grad_norm": 0.6976677720156358, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 13330 + }, + { + "epoch": 0.13331, + "grad_norm": 0.6676022161383816, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 13331 + }, + { + "epoch": 0.13332, + "grad_norm": 0.7003501667646581, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 13332 + }, + { + "epoch": 0.13333, + "grad_norm": 0.731020837366609, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 13333 + }, + { + "epoch": 0.13334, + "grad_norm": 0.8051657766869936, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 13334 + }, + { + "epoch": 0.13335, + "grad_norm": 0.7729201067377107, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 13335 + }, + { + "epoch": 0.13336, + "grad_norm": 0.8503074795049055, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 13336 + }, + { + "epoch": 0.13337, + "grad_norm": 0.8750344965338823, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 13337 + }, + { + "epoch": 0.13338, + "grad_norm": 0.8004965448174175, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 13338 + }, + { + "epoch": 0.13339, + "grad_norm": 0.8382574319131801, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 13339 + }, + { + "epoch": 0.1334, + "grad_norm": 0.8946647901187211, + "learning_rate": 0.003, + "loss": 4.066, + "step": 13340 + }, + { + "epoch": 0.13341, + "grad_norm": 0.8920149557750658, + "learning_rate": 0.003, + "loss": 4.1186, + "step": 13341 + }, + { + "epoch": 0.13342, + "grad_norm": 0.9319479396827326, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 13342 + }, + { + "epoch": 0.13343, + "grad_norm": 1.1367942821984693, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 13343 + }, + { + "epoch": 0.13344, + "grad_norm": 1.2276221027203078, + "learning_rate": 0.003, + "loss": 4.1238, + "step": 13344 + }, + { + "epoch": 0.13345, + "grad_norm": 0.7607096301855646, + "learning_rate": 0.003, + "loss": 4.1175, + "step": 13345 + }, + { + "epoch": 0.13346, + "grad_norm": 0.6825358870284136, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13346 + }, + { + "epoch": 0.13347, + "grad_norm": 0.7571488375875286, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 13347 + }, + { + "epoch": 0.13348, + "grad_norm": 0.7154570176565842, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13348 + }, + { + "epoch": 0.13349, + "grad_norm": 0.7812634900646294, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 13349 + }, + { + "epoch": 0.1335, + "grad_norm": 0.8282451356725966, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 13350 + }, + { + "epoch": 0.13351, + "grad_norm": 0.8979240341739255, + "learning_rate": 0.003, + "loss": 4.085, + "step": 13351 + }, + { + "epoch": 0.13352, + "grad_norm": 0.9390285109647729, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 13352 + }, + { + "epoch": 0.13353, + "grad_norm": 0.9202136357231417, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 13353 + }, + { + "epoch": 0.13354, + "grad_norm": 0.9058592985531205, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 13354 + }, + { + "epoch": 0.13355, + "grad_norm": 0.8557205833569185, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 13355 + }, + { + "epoch": 0.13356, + "grad_norm": 0.7528300927078995, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 13356 + }, + { + "epoch": 0.13357, + "grad_norm": 0.662604417061924, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 13357 + }, + { + "epoch": 0.13358, + "grad_norm": 0.7000789974285319, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 13358 + }, + { + "epoch": 0.13359, + "grad_norm": 0.7622887648402079, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13359 + }, + { + "epoch": 0.1336, + "grad_norm": 0.817243636191248, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13360 + }, + { + "epoch": 0.13361, + "grad_norm": 0.8562119574046776, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 13361 + }, + { + "epoch": 0.13362, + "grad_norm": 1.010234313955433, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 13362 + }, + { + "epoch": 0.13363, + "grad_norm": 1.1214538024604714, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 13363 + }, + { + "epoch": 0.13364, + "grad_norm": 0.9007334539006093, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 13364 + }, + { + "epoch": 0.13365, + "grad_norm": 0.782430415406902, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13365 + }, + { + "epoch": 0.13366, + "grad_norm": 0.6656368480111795, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13366 + }, + { + "epoch": 0.13367, + "grad_norm": 0.7581005396936631, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 13367 + }, + { + "epoch": 0.13368, + "grad_norm": 0.8526986605439665, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 13368 + }, + { + "epoch": 0.13369, + "grad_norm": 1.07039059656977, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 13369 + }, + { + "epoch": 0.1337, + "grad_norm": 1.167795834790775, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 13370 + }, + { + "epoch": 0.13371, + "grad_norm": 0.7673312662291831, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13371 + }, + { + "epoch": 0.13372, + "grad_norm": 0.7018354684911177, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 13372 + }, + { + "epoch": 0.13373, + "grad_norm": 0.6618742873530599, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 13373 + }, + { + "epoch": 0.13374, + "grad_norm": 0.7461046736923796, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 13374 + }, + { + "epoch": 0.13375, + "grad_norm": 0.8093240859237089, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 13375 + }, + { + "epoch": 0.13376, + "grad_norm": 0.8188481304026195, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13376 + }, + { + "epoch": 0.13377, + "grad_norm": 0.8843136314523814, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 13377 + }, + { + "epoch": 0.13378, + "grad_norm": 0.9555559955370252, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 13378 + }, + { + "epoch": 0.13379, + "grad_norm": 0.997572197642538, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 13379 + }, + { + "epoch": 0.1338, + "grad_norm": 1.0855199724802314, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13380 + }, + { + "epoch": 0.13381, + "grad_norm": 0.9330965514895911, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 13381 + }, + { + "epoch": 0.13382, + "grad_norm": 0.993615014824088, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 13382 + }, + { + "epoch": 0.13383, + "grad_norm": 1.06876028029054, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 13383 + }, + { + "epoch": 0.13384, + "grad_norm": 0.9544246482417755, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 13384 + }, + { + "epoch": 0.13385, + "grad_norm": 0.9293864018413087, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 13385 + }, + { + "epoch": 0.13386, + "grad_norm": 0.96226321750807, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 13386 + }, + { + "epoch": 0.13387, + "grad_norm": 0.946212009523684, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13387 + }, + { + "epoch": 0.13388, + "grad_norm": 0.806514044512673, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 13388 + }, + { + "epoch": 0.13389, + "grad_norm": 0.8000459375315871, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 13389 + }, + { + "epoch": 0.1339, + "grad_norm": 0.7366439724037857, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 13390 + }, + { + "epoch": 0.13391, + "grad_norm": 0.8181400796841161, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 13391 + }, + { + "epoch": 0.13392, + "grad_norm": 0.999696214534662, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13392 + }, + { + "epoch": 0.13393, + "grad_norm": 1.2717687360989915, + "learning_rate": 0.003, + "loss": 4.0983, + "step": 13393 + }, + { + "epoch": 0.13394, + "grad_norm": 0.6855942854787728, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 13394 + }, + { + "epoch": 0.13395, + "grad_norm": 0.6210133008885812, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 13395 + }, + { + "epoch": 0.13396, + "grad_norm": 0.742261110987861, + "learning_rate": 0.003, + "loss": 4.062, + "step": 13396 + }, + { + "epoch": 0.13397, + "grad_norm": 0.7906941965969082, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 13397 + }, + { + "epoch": 0.13398, + "grad_norm": 0.8269568020833963, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 13398 + }, + { + "epoch": 0.13399, + "grad_norm": 0.8887188100677542, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 13399 + }, + { + "epoch": 0.134, + "grad_norm": 0.9132823458032262, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 13400 + }, + { + "epoch": 0.13401, + "grad_norm": 0.8876835827299703, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 13401 + }, + { + "epoch": 0.13402, + "grad_norm": 0.9344253021723582, + "learning_rate": 0.003, + "loss": 4.033, + "step": 13402 + }, + { + "epoch": 0.13403, + "grad_norm": 1.1870397445205836, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 13403 + }, + { + "epoch": 0.13404, + "grad_norm": 0.8520339969930502, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 13404 + }, + { + "epoch": 0.13405, + "grad_norm": 0.7530756929638318, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 13405 + }, + { + "epoch": 0.13406, + "grad_norm": 0.7291047875714687, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 13406 + }, + { + "epoch": 0.13407, + "grad_norm": 0.8192456380386125, + "learning_rate": 0.003, + "loss": 4.083, + "step": 13407 + }, + { + "epoch": 0.13408, + "grad_norm": 1.0734420527156827, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13408 + }, + { + "epoch": 0.13409, + "grad_norm": 1.1175490283853078, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 13409 + }, + { + "epoch": 0.1341, + "grad_norm": 0.7573190993520491, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 13410 + }, + { + "epoch": 0.13411, + "grad_norm": 0.5935748761477859, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 13411 + }, + { + "epoch": 0.13412, + "grad_norm": 0.6489974564660388, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 13412 + }, + { + "epoch": 0.13413, + "grad_norm": 0.7229439519999141, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 13413 + }, + { + "epoch": 0.13414, + "grad_norm": 0.7689091743956452, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 13414 + }, + { + "epoch": 0.13415, + "grad_norm": 0.7754417224407258, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 13415 + }, + { + "epoch": 0.13416, + "grad_norm": 0.6745797484626426, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 13416 + }, + { + "epoch": 0.13417, + "grad_norm": 0.8217901459147499, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 13417 + }, + { + "epoch": 0.13418, + "grad_norm": 0.9182152387918147, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 13418 + }, + { + "epoch": 0.13419, + "grad_norm": 0.9469773874113262, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 13419 + }, + { + "epoch": 0.1342, + "grad_norm": 1.1086457175612325, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13420 + }, + { + "epoch": 0.13421, + "grad_norm": 0.9384061110337768, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 13421 + }, + { + "epoch": 0.13422, + "grad_norm": 0.954951689668296, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 13422 + }, + { + "epoch": 0.13423, + "grad_norm": 1.052374563535259, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 13423 + }, + { + "epoch": 0.13424, + "grad_norm": 0.9805125849208332, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 13424 + }, + { + "epoch": 0.13425, + "grad_norm": 0.8483129123086609, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 13425 + }, + { + "epoch": 0.13426, + "grad_norm": 0.7966598067261172, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 13426 + }, + { + "epoch": 0.13427, + "grad_norm": 0.8762148637542061, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 13427 + }, + { + "epoch": 0.13428, + "grad_norm": 0.9094155326975562, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 13428 + }, + { + "epoch": 0.13429, + "grad_norm": 0.9663884563079317, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 13429 + }, + { + "epoch": 0.1343, + "grad_norm": 1.1211345291455057, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 13430 + }, + { + "epoch": 0.13431, + "grad_norm": 1.0140114717752684, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 13431 + }, + { + "epoch": 0.13432, + "grad_norm": 0.8217997658187381, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 13432 + }, + { + "epoch": 0.13433, + "grad_norm": 0.7248351378919788, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 13433 + }, + { + "epoch": 0.13434, + "grad_norm": 0.6280610647430119, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 13434 + }, + { + "epoch": 0.13435, + "grad_norm": 0.6075072218051248, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 13435 + }, + { + "epoch": 0.13436, + "grad_norm": 0.6456207752877368, + "learning_rate": 0.003, + "loss": 4.088, + "step": 13436 + }, + { + "epoch": 0.13437, + "grad_norm": 0.6145605351226269, + "learning_rate": 0.003, + "loss": 4.062, + "step": 13437 + }, + { + "epoch": 0.13438, + "grad_norm": 0.5517181867891378, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 13438 + }, + { + "epoch": 0.13439, + "grad_norm": 0.6578844697806577, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 13439 + }, + { + "epoch": 0.1344, + "grad_norm": 0.6665378345671775, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 13440 + }, + { + "epoch": 0.13441, + "grad_norm": 0.6418518042646504, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 13441 + }, + { + "epoch": 0.13442, + "grad_norm": 0.6843940993097485, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 13442 + }, + { + "epoch": 0.13443, + "grad_norm": 0.8707193985221511, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 13443 + }, + { + "epoch": 0.13444, + "grad_norm": 0.9847548524745664, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 13444 + }, + { + "epoch": 0.13445, + "grad_norm": 1.128845903324881, + "learning_rate": 0.003, + "loss": 4.074, + "step": 13445 + }, + { + "epoch": 0.13446, + "grad_norm": 1.2068093097783414, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 13446 + }, + { + "epoch": 0.13447, + "grad_norm": 0.7287091593638473, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 13447 + }, + { + "epoch": 0.13448, + "grad_norm": 0.5785864530858607, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 13448 + }, + { + "epoch": 0.13449, + "grad_norm": 0.578733738934119, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 13449 + }, + { + "epoch": 0.1345, + "grad_norm": 0.5265843405909042, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 13450 + }, + { + "epoch": 0.13451, + "grad_norm": 0.5563921477003801, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 13451 + }, + { + "epoch": 0.13452, + "grad_norm": 0.6096950372974864, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13452 + }, + { + "epoch": 0.13453, + "grad_norm": 0.7508132198296539, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13453 + }, + { + "epoch": 0.13454, + "grad_norm": 0.7993193243181995, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13454 + }, + { + "epoch": 0.13455, + "grad_norm": 0.7682487969894645, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13455 + }, + { + "epoch": 0.13456, + "grad_norm": 0.8796777200899208, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13456 + }, + { + "epoch": 0.13457, + "grad_norm": 1.1469397610089969, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 13457 + }, + { + "epoch": 0.13458, + "grad_norm": 1.146202488625937, + "learning_rate": 0.003, + "loss": 4.101, + "step": 13458 + }, + { + "epoch": 0.13459, + "grad_norm": 0.8963160468442611, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 13459 + }, + { + "epoch": 0.1346, + "grad_norm": 0.8324177540997719, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 13460 + }, + { + "epoch": 0.13461, + "grad_norm": 0.8768079654245492, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 13461 + }, + { + "epoch": 0.13462, + "grad_norm": 0.8814361611365389, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13462 + }, + { + "epoch": 0.13463, + "grad_norm": 0.8304217889455825, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 13463 + }, + { + "epoch": 0.13464, + "grad_norm": 0.7849247804892645, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 13464 + }, + { + "epoch": 0.13465, + "grad_norm": 1.0067140334691227, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 13465 + }, + { + "epoch": 0.13466, + "grad_norm": 1.0827853047168359, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13466 + }, + { + "epoch": 0.13467, + "grad_norm": 0.9863994122771002, + "learning_rate": 0.003, + "loss": 4.109, + "step": 13467 + }, + { + "epoch": 0.13468, + "grad_norm": 1.1519198315158714, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 13468 + }, + { + "epoch": 0.13469, + "grad_norm": 0.9094830470535434, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 13469 + }, + { + "epoch": 0.1347, + "grad_norm": 0.9615241547488029, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 13470 + }, + { + "epoch": 0.13471, + "grad_norm": 1.0026615449923861, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13471 + }, + { + "epoch": 0.13472, + "grad_norm": 0.8963248151652958, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 13472 + }, + { + "epoch": 0.13473, + "grad_norm": 0.8388906368889392, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 13473 + }, + { + "epoch": 0.13474, + "grad_norm": 0.8659010536701638, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 13474 + }, + { + "epoch": 0.13475, + "grad_norm": 1.1420323194450308, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 13475 + }, + { + "epoch": 0.13476, + "grad_norm": 1.1189402465759628, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 13476 + }, + { + "epoch": 0.13477, + "grad_norm": 0.9838216383823422, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 13477 + }, + { + "epoch": 0.13478, + "grad_norm": 0.9836383431393855, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 13478 + }, + { + "epoch": 0.13479, + "grad_norm": 0.9535032512085219, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 13479 + }, + { + "epoch": 0.1348, + "grad_norm": 1.0082703455759217, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 13480 + }, + { + "epoch": 0.13481, + "grad_norm": 0.9423372318082879, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 13481 + }, + { + "epoch": 0.13482, + "grad_norm": 0.737918943866043, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 13482 + }, + { + "epoch": 0.13483, + "grad_norm": 0.696892101834753, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 13483 + }, + { + "epoch": 0.13484, + "grad_norm": 0.6846874888722756, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 13484 + }, + { + "epoch": 0.13485, + "grad_norm": 0.7041497552775802, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 13485 + }, + { + "epoch": 0.13486, + "grad_norm": 0.7623820637051687, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 13486 + }, + { + "epoch": 0.13487, + "grad_norm": 0.8490472011425173, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13487 + }, + { + "epoch": 0.13488, + "grad_norm": 0.9612767459994789, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 13488 + }, + { + "epoch": 0.13489, + "grad_norm": 0.9657936696913836, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13489 + }, + { + "epoch": 0.1349, + "grad_norm": 0.9130234524107858, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 13490 + }, + { + "epoch": 0.13491, + "grad_norm": 0.8790648583629521, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13491 + }, + { + "epoch": 0.13492, + "grad_norm": 0.9532097034274005, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 13492 + }, + { + "epoch": 0.13493, + "grad_norm": 1.0608055718960732, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 13493 + }, + { + "epoch": 0.13494, + "grad_norm": 1.3057994901362042, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 13494 + }, + { + "epoch": 0.13495, + "grad_norm": 0.7033861599216819, + "learning_rate": 0.003, + "loss": 4.1118, + "step": 13495 + }, + { + "epoch": 0.13496, + "grad_norm": 0.6617324852095671, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 13496 + }, + { + "epoch": 0.13497, + "grad_norm": 0.7023942775756882, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 13497 + }, + { + "epoch": 0.13498, + "grad_norm": 0.669109474440314, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 13498 + }, + { + "epoch": 0.13499, + "grad_norm": 0.6316320567905482, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 13499 + }, + { + "epoch": 0.135, + "grad_norm": 0.6942712253269201, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13500 + }, + { + "epoch": 0.13501, + "grad_norm": 0.803069854384175, + "learning_rate": 0.003, + "loss": 4.092, + "step": 13501 + }, + { + "epoch": 0.13502, + "grad_norm": 0.869373680749491, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 13502 + }, + { + "epoch": 0.13503, + "grad_norm": 0.9541104536726139, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 13503 + }, + { + "epoch": 0.13504, + "grad_norm": 1.0188968412109567, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 13504 + }, + { + "epoch": 0.13505, + "grad_norm": 1.0129334158742416, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 13505 + }, + { + "epoch": 0.13506, + "grad_norm": 0.9867883103916336, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 13506 + }, + { + "epoch": 0.13507, + "grad_norm": 0.8006738248554235, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 13507 + }, + { + "epoch": 0.13508, + "grad_norm": 0.756787490611719, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 13508 + }, + { + "epoch": 0.13509, + "grad_norm": 0.7731031670930801, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 13509 + }, + { + "epoch": 0.1351, + "grad_norm": 0.8686810584219077, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 13510 + }, + { + "epoch": 0.13511, + "grad_norm": 0.8508271823239518, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 13511 + }, + { + "epoch": 0.13512, + "grad_norm": 0.8478064096923771, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 13512 + }, + { + "epoch": 0.13513, + "grad_norm": 0.7839227854594315, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13513 + }, + { + "epoch": 0.13514, + "grad_norm": 0.7301078450648706, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 13514 + }, + { + "epoch": 0.13515, + "grad_norm": 0.7336051058766169, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 13515 + }, + { + "epoch": 0.13516, + "grad_norm": 0.7245613565259936, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 13516 + }, + { + "epoch": 0.13517, + "grad_norm": 0.8211092942650285, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 13517 + }, + { + "epoch": 0.13518, + "grad_norm": 0.7879538257024276, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 13518 + }, + { + "epoch": 0.13519, + "grad_norm": 0.9148074094319852, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 13519 + }, + { + "epoch": 0.1352, + "grad_norm": 1.0521107275543449, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13520 + }, + { + "epoch": 0.13521, + "grad_norm": 1.1302849728490965, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 13521 + }, + { + "epoch": 0.13522, + "grad_norm": 0.9697364533787071, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 13522 + }, + { + "epoch": 0.13523, + "grad_norm": 1.0552404851435344, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 13523 + }, + { + "epoch": 0.13524, + "grad_norm": 0.9744665347810808, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13524 + }, + { + "epoch": 0.13525, + "grad_norm": 0.9272379642376642, + "learning_rate": 0.003, + "loss": 4.082, + "step": 13525 + }, + { + "epoch": 0.13526, + "grad_norm": 0.9557207161887343, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 13526 + }, + { + "epoch": 0.13527, + "grad_norm": 0.9293390684670042, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 13527 + }, + { + "epoch": 0.13528, + "grad_norm": 1.0052828200881132, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 13528 + }, + { + "epoch": 0.13529, + "grad_norm": 1.0371311360910564, + "learning_rate": 0.003, + "loss": 4.1158, + "step": 13529 + }, + { + "epoch": 0.1353, + "grad_norm": 1.0126977222126474, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 13530 + }, + { + "epoch": 0.13531, + "grad_norm": 0.7933620857620177, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 13531 + }, + { + "epoch": 0.13532, + "grad_norm": 0.7219190481703145, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 13532 + }, + { + "epoch": 0.13533, + "grad_norm": 0.7990634460705841, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 13533 + }, + { + "epoch": 0.13534, + "grad_norm": 0.9732672531781391, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 13534 + }, + { + "epoch": 0.13535, + "grad_norm": 1.1699596888991803, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 13535 + }, + { + "epoch": 0.13536, + "grad_norm": 0.8445467016318637, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13536 + }, + { + "epoch": 0.13537, + "grad_norm": 0.756263504466685, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 13537 + }, + { + "epoch": 0.13538, + "grad_norm": 0.7757852409412948, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 13538 + }, + { + "epoch": 0.13539, + "grad_norm": 0.7805717985227083, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 13539 + }, + { + "epoch": 0.1354, + "grad_norm": 0.7861378914606122, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 13540 + }, + { + "epoch": 0.13541, + "grad_norm": 0.7840620668567959, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 13541 + }, + { + "epoch": 0.13542, + "grad_norm": 0.7729986654803618, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 13542 + }, + { + "epoch": 0.13543, + "grad_norm": 0.659090066702033, + "learning_rate": 0.003, + "loss": 4.06, + "step": 13543 + }, + { + "epoch": 0.13544, + "grad_norm": 0.703870682999343, + "learning_rate": 0.003, + "loss": 4.078, + "step": 13544 + }, + { + "epoch": 0.13545, + "grad_norm": 0.6759376770950815, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 13545 + }, + { + "epoch": 0.13546, + "grad_norm": 0.7681361437877832, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 13546 + }, + { + "epoch": 0.13547, + "grad_norm": 0.7985128232105765, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 13547 + }, + { + "epoch": 0.13548, + "grad_norm": 0.7456046344594788, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 13548 + }, + { + "epoch": 0.13549, + "grad_norm": 0.8366654808188815, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 13549 + }, + { + "epoch": 0.1355, + "grad_norm": 1.0109224036728328, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 13550 + }, + { + "epoch": 0.13551, + "grad_norm": 1.109188580324219, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 13551 + }, + { + "epoch": 0.13552, + "grad_norm": 0.9710169157720988, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 13552 + }, + { + "epoch": 0.13553, + "grad_norm": 0.8362989825847552, + "learning_rate": 0.003, + "loss": 4.1086, + "step": 13553 + }, + { + "epoch": 0.13554, + "grad_norm": 0.7901306253623673, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13554 + }, + { + "epoch": 0.13555, + "grad_norm": 0.8240789603855017, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 13555 + }, + { + "epoch": 0.13556, + "grad_norm": 0.8048037134669658, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 13556 + }, + { + "epoch": 0.13557, + "grad_norm": 0.8028701465113719, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 13557 + }, + { + "epoch": 0.13558, + "grad_norm": 0.833805799784846, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 13558 + }, + { + "epoch": 0.13559, + "grad_norm": 0.9000878822488261, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 13559 + }, + { + "epoch": 0.1356, + "grad_norm": 0.9092498325799294, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 13560 + }, + { + "epoch": 0.13561, + "grad_norm": 1.089421266137573, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 13561 + }, + { + "epoch": 0.13562, + "grad_norm": 1.1912402355735798, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 13562 + }, + { + "epoch": 0.13563, + "grad_norm": 0.8159061983358631, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 13563 + }, + { + "epoch": 0.13564, + "grad_norm": 0.6140979932194808, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 13564 + }, + { + "epoch": 0.13565, + "grad_norm": 0.5625859668192599, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 13565 + }, + { + "epoch": 0.13566, + "grad_norm": 0.6050358428049795, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 13566 + }, + { + "epoch": 0.13567, + "grad_norm": 0.6868758702178857, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 13567 + }, + { + "epoch": 0.13568, + "grad_norm": 0.7205192721231606, + "learning_rate": 0.003, + "loss": 4.063, + "step": 13568 + }, + { + "epoch": 0.13569, + "grad_norm": 0.7278685945933725, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 13569 + }, + { + "epoch": 0.1357, + "grad_norm": 0.6764390908788951, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13570 + }, + { + "epoch": 0.13571, + "grad_norm": 0.6973655855305073, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 13571 + }, + { + "epoch": 0.13572, + "grad_norm": 0.7012650191061759, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 13572 + }, + { + "epoch": 0.13573, + "grad_norm": 0.8050491163634, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 13573 + }, + { + "epoch": 0.13574, + "grad_norm": 0.9743995420341659, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 13574 + }, + { + "epoch": 0.13575, + "grad_norm": 1.2222001451313191, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 13575 + }, + { + "epoch": 0.13576, + "grad_norm": 0.7012402180003754, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 13576 + }, + { + "epoch": 0.13577, + "grad_norm": 0.753848322169635, + "learning_rate": 0.003, + "loss": 4.077, + "step": 13577 + }, + { + "epoch": 0.13578, + "grad_norm": 1.0667244804526879, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 13578 + }, + { + "epoch": 0.13579, + "grad_norm": 0.9979644252700127, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 13579 + }, + { + "epoch": 0.1358, + "grad_norm": 0.8636628689836429, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 13580 + }, + { + "epoch": 0.13581, + "grad_norm": 0.8122661263515126, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 13581 + }, + { + "epoch": 0.13582, + "grad_norm": 0.7823422529780943, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 13582 + }, + { + "epoch": 0.13583, + "grad_norm": 0.8973264048047405, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 13583 + }, + { + "epoch": 0.13584, + "grad_norm": 1.0198634801706925, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 13584 + }, + { + "epoch": 0.13585, + "grad_norm": 0.9758609896953625, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 13585 + }, + { + "epoch": 0.13586, + "grad_norm": 0.9574803853337032, + "learning_rate": 0.003, + "loss": 4.1225, + "step": 13586 + }, + { + "epoch": 0.13587, + "grad_norm": 1.0153232515470276, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 13587 + }, + { + "epoch": 0.13588, + "grad_norm": 1.1243945262891535, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 13588 + }, + { + "epoch": 0.13589, + "grad_norm": 1.0555024684934773, + "learning_rate": 0.003, + "loss": 4.092, + "step": 13589 + }, + { + "epoch": 0.1359, + "grad_norm": 0.9386569712532187, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 13590 + }, + { + "epoch": 0.13591, + "grad_norm": 0.9423432065056779, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 13591 + }, + { + "epoch": 0.13592, + "grad_norm": 0.9570236944619993, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 13592 + }, + { + "epoch": 0.13593, + "grad_norm": 0.9994032976286987, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 13593 + }, + { + "epoch": 0.13594, + "grad_norm": 1.1293489136713144, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 13594 + }, + { + "epoch": 0.13595, + "grad_norm": 0.8770596374307061, + "learning_rate": 0.003, + "loss": 4.089, + "step": 13595 + }, + { + "epoch": 0.13596, + "grad_norm": 0.9540307039164383, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 13596 + }, + { + "epoch": 0.13597, + "grad_norm": 1.0348464305881202, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 13597 + }, + { + "epoch": 0.13598, + "grad_norm": 0.7952445402465652, + "learning_rate": 0.003, + "loss": 4.1214, + "step": 13598 + }, + { + "epoch": 0.13599, + "grad_norm": 0.838767728060577, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 13599 + }, + { + "epoch": 0.136, + "grad_norm": 0.7859214529580825, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 13600 + }, + { + "epoch": 0.13601, + "grad_norm": 0.8664034168727403, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 13601 + }, + { + "epoch": 0.13602, + "grad_norm": 1.05749460632109, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 13602 + }, + { + "epoch": 0.13603, + "grad_norm": 0.9115800757473246, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 13603 + }, + { + "epoch": 0.13604, + "grad_norm": 1.051125147838006, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 13604 + }, + { + "epoch": 0.13605, + "grad_norm": 0.9750073294288395, + "learning_rate": 0.003, + "loss": 4.038, + "step": 13605 + }, + { + "epoch": 0.13606, + "grad_norm": 0.8980573196185354, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 13606 + }, + { + "epoch": 0.13607, + "grad_norm": 0.8747864491235297, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 13607 + }, + { + "epoch": 0.13608, + "grad_norm": 0.9574554287453303, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 13608 + }, + { + "epoch": 0.13609, + "grad_norm": 0.9258162718847411, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 13609 + }, + { + "epoch": 0.1361, + "grad_norm": 0.7573297562448439, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 13610 + }, + { + "epoch": 0.13611, + "grad_norm": 0.6582701023818656, + "learning_rate": 0.003, + "loss": 4.066, + "step": 13611 + }, + { + "epoch": 0.13612, + "grad_norm": 0.6137564095547992, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 13612 + }, + { + "epoch": 0.13613, + "grad_norm": 0.6906795147923297, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 13613 + }, + { + "epoch": 0.13614, + "grad_norm": 0.7317210056279667, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 13614 + }, + { + "epoch": 0.13615, + "grad_norm": 0.7973083599241934, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13615 + }, + { + "epoch": 0.13616, + "grad_norm": 0.8122996454877814, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 13616 + }, + { + "epoch": 0.13617, + "grad_norm": 0.6800092785282158, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13617 + }, + { + "epoch": 0.13618, + "grad_norm": 0.7395444303121143, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 13618 + }, + { + "epoch": 0.13619, + "grad_norm": 0.8601458402892523, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13619 + }, + { + "epoch": 0.1362, + "grad_norm": 0.7866587914691752, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 13620 + }, + { + "epoch": 0.13621, + "grad_norm": 0.6894813065274821, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 13621 + }, + { + "epoch": 0.13622, + "grad_norm": 0.7751245638506891, + "learning_rate": 0.003, + "loss": 4.069, + "step": 13622 + }, + { + "epoch": 0.13623, + "grad_norm": 0.7811576890318376, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13623 + }, + { + "epoch": 0.13624, + "grad_norm": 1.0791880392784912, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 13624 + }, + { + "epoch": 0.13625, + "grad_norm": 1.1661365389411822, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13625 + }, + { + "epoch": 0.13626, + "grad_norm": 0.8310605669671869, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13626 + }, + { + "epoch": 0.13627, + "grad_norm": 0.8116350463802214, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13627 + }, + { + "epoch": 0.13628, + "grad_norm": 0.732431590284875, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 13628 + }, + { + "epoch": 0.13629, + "grad_norm": 0.7708504377596485, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 13629 + }, + { + "epoch": 0.1363, + "grad_norm": 0.8370934469455775, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 13630 + }, + { + "epoch": 0.13631, + "grad_norm": 0.8721551278857689, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 13631 + }, + { + "epoch": 0.13632, + "grad_norm": 0.8001208046961691, + "learning_rate": 0.003, + "loss": 4.096, + "step": 13632 + }, + { + "epoch": 0.13633, + "grad_norm": 0.6855495881523573, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 13633 + }, + { + "epoch": 0.13634, + "grad_norm": 0.7438299753527488, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 13634 + }, + { + "epoch": 0.13635, + "grad_norm": 0.7952533897916455, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 13635 + }, + { + "epoch": 0.13636, + "grad_norm": 0.8363194500609099, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 13636 + }, + { + "epoch": 0.13637, + "grad_norm": 0.7601839652731163, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 13637 + }, + { + "epoch": 0.13638, + "grad_norm": 0.6780506683164645, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 13638 + }, + { + "epoch": 0.13639, + "grad_norm": 0.683180007369749, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 13639 + }, + { + "epoch": 0.1364, + "grad_norm": 0.6438945473416481, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 13640 + }, + { + "epoch": 0.13641, + "grad_norm": 0.6499667187267382, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 13641 + }, + { + "epoch": 0.13642, + "grad_norm": 0.6850170351662737, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 13642 + }, + { + "epoch": 0.13643, + "grad_norm": 0.8136727294891523, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 13643 + }, + { + "epoch": 0.13644, + "grad_norm": 1.2156858921156055, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 13644 + }, + { + "epoch": 0.13645, + "grad_norm": 1.0436887520306182, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 13645 + }, + { + "epoch": 0.13646, + "grad_norm": 0.8449363069264667, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 13646 + }, + { + "epoch": 0.13647, + "grad_norm": 0.850875163767057, + "learning_rate": 0.003, + "loss": 4.077, + "step": 13647 + }, + { + "epoch": 0.13648, + "grad_norm": 0.95375218441901, + "learning_rate": 0.003, + "loss": 4.041, + "step": 13648 + }, + { + "epoch": 0.13649, + "grad_norm": 1.0050975309029315, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 13649 + }, + { + "epoch": 0.1365, + "grad_norm": 0.9843391727985866, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13650 + }, + { + "epoch": 0.13651, + "grad_norm": 0.9427451081512389, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 13651 + }, + { + "epoch": 0.13652, + "grad_norm": 0.9763292838665173, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 13652 + }, + { + "epoch": 0.13653, + "grad_norm": 0.8634704292443027, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 13653 + }, + { + "epoch": 0.13654, + "grad_norm": 0.9868341046720903, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 13654 + }, + { + "epoch": 0.13655, + "grad_norm": 1.0639061905588236, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 13655 + }, + { + "epoch": 0.13656, + "grad_norm": 1.091557124889247, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 13656 + }, + { + "epoch": 0.13657, + "grad_norm": 0.9231903366512457, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 13657 + }, + { + "epoch": 0.13658, + "grad_norm": 0.8486992017924497, + "learning_rate": 0.003, + "loss": 4.1105, + "step": 13658 + }, + { + "epoch": 0.13659, + "grad_norm": 0.7664700162395804, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 13659 + }, + { + "epoch": 0.1366, + "grad_norm": 0.7065379273868619, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 13660 + }, + { + "epoch": 0.13661, + "grad_norm": 0.7256696633312764, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 13661 + }, + { + "epoch": 0.13662, + "grad_norm": 0.7690012211967215, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 13662 + }, + { + "epoch": 0.13663, + "grad_norm": 0.9283692376422882, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 13663 + }, + { + "epoch": 0.13664, + "grad_norm": 1.013928354570009, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 13664 + }, + { + "epoch": 0.13665, + "grad_norm": 1.060865621299839, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 13665 + }, + { + "epoch": 0.13666, + "grad_norm": 0.9465780308512567, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 13666 + }, + { + "epoch": 0.13667, + "grad_norm": 0.9670648703906136, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 13667 + }, + { + "epoch": 0.13668, + "grad_norm": 0.8473843407878847, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 13668 + }, + { + "epoch": 0.13669, + "grad_norm": 0.7314897721635479, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 13669 + }, + { + "epoch": 0.1367, + "grad_norm": 0.7531751860487068, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 13670 + }, + { + "epoch": 0.13671, + "grad_norm": 0.8112896300846317, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 13671 + }, + { + "epoch": 0.13672, + "grad_norm": 0.8532693255583029, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 13672 + }, + { + "epoch": 0.13673, + "grad_norm": 0.9427863519402445, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 13673 + }, + { + "epoch": 0.13674, + "grad_norm": 1.084580485879212, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 13674 + }, + { + "epoch": 0.13675, + "grad_norm": 0.8723766795237089, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 13675 + }, + { + "epoch": 0.13676, + "grad_norm": 0.8475241093068473, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 13676 + }, + { + "epoch": 0.13677, + "grad_norm": 0.9196363658028469, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 13677 + }, + { + "epoch": 0.13678, + "grad_norm": 0.9787266625530816, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13678 + }, + { + "epoch": 0.13679, + "grad_norm": 0.9010503311425638, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 13679 + }, + { + "epoch": 0.1368, + "grad_norm": 0.7852953043656846, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13680 + }, + { + "epoch": 0.13681, + "grad_norm": 0.803772106831219, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 13681 + }, + { + "epoch": 0.13682, + "grad_norm": 0.7772930432252959, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 13682 + }, + { + "epoch": 0.13683, + "grad_norm": 0.8199857952108855, + "learning_rate": 0.003, + "loss": 4.073, + "step": 13683 + }, + { + "epoch": 0.13684, + "grad_norm": 0.8137760939512062, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 13684 + }, + { + "epoch": 0.13685, + "grad_norm": 0.8146180859156968, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 13685 + }, + { + "epoch": 0.13686, + "grad_norm": 0.921238466382573, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 13686 + }, + { + "epoch": 0.13687, + "grad_norm": 0.9444575147851021, + "learning_rate": 0.003, + "loss": 4.068, + "step": 13687 + }, + { + "epoch": 0.13688, + "grad_norm": 1.0487047341776896, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13688 + }, + { + "epoch": 0.13689, + "grad_norm": 1.0075912491629209, + "learning_rate": 0.003, + "loss": 4.098, + "step": 13689 + }, + { + "epoch": 0.1369, + "grad_norm": 0.870940565134766, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 13690 + }, + { + "epoch": 0.13691, + "grad_norm": 0.9240675323645391, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 13691 + }, + { + "epoch": 0.13692, + "grad_norm": 0.9599669386817655, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 13692 + }, + { + "epoch": 0.13693, + "grad_norm": 0.9415637855056248, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 13693 + }, + { + "epoch": 0.13694, + "grad_norm": 0.9501177605434156, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 13694 + }, + { + "epoch": 0.13695, + "grad_norm": 0.8100198531867937, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 13695 + }, + { + "epoch": 0.13696, + "grad_norm": 0.6679951906896466, + "learning_rate": 0.003, + "loss": 4.08, + "step": 13696 + }, + { + "epoch": 0.13697, + "grad_norm": 0.6921532768967519, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 13697 + }, + { + "epoch": 0.13698, + "grad_norm": 0.7470812239613058, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 13698 + }, + { + "epoch": 0.13699, + "grad_norm": 0.9108484007535782, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 13699 + }, + { + "epoch": 0.137, + "grad_norm": 1.1447311238807532, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13700 + }, + { + "epoch": 0.13701, + "grad_norm": 0.937413956264893, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 13701 + }, + { + "epoch": 0.13702, + "grad_norm": 0.8178232351099999, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 13702 + }, + { + "epoch": 0.13703, + "grad_norm": 0.7017352086026089, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 13703 + }, + { + "epoch": 0.13704, + "grad_norm": 0.6560245541563535, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 13704 + }, + { + "epoch": 0.13705, + "grad_norm": 0.6797524992943338, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 13705 + }, + { + "epoch": 0.13706, + "grad_norm": 0.7946133213095659, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 13706 + }, + { + "epoch": 0.13707, + "grad_norm": 0.78560764436037, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 13707 + }, + { + "epoch": 0.13708, + "grad_norm": 0.8277643879380545, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 13708 + }, + { + "epoch": 0.13709, + "grad_norm": 0.9993663880079139, + "learning_rate": 0.003, + "loss": 4.065, + "step": 13709 + }, + { + "epoch": 0.1371, + "grad_norm": 1.2098981925658665, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 13710 + }, + { + "epoch": 0.13711, + "grad_norm": 0.9019527402259153, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 13711 + }, + { + "epoch": 0.13712, + "grad_norm": 0.8275757039455064, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 13712 + }, + { + "epoch": 0.13713, + "grad_norm": 0.7899362750168583, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 13713 + }, + { + "epoch": 0.13714, + "grad_norm": 0.8177296214244287, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 13714 + }, + { + "epoch": 0.13715, + "grad_norm": 0.7864591178039964, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 13715 + }, + { + "epoch": 0.13716, + "grad_norm": 0.6699660162112218, + "learning_rate": 0.003, + "loss": 4.037, + "step": 13716 + }, + { + "epoch": 0.13717, + "grad_norm": 0.6090747823320872, + "learning_rate": 0.003, + "loss": 4.064, + "step": 13717 + }, + { + "epoch": 0.13718, + "grad_norm": 0.7320085282083384, + "learning_rate": 0.003, + "loss": 4.081, + "step": 13718 + }, + { + "epoch": 0.13719, + "grad_norm": 0.9370165910644526, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 13719 + }, + { + "epoch": 0.1372, + "grad_norm": 1.18129103744682, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 13720 + }, + { + "epoch": 0.13721, + "grad_norm": 0.7980167825842891, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 13721 + }, + { + "epoch": 0.13722, + "grad_norm": 0.7206463655346406, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 13722 + }, + { + "epoch": 0.13723, + "grad_norm": 0.7338634631935627, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 13723 + }, + { + "epoch": 0.13724, + "grad_norm": 0.7909052635785321, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 13724 + }, + { + "epoch": 0.13725, + "grad_norm": 0.8325526086003372, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13725 + }, + { + "epoch": 0.13726, + "grad_norm": 0.8030937153949256, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 13726 + }, + { + "epoch": 0.13727, + "grad_norm": 0.8264953386721401, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 13727 + }, + { + "epoch": 0.13728, + "grad_norm": 0.8508217607353825, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 13728 + }, + { + "epoch": 0.13729, + "grad_norm": 0.946430824699291, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13729 + }, + { + "epoch": 0.1373, + "grad_norm": 1.0914020403010114, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 13730 + }, + { + "epoch": 0.13731, + "grad_norm": 0.9288554160214816, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13731 + }, + { + "epoch": 0.13732, + "grad_norm": 0.8798609706288661, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 13732 + }, + { + "epoch": 0.13733, + "grad_norm": 1.0763765339207674, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 13733 + }, + { + "epoch": 0.13734, + "grad_norm": 1.1094576116187247, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 13734 + }, + { + "epoch": 0.13735, + "grad_norm": 0.72949427796053, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 13735 + }, + { + "epoch": 0.13736, + "grad_norm": 0.6207999180124579, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 13736 + }, + { + "epoch": 0.13737, + "grad_norm": 0.6539839556099966, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 13737 + }, + { + "epoch": 0.13738, + "grad_norm": 0.6969971908777107, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 13738 + }, + { + "epoch": 0.13739, + "grad_norm": 0.7285473962073619, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 13739 + }, + { + "epoch": 0.1374, + "grad_norm": 0.8102903398863351, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 13740 + }, + { + "epoch": 0.13741, + "grad_norm": 1.0221097889574713, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 13741 + }, + { + "epoch": 0.13742, + "grad_norm": 0.9903955434145519, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 13742 + }, + { + "epoch": 0.13743, + "grad_norm": 0.8947687196571883, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 13743 + }, + { + "epoch": 0.13744, + "grad_norm": 0.714024632599002, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 13744 + }, + { + "epoch": 0.13745, + "grad_norm": 0.7498165544155051, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 13745 + }, + { + "epoch": 0.13746, + "grad_norm": 0.8127335402003885, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 13746 + }, + { + "epoch": 0.13747, + "grad_norm": 0.8200482630097694, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13747 + }, + { + "epoch": 0.13748, + "grad_norm": 0.8783438830484342, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 13748 + }, + { + "epoch": 0.13749, + "grad_norm": 0.8888799915051669, + "learning_rate": 0.003, + "loss": 4.065, + "step": 13749 + }, + { + "epoch": 0.1375, + "grad_norm": 0.9885813087307731, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 13750 + }, + { + "epoch": 0.13751, + "grad_norm": 1.2135856859992784, + "learning_rate": 0.003, + "loss": 4.111, + "step": 13751 + }, + { + "epoch": 0.13752, + "grad_norm": 0.7798500906983413, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 13752 + }, + { + "epoch": 0.13753, + "grad_norm": 0.7356701486495796, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 13753 + }, + { + "epoch": 0.13754, + "grad_norm": 0.9016638345091263, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 13754 + }, + { + "epoch": 0.13755, + "grad_norm": 1.1033413079780579, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 13755 + }, + { + "epoch": 0.13756, + "grad_norm": 0.927188631548279, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 13756 + }, + { + "epoch": 0.13757, + "grad_norm": 0.7729270789457843, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 13757 + }, + { + "epoch": 0.13758, + "grad_norm": 0.7363115802661809, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 13758 + }, + { + "epoch": 0.13759, + "grad_norm": 0.7592211922772076, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 13759 + }, + { + "epoch": 0.1376, + "grad_norm": 0.7446501455003881, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 13760 + }, + { + "epoch": 0.13761, + "grad_norm": 0.7212238566156061, + "learning_rate": 0.003, + "loss": 4.034, + "step": 13761 + }, + { + "epoch": 0.13762, + "grad_norm": 0.7581711637419875, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 13762 + }, + { + "epoch": 0.13763, + "grad_norm": 0.8006710198015181, + "learning_rate": 0.003, + "loss": 4.1064, + "step": 13763 + }, + { + "epoch": 0.13764, + "grad_norm": 0.9664995701855025, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 13764 + }, + { + "epoch": 0.13765, + "grad_norm": 1.1672346999485474, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 13765 + }, + { + "epoch": 0.13766, + "grad_norm": 0.9109583284164834, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 13766 + }, + { + "epoch": 0.13767, + "grad_norm": 0.8452015780168584, + "learning_rate": 0.003, + "loss": 4.054, + "step": 13767 + }, + { + "epoch": 0.13768, + "grad_norm": 0.8119502560786048, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 13768 + }, + { + "epoch": 0.13769, + "grad_norm": 1.0072815571314082, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 13769 + }, + { + "epoch": 0.1377, + "grad_norm": 1.165869816482718, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 13770 + }, + { + "epoch": 0.13771, + "grad_norm": 0.9084630764295751, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 13771 + }, + { + "epoch": 0.13772, + "grad_norm": 0.844520953849131, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 13772 + }, + { + "epoch": 0.13773, + "grad_norm": 0.8726872555065581, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 13773 + }, + { + "epoch": 0.13774, + "grad_norm": 0.7793340636550928, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 13774 + }, + { + "epoch": 0.13775, + "grad_norm": 0.8126996419655287, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 13775 + }, + { + "epoch": 0.13776, + "grad_norm": 0.8801569005034962, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 13776 + }, + { + "epoch": 0.13777, + "grad_norm": 0.9551286417300697, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 13777 + }, + { + "epoch": 0.13778, + "grad_norm": 0.8165579703433642, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 13778 + }, + { + "epoch": 0.13779, + "grad_norm": 0.7789435433001016, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 13779 + }, + { + "epoch": 0.1378, + "grad_norm": 0.7928376917151682, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 13780 + }, + { + "epoch": 0.13781, + "grad_norm": 0.7481096316132387, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 13781 + }, + { + "epoch": 0.13782, + "grad_norm": 0.7777503947233947, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13782 + }, + { + "epoch": 0.13783, + "grad_norm": 0.8542991874290908, + "learning_rate": 0.003, + "loss": 4.1193, + "step": 13783 + }, + { + "epoch": 0.13784, + "grad_norm": 0.9720862804136864, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 13784 + }, + { + "epoch": 0.13785, + "grad_norm": 1.1154877986375358, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 13785 + }, + { + "epoch": 0.13786, + "grad_norm": 0.9630735815246285, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 13786 + }, + { + "epoch": 0.13787, + "grad_norm": 0.7948417831976556, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 13787 + }, + { + "epoch": 0.13788, + "grad_norm": 0.648743357212664, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 13788 + }, + { + "epoch": 0.13789, + "grad_norm": 0.6241391617920274, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 13789 + }, + { + "epoch": 0.1379, + "grad_norm": 0.5971428516667047, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 13790 + }, + { + "epoch": 0.13791, + "grad_norm": 0.5367881789167211, + "learning_rate": 0.003, + "loss": 4.078, + "step": 13791 + }, + { + "epoch": 0.13792, + "grad_norm": 0.5559013858329298, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 13792 + }, + { + "epoch": 0.13793, + "grad_norm": 0.5767161875300437, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 13793 + }, + { + "epoch": 0.13794, + "grad_norm": 0.5799682646889748, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13794 + }, + { + "epoch": 0.13795, + "grad_norm": 0.7147042718304768, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 13795 + }, + { + "epoch": 0.13796, + "grad_norm": 0.8674555532184266, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 13796 + }, + { + "epoch": 0.13797, + "grad_norm": 0.9454446297792318, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 13797 + }, + { + "epoch": 0.13798, + "grad_norm": 0.9166854521804426, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 13798 + }, + { + "epoch": 0.13799, + "grad_norm": 1.114005996475773, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 13799 + }, + { + "epoch": 0.138, + "grad_norm": 1.114153798356774, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 13800 + }, + { + "epoch": 0.13801, + "grad_norm": 1.1213716295631775, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 13801 + }, + { + "epoch": 0.13802, + "grad_norm": 0.8689056280736214, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 13802 + }, + { + "epoch": 0.13803, + "grad_norm": 0.7907977137360918, + "learning_rate": 0.003, + "loss": 4.076, + "step": 13803 + }, + { + "epoch": 0.13804, + "grad_norm": 0.8141518765569847, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 13804 + }, + { + "epoch": 0.13805, + "grad_norm": 0.7293812965504043, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 13805 + }, + { + "epoch": 0.13806, + "grad_norm": 0.709596218108507, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 13806 + }, + { + "epoch": 0.13807, + "grad_norm": 0.7848457864855878, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 13807 + }, + { + "epoch": 0.13808, + "grad_norm": 0.8585058692782394, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 13808 + }, + { + "epoch": 0.13809, + "grad_norm": 0.8959977653543688, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 13809 + }, + { + "epoch": 0.1381, + "grad_norm": 0.7245440460794622, + "learning_rate": 0.003, + "loss": 4.047, + "step": 13810 + }, + { + "epoch": 0.13811, + "grad_norm": 0.6400324366048498, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 13811 + }, + { + "epoch": 0.13812, + "grad_norm": 0.6964389374234248, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 13812 + }, + { + "epoch": 0.13813, + "grad_norm": 0.6628085333605657, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 13813 + }, + { + "epoch": 0.13814, + "grad_norm": 0.6258434802751189, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 13814 + }, + { + "epoch": 0.13815, + "grad_norm": 0.6415033432035515, + "learning_rate": 0.003, + "loss": 4.038, + "step": 13815 + }, + { + "epoch": 0.13816, + "grad_norm": 0.7131256086359586, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 13816 + }, + { + "epoch": 0.13817, + "grad_norm": 0.8539536962396401, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 13817 + }, + { + "epoch": 0.13818, + "grad_norm": 1.2659528268865883, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 13818 + }, + { + "epoch": 0.13819, + "grad_norm": 0.9090084828272725, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 13819 + }, + { + "epoch": 0.1382, + "grad_norm": 0.8812080247924137, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 13820 + }, + { + "epoch": 0.13821, + "grad_norm": 1.0378943064189554, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 13821 + }, + { + "epoch": 0.13822, + "grad_norm": 1.1041204426762394, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 13822 + }, + { + "epoch": 0.13823, + "grad_norm": 0.8189568980915312, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 13823 + }, + { + "epoch": 0.13824, + "grad_norm": 0.8858936483460652, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 13824 + }, + { + "epoch": 0.13825, + "grad_norm": 1.1098405437999121, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 13825 + }, + { + "epoch": 0.13826, + "grad_norm": 1.0283270979570271, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13826 + }, + { + "epoch": 0.13827, + "grad_norm": 0.9376045888039426, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 13827 + }, + { + "epoch": 0.13828, + "grad_norm": 0.9563883985403528, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 13828 + }, + { + "epoch": 0.13829, + "grad_norm": 0.7715868042355665, + "learning_rate": 0.003, + "loss": 4.038, + "step": 13829 + }, + { + "epoch": 0.1383, + "grad_norm": 0.7879364212968953, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13830 + }, + { + "epoch": 0.13831, + "grad_norm": 0.9418861363077929, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 13831 + }, + { + "epoch": 0.13832, + "grad_norm": 1.1475358778348934, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 13832 + }, + { + "epoch": 0.13833, + "grad_norm": 1.1540925669805697, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13833 + }, + { + "epoch": 0.13834, + "grad_norm": 0.8875999565754119, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13834 + }, + { + "epoch": 0.13835, + "grad_norm": 0.6877634991361685, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 13835 + }, + { + "epoch": 0.13836, + "grad_norm": 0.6578700584264132, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 13836 + }, + { + "epoch": 0.13837, + "grad_norm": 0.6227258100422538, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 13837 + }, + { + "epoch": 0.13838, + "grad_norm": 0.7002738833360858, + "learning_rate": 0.003, + "loss": 4.027, + "step": 13838 + }, + { + "epoch": 0.13839, + "grad_norm": 0.749248817125079, + "learning_rate": 0.003, + "loss": 4.08, + "step": 13839 + }, + { + "epoch": 0.1384, + "grad_norm": 0.9289441131530395, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 13840 + }, + { + "epoch": 0.13841, + "grad_norm": 1.1013445196068063, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 13841 + }, + { + "epoch": 0.13842, + "grad_norm": 0.932321072376433, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 13842 + }, + { + "epoch": 0.13843, + "grad_norm": 0.977524130860534, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 13843 + }, + { + "epoch": 0.13844, + "grad_norm": 1.0408277592904422, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 13844 + }, + { + "epoch": 0.13845, + "grad_norm": 1.1276841668984763, + "learning_rate": 0.003, + "loss": 4.09, + "step": 13845 + }, + { + "epoch": 0.13846, + "grad_norm": 0.9033755025070581, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 13846 + }, + { + "epoch": 0.13847, + "grad_norm": 0.8063325033290983, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 13847 + }, + { + "epoch": 0.13848, + "grad_norm": 0.7751890589883782, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 13848 + }, + { + "epoch": 0.13849, + "grad_norm": 0.7130523394391949, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 13849 + }, + { + "epoch": 0.1385, + "grad_norm": 0.7422819135138875, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 13850 + }, + { + "epoch": 0.13851, + "grad_norm": 0.772559891336259, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 13851 + }, + { + "epoch": 0.13852, + "grad_norm": 1.0305335746506892, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 13852 + }, + { + "epoch": 0.13853, + "grad_norm": 1.0686821057604816, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 13853 + }, + { + "epoch": 0.13854, + "grad_norm": 0.8617161136862561, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 13854 + }, + { + "epoch": 0.13855, + "grad_norm": 0.8959961298539898, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 13855 + }, + { + "epoch": 0.13856, + "grad_norm": 0.8942780945647872, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 13856 + }, + { + "epoch": 0.13857, + "grad_norm": 0.9724266400760307, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13857 + }, + { + "epoch": 0.13858, + "grad_norm": 0.892095929966666, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 13858 + }, + { + "epoch": 0.13859, + "grad_norm": 0.7845456057332171, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 13859 + }, + { + "epoch": 0.1386, + "grad_norm": 0.7863202510170799, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 13860 + }, + { + "epoch": 0.13861, + "grad_norm": 0.724850270661534, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13861 + }, + { + "epoch": 0.13862, + "grad_norm": 0.682626036200262, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 13862 + }, + { + "epoch": 0.13863, + "grad_norm": 0.7313618271309567, + "learning_rate": 0.003, + "loss": 4.1088, + "step": 13863 + }, + { + "epoch": 0.13864, + "grad_norm": 0.62749590097859, + "learning_rate": 0.003, + "loss": 4.09, + "step": 13864 + }, + { + "epoch": 0.13865, + "grad_norm": 0.6049866909369687, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 13865 + }, + { + "epoch": 0.13866, + "grad_norm": 0.9049127148877567, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 13866 + }, + { + "epoch": 0.13867, + "grad_norm": 1.3325377235073774, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 13867 + }, + { + "epoch": 0.13868, + "grad_norm": 0.7819830335163918, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 13868 + }, + { + "epoch": 0.13869, + "grad_norm": 0.6834694667388045, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 13869 + }, + { + "epoch": 0.1387, + "grad_norm": 0.6118272056142949, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 13870 + }, + { + "epoch": 0.13871, + "grad_norm": 0.6834278316785658, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 13871 + }, + { + "epoch": 0.13872, + "grad_norm": 0.9213148671100461, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 13872 + }, + { + "epoch": 0.13873, + "grad_norm": 1.0221627581595059, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 13873 + }, + { + "epoch": 0.13874, + "grad_norm": 1.0267626825893557, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 13874 + }, + { + "epoch": 0.13875, + "grad_norm": 0.8997216322344997, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 13875 + }, + { + "epoch": 0.13876, + "grad_norm": 0.904524049711079, + "learning_rate": 0.003, + "loss": 4.089, + "step": 13876 + }, + { + "epoch": 0.13877, + "grad_norm": 1.075224551654725, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 13877 + }, + { + "epoch": 0.13878, + "grad_norm": 0.8739164555337202, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 13878 + }, + { + "epoch": 0.13879, + "grad_norm": 0.7676271792656019, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 13879 + }, + { + "epoch": 0.1388, + "grad_norm": 0.8698458064329295, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 13880 + }, + { + "epoch": 0.13881, + "grad_norm": 0.997623534384448, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 13881 + }, + { + "epoch": 0.13882, + "grad_norm": 1.0346764527746597, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 13882 + }, + { + "epoch": 0.13883, + "grad_norm": 1.146461851359024, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 13883 + }, + { + "epoch": 0.13884, + "grad_norm": 0.9582886184544946, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 13884 + }, + { + "epoch": 0.13885, + "grad_norm": 1.120398293538235, + "learning_rate": 0.003, + "loss": 4.122, + "step": 13885 + }, + { + "epoch": 0.13886, + "grad_norm": 0.9425712394898523, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 13886 + }, + { + "epoch": 0.13887, + "grad_norm": 0.8458723555471758, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13887 + }, + { + "epoch": 0.13888, + "grad_norm": 0.7586531946489982, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 13888 + }, + { + "epoch": 0.13889, + "grad_norm": 0.8178178766092938, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 13889 + }, + { + "epoch": 0.1389, + "grad_norm": 1.0274999663987239, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 13890 + }, + { + "epoch": 0.13891, + "grad_norm": 1.088422536589401, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 13891 + }, + { + "epoch": 0.13892, + "grad_norm": 0.9934566817900852, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 13892 + }, + { + "epoch": 0.13893, + "grad_norm": 1.0979690315596429, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 13893 + }, + { + "epoch": 0.13894, + "grad_norm": 0.7359317410490038, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 13894 + }, + { + "epoch": 0.13895, + "grad_norm": 0.6660798775406379, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 13895 + }, + { + "epoch": 0.13896, + "grad_norm": 0.6068604475042304, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 13896 + }, + { + "epoch": 0.13897, + "grad_norm": 0.5554031840270844, + "learning_rate": 0.003, + "loss": 4.067, + "step": 13897 + }, + { + "epoch": 0.13898, + "grad_norm": 0.5343212773662731, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 13898 + }, + { + "epoch": 0.13899, + "grad_norm": 0.5942944529885672, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 13899 + }, + { + "epoch": 0.139, + "grad_norm": 0.5655268819606185, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 13900 + }, + { + "epoch": 0.13901, + "grad_norm": 0.5735486715388576, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 13901 + }, + { + "epoch": 0.13902, + "grad_norm": 0.5578141470648175, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 13902 + }, + { + "epoch": 0.13903, + "grad_norm": 0.5952485590520994, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 13903 + }, + { + "epoch": 0.13904, + "grad_norm": 0.702839100454558, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 13904 + }, + { + "epoch": 0.13905, + "grad_norm": 0.9194180864002266, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13905 + }, + { + "epoch": 0.13906, + "grad_norm": 0.9918127068613594, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 13906 + }, + { + "epoch": 0.13907, + "grad_norm": 0.9982642946562648, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 13907 + }, + { + "epoch": 0.13908, + "grad_norm": 1.1028592223759681, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 13908 + }, + { + "epoch": 0.13909, + "grad_norm": 0.9119818024419278, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 13909 + }, + { + "epoch": 0.1391, + "grad_norm": 0.9586893804229246, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 13910 + }, + { + "epoch": 0.13911, + "grad_norm": 1.0439638832992049, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 13911 + }, + { + "epoch": 0.13912, + "grad_norm": 0.8210400684919287, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 13912 + }, + { + "epoch": 0.13913, + "grad_norm": 0.8086768694094838, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 13913 + }, + { + "epoch": 0.13914, + "grad_norm": 0.8101091875046591, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 13914 + }, + { + "epoch": 0.13915, + "grad_norm": 0.8725203305820736, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 13915 + }, + { + "epoch": 0.13916, + "grad_norm": 0.8990905891535143, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 13916 + }, + { + "epoch": 0.13917, + "grad_norm": 0.9909167123501083, + "learning_rate": 0.003, + "loss": 4.1084, + "step": 13917 + }, + { + "epoch": 0.13918, + "grad_norm": 1.0230599514140737, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 13918 + }, + { + "epoch": 0.13919, + "grad_norm": 1.07078297100314, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 13919 + }, + { + "epoch": 0.1392, + "grad_norm": 1.1701770532662963, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 13920 + }, + { + "epoch": 0.13921, + "grad_norm": 1.125390222000617, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 13921 + }, + { + "epoch": 0.13922, + "grad_norm": 1.1757748833898825, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 13922 + }, + { + "epoch": 0.13923, + "grad_norm": 0.9656144153356182, + "learning_rate": 0.003, + "loss": 4.1, + "step": 13923 + }, + { + "epoch": 0.13924, + "grad_norm": 0.9040822950204922, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 13924 + }, + { + "epoch": 0.13925, + "grad_norm": 0.9315380370175754, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 13925 + }, + { + "epoch": 0.13926, + "grad_norm": 0.8737668104257097, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 13926 + }, + { + "epoch": 0.13927, + "grad_norm": 0.8291343492556099, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 13927 + }, + { + "epoch": 0.13928, + "grad_norm": 0.7113404829726243, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 13928 + }, + { + "epoch": 0.13929, + "grad_norm": 0.741200172950455, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 13929 + }, + { + "epoch": 0.1393, + "grad_norm": 0.6613773919670253, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 13930 + }, + { + "epoch": 0.13931, + "grad_norm": 0.6653353843683376, + "learning_rate": 0.003, + "loss": 4.006, + "step": 13931 + }, + { + "epoch": 0.13932, + "grad_norm": 0.6287574812459211, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 13932 + }, + { + "epoch": 0.13933, + "grad_norm": 0.5674413496101179, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 13933 + }, + { + "epoch": 0.13934, + "grad_norm": 0.5880944332131156, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 13934 + }, + { + "epoch": 0.13935, + "grad_norm": 0.6342207765203128, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 13935 + }, + { + "epoch": 0.13936, + "grad_norm": 0.6101277789479459, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 13936 + }, + { + "epoch": 0.13937, + "grad_norm": 0.6865857511421336, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 13937 + }, + { + "epoch": 0.13938, + "grad_norm": 0.854569847730593, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 13938 + }, + { + "epoch": 0.13939, + "grad_norm": 1.0856281334857132, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 13939 + }, + { + "epoch": 0.1394, + "grad_norm": 1.147959372718614, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 13940 + }, + { + "epoch": 0.13941, + "grad_norm": 0.8908493754537409, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 13941 + }, + { + "epoch": 0.13942, + "grad_norm": 0.9028758990435468, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 13942 + }, + { + "epoch": 0.13943, + "grad_norm": 0.9649037667031776, + "learning_rate": 0.003, + "loss": 4.1223, + "step": 13943 + }, + { + "epoch": 0.13944, + "grad_norm": 0.9874675628318524, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 13944 + }, + { + "epoch": 0.13945, + "grad_norm": 1.018584331238473, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 13945 + }, + { + "epoch": 0.13946, + "grad_norm": 0.9947494917345606, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 13946 + }, + { + "epoch": 0.13947, + "grad_norm": 0.9749607135922929, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 13947 + }, + { + "epoch": 0.13948, + "grad_norm": 0.9238473106107602, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 13948 + }, + { + "epoch": 0.13949, + "grad_norm": 0.9349268951940659, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 13949 + }, + { + "epoch": 0.1395, + "grad_norm": 1.0428619217148167, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 13950 + }, + { + "epoch": 0.13951, + "grad_norm": 0.8459673684923922, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 13951 + }, + { + "epoch": 0.13952, + "grad_norm": 0.6852193986489556, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 13952 + }, + { + "epoch": 0.13953, + "grad_norm": 0.617506539409373, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 13953 + }, + { + "epoch": 0.13954, + "grad_norm": 0.6406273910256604, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 13954 + }, + { + "epoch": 0.13955, + "grad_norm": 0.7470615445445485, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 13955 + }, + { + "epoch": 0.13956, + "grad_norm": 0.8323240315799879, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 13956 + }, + { + "epoch": 0.13957, + "grad_norm": 0.9840882166301426, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 13957 + }, + { + "epoch": 0.13958, + "grad_norm": 1.1136670107251858, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 13958 + }, + { + "epoch": 0.13959, + "grad_norm": 0.7949677641025017, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 13959 + }, + { + "epoch": 0.1396, + "grad_norm": 0.7616451119844375, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 13960 + }, + { + "epoch": 0.13961, + "grad_norm": 0.7505373073112688, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 13961 + }, + { + "epoch": 0.13962, + "grad_norm": 0.7526777815873916, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 13962 + }, + { + "epoch": 0.13963, + "grad_norm": 0.6195642763313239, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 13963 + }, + { + "epoch": 0.13964, + "grad_norm": 0.5757858013174563, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 13964 + }, + { + "epoch": 0.13965, + "grad_norm": 0.581188960888462, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 13965 + }, + { + "epoch": 0.13966, + "grad_norm": 0.634007454821538, + "learning_rate": 0.003, + "loss": 4.061, + "step": 13966 + }, + { + "epoch": 0.13967, + "grad_norm": 0.7685363948603434, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 13967 + }, + { + "epoch": 0.13968, + "grad_norm": 0.8791380930040772, + "learning_rate": 0.003, + "loss": 4.032, + "step": 13968 + }, + { + "epoch": 0.13969, + "grad_norm": 0.9370012053971709, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 13969 + }, + { + "epoch": 0.1397, + "grad_norm": 1.007104403630713, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 13970 + }, + { + "epoch": 0.13971, + "grad_norm": 1.1017121204227867, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 13971 + }, + { + "epoch": 0.13972, + "grad_norm": 0.9215751691915052, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 13972 + }, + { + "epoch": 0.13973, + "grad_norm": 1.0149330901881009, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 13973 + }, + { + "epoch": 0.13974, + "grad_norm": 1.2031116163852842, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 13974 + }, + { + "epoch": 0.13975, + "grad_norm": 0.9048277045744538, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 13975 + }, + { + "epoch": 0.13976, + "grad_norm": 0.8467577858000327, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 13976 + }, + { + "epoch": 0.13977, + "grad_norm": 0.9347928724178286, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 13977 + }, + { + "epoch": 0.13978, + "grad_norm": 0.9441370106156344, + "learning_rate": 0.003, + "loss": 4.1319, + "step": 13978 + }, + { + "epoch": 0.13979, + "grad_norm": 0.9813737971909722, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 13979 + }, + { + "epoch": 0.1398, + "grad_norm": 1.0800987666044952, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 13980 + }, + { + "epoch": 0.13981, + "grad_norm": 0.9155655036171332, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 13981 + }, + { + "epoch": 0.13982, + "grad_norm": 0.9348220152415775, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 13982 + }, + { + "epoch": 0.13983, + "grad_norm": 0.8703325281705804, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 13983 + }, + { + "epoch": 0.13984, + "grad_norm": 0.8021340331951412, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 13984 + }, + { + "epoch": 0.13985, + "grad_norm": 0.8026066895289523, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 13985 + }, + { + "epoch": 0.13986, + "grad_norm": 0.80465103900619, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 13986 + }, + { + "epoch": 0.13987, + "grad_norm": 0.782418310682838, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 13987 + }, + { + "epoch": 0.13988, + "grad_norm": 0.7134729135125821, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 13988 + }, + { + "epoch": 0.13989, + "grad_norm": 0.7007064251116866, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 13989 + }, + { + "epoch": 0.1399, + "grad_norm": 0.934409382302109, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 13990 + }, + { + "epoch": 0.13991, + "grad_norm": 1.0791515335095017, + "learning_rate": 0.003, + "loss": 4.083, + "step": 13991 + }, + { + "epoch": 0.13992, + "grad_norm": 0.9650430756826706, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 13992 + }, + { + "epoch": 0.13993, + "grad_norm": 0.8339000310403641, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 13993 + }, + { + "epoch": 0.13994, + "grad_norm": 0.7723832134323424, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 13994 + }, + { + "epoch": 0.13995, + "grad_norm": 0.7469181907748006, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 13995 + }, + { + "epoch": 0.13996, + "grad_norm": 0.6762581253213434, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 13996 + }, + { + "epoch": 0.13997, + "grad_norm": 0.6616456388943689, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 13997 + }, + { + "epoch": 0.13998, + "grad_norm": 0.7101556519082999, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 13998 + }, + { + "epoch": 0.13999, + "grad_norm": 0.7122169886672998, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 13999 + }, + { + "epoch": 0.14, + "grad_norm": 0.6968307619173715, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 14000 + }, + { + "epoch": 0.14001, + "grad_norm": 0.698056742340295, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 14001 + }, + { + "epoch": 0.14002, + "grad_norm": 0.7907075724163473, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 14002 + }, + { + "epoch": 0.14003, + "grad_norm": 0.7428160597908467, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 14003 + }, + { + "epoch": 0.14004, + "grad_norm": 0.6858756472286215, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 14004 + }, + { + "epoch": 0.14005, + "grad_norm": 0.92068368447873, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 14005 + }, + { + "epoch": 0.14006, + "grad_norm": 1.1320609772883197, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 14006 + }, + { + "epoch": 0.14007, + "grad_norm": 1.0124595191225807, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 14007 + }, + { + "epoch": 0.14008, + "grad_norm": 1.1553992392890877, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 14008 + }, + { + "epoch": 0.14009, + "grad_norm": 0.9028931663771032, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 14009 + }, + { + "epoch": 0.1401, + "grad_norm": 0.9385550516447129, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 14010 + }, + { + "epoch": 0.14011, + "grad_norm": 0.8041275843331671, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 14011 + }, + { + "epoch": 0.14012, + "grad_norm": 0.7327470492759001, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14012 + }, + { + "epoch": 0.14013, + "grad_norm": 0.7791426325933855, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 14013 + }, + { + "epoch": 0.14014, + "grad_norm": 0.7998075523884633, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 14014 + }, + { + "epoch": 0.14015, + "grad_norm": 0.7834602187439688, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 14015 + }, + { + "epoch": 0.14016, + "grad_norm": 0.8049002907527643, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 14016 + }, + { + "epoch": 0.14017, + "grad_norm": 0.8442423762394555, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 14017 + }, + { + "epoch": 0.14018, + "grad_norm": 0.8949825666045306, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 14018 + }, + { + "epoch": 0.14019, + "grad_norm": 0.9529272290674913, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 14019 + }, + { + "epoch": 0.1402, + "grad_norm": 1.0588908963705896, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 14020 + }, + { + "epoch": 0.14021, + "grad_norm": 0.8502772012425337, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 14021 + }, + { + "epoch": 0.14022, + "grad_norm": 0.8729125321703824, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 14022 + }, + { + "epoch": 0.14023, + "grad_norm": 0.9281444206531265, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 14023 + }, + { + "epoch": 0.14024, + "grad_norm": 0.9767142603182802, + "learning_rate": 0.003, + "loss": 4.083, + "step": 14024 + }, + { + "epoch": 0.14025, + "grad_norm": 0.8811350534417398, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 14025 + }, + { + "epoch": 0.14026, + "grad_norm": 0.7731178104895816, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 14026 + }, + { + "epoch": 0.14027, + "grad_norm": 0.8121738878256008, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 14027 + }, + { + "epoch": 0.14028, + "grad_norm": 1.0527593861995281, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 14028 + }, + { + "epoch": 0.14029, + "grad_norm": 1.0364354641585516, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 14029 + }, + { + "epoch": 0.1403, + "grad_norm": 0.9356382728995162, + "learning_rate": 0.003, + "loss": 4.071, + "step": 14030 + }, + { + "epoch": 0.14031, + "grad_norm": 0.9734215622713746, + "learning_rate": 0.003, + "loss": 4.062, + "step": 14031 + }, + { + "epoch": 0.14032, + "grad_norm": 0.9221041776060591, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 14032 + }, + { + "epoch": 0.14033, + "grad_norm": 0.942392631432111, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 14033 + }, + { + "epoch": 0.14034, + "grad_norm": 0.953675799144503, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 14034 + }, + { + "epoch": 0.14035, + "grad_norm": 1.0798346096269986, + "learning_rate": 0.003, + "loss": 4.1119, + "step": 14035 + }, + { + "epoch": 0.14036, + "grad_norm": 0.768493447080825, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 14036 + }, + { + "epoch": 0.14037, + "grad_norm": 0.6658366423896053, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 14037 + }, + { + "epoch": 0.14038, + "grad_norm": 0.6700488420832278, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 14038 + }, + { + "epoch": 0.14039, + "grad_norm": 0.6994751424346235, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 14039 + }, + { + "epoch": 0.1404, + "grad_norm": 0.6805216234807403, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 14040 + }, + { + "epoch": 0.14041, + "grad_norm": 0.7096213198173723, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 14041 + }, + { + "epoch": 0.14042, + "grad_norm": 0.9024095684360148, + "learning_rate": 0.003, + "loss": 4.088, + "step": 14042 + }, + { + "epoch": 0.14043, + "grad_norm": 1.004987005227543, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 14043 + }, + { + "epoch": 0.14044, + "grad_norm": 0.8703713622281967, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 14044 + }, + { + "epoch": 0.14045, + "grad_norm": 0.8162368451407708, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 14045 + }, + { + "epoch": 0.14046, + "grad_norm": 0.822083556580899, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 14046 + }, + { + "epoch": 0.14047, + "grad_norm": 1.0099070720931629, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 14047 + }, + { + "epoch": 0.14048, + "grad_norm": 1.2234046385348645, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 14048 + }, + { + "epoch": 0.14049, + "grad_norm": 1.0961981256367888, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 14049 + }, + { + "epoch": 0.1405, + "grad_norm": 0.9805499525283211, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 14050 + }, + { + "epoch": 0.14051, + "grad_norm": 0.8895750604840732, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 14051 + }, + { + "epoch": 0.14052, + "grad_norm": 0.7226459377749787, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 14052 + }, + { + "epoch": 0.14053, + "grad_norm": 0.6821221410680418, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 14053 + }, + { + "epoch": 0.14054, + "grad_norm": 0.6815291345070602, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 14054 + }, + { + "epoch": 0.14055, + "grad_norm": 0.6225126205146272, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 14055 + }, + { + "epoch": 0.14056, + "grad_norm": 0.5848437124738132, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14056 + }, + { + "epoch": 0.14057, + "grad_norm": 0.6192948262227931, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 14057 + }, + { + "epoch": 0.14058, + "grad_norm": 0.6817486571460282, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14058 + }, + { + "epoch": 0.14059, + "grad_norm": 0.8923821241111431, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 14059 + }, + { + "epoch": 0.1406, + "grad_norm": 1.0556781965538722, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 14060 + }, + { + "epoch": 0.14061, + "grad_norm": 0.9167875500653558, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14061 + }, + { + "epoch": 0.14062, + "grad_norm": 0.7317744511706858, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 14062 + }, + { + "epoch": 0.14063, + "grad_norm": 0.6455136777817637, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 14063 + }, + { + "epoch": 0.14064, + "grad_norm": 0.611610992771704, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14064 + }, + { + "epoch": 0.14065, + "grad_norm": 0.6983464827194192, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 14065 + }, + { + "epoch": 0.14066, + "grad_norm": 0.7025104082890304, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14066 + }, + { + "epoch": 0.14067, + "grad_norm": 0.7056840136314639, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 14067 + }, + { + "epoch": 0.14068, + "grad_norm": 0.7171403191106246, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 14068 + }, + { + "epoch": 0.14069, + "grad_norm": 0.7964383532066432, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 14069 + }, + { + "epoch": 0.1407, + "grad_norm": 0.992490117489188, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 14070 + }, + { + "epoch": 0.14071, + "grad_norm": 1.1888031121284164, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 14071 + }, + { + "epoch": 0.14072, + "grad_norm": 0.7550982764018704, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 14072 + }, + { + "epoch": 0.14073, + "grad_norm": 0.7178006563476692, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 14073 + }, + { + "epoch": 0.14074, + "grad_norm": 0.6979945759359226, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 14074 + }, + { + "epoch": 0.14075, + "grad_norm": 0.7517115145725634, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 14075 + }, + { + "epoch": 0.14076, + "grad_norm": 0.8971750250500409, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 14076 + }, + { + "epoch": 0.14077, + "grad_norm": 1.0061786676270656, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14077 + }, + { + "epoch": 0.14078, + "grad_norm": 1.0662018235332384, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14078 + }, + { + "epoch": 0.14079, + "grad_norm": 1.001016966281043, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 14079 + }, + { + "epoch": 0.1408, + "grad_norm": 1.0905427139186263, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 14080 + }, + { + "epoch": 0.14081, + "grad_norm": 0.8601233047418335, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 14081 + }, + { + "epoch": 0.14082, + "grad_norm": 0.8464053360349869, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 14082 + }, + { + "epoch": 0.14083, + "grad_norm": 0.8163832757468629, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 14083 + }, + { + "epoch": 0.14084, + "grad_norm": 0.8979789297652752, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 14084 + }, + { + "epoch": 0.14085, + "grad_norm": 0.9452687142481188, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 14085 + }, + { + "epoch": 0.14086, + "grad_norm": 1.2115870138728335, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 14086 + }, + { + "epoch": 0.14087, + "grad_norm": 0.8309684268595496, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 14087 + }, + { + "epoch": 0.14088, + "grad_norm": 0.6117436484071088, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 14088 + }, + { + "epoch": 0.14089, + "grad_norm": 0.7118110256820298, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14089 + }, + { + "epoch": 0.1409, + "grad_norm": 0.8719998742617723, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 14090 + }, + { + "epoch": 0.14091, + "grad_norm": 1.0052133984711733, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 14091 + }, + { + "epoch": 0.14092, + "grad_norm": 0.9740470677250189, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 14092 + }, + { + "epoch": 0.14093, + "grad_norm": 1.0051493119311377, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14093 + }, + { + "epoch": 0.14094, + "grad_norm": 0.9458928117251939, + "learning_rate": 0.003, + "loss": 4.046, + "step": 14094 + }, + { + "epoch": 0.14095, + "grad_norm": 0.975823588806917, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 14095 + }, + { + "epoch": 0.14096, + "grad_norm": 0.9906952928170704, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 14096 + }, + { + "epoch": 0.14097, + "grad_norm": 0.8720130556605404, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 14097 + }, + { + "epoch": 0.14098, + "grad_norm": 0.7929794084939011, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 14098 + }, + { + "epoch": 0.14099, + "grad_norm": 0.7608051857301364, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 14099 + }, + { + "epoch": 0.141, + "grad_norm": 0.6072227619693424, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 14100 + }, + { + "epoch": 0.14101, + "grad_norm": 0.6354583908656244, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 14101 + }, + { + "epoch": 0.14102, + "grad_norm": 0.7648589924969916, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 14102 + }, + { + "epoch": 0.14103, + "grad_norm": 0.75073125925008, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 14103 + }, + { + "epoch": 0.14104, + "grad_norm": 0.8944629521861538, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 14104 + }, + { + "epoch": 0.14105, + "grad_norm": 1.1252243979552374, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 14105 + }, + { + "epoch": 0.14106, + "grad_norm": 0.9598877414388742, + "learning_rate": 0.003, + "loss": 4.072, + "step": 14106 + }, + { + "epoch": 0.14107, + "grad_norm": 0.8757505916312833, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 14107 + }, + { + "epoch": 0.14108, + "grad_norm": 0.8919262266661061, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 14108 + }, + { + "epoch": 0.14109, + "grad_norm": 0.863228884529217, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 14109 + }, + { + "epoch": 0.1411, + "grad_norm": 0.905161408561225, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 14110 + }, + { + "epoch": 0.14111, + "grad_norm": 0.8490438955584823, + "learning_rate": 0.003, + "loss": 4.085, + "step": 14111 + }, + { + "epoch": 0.14112, + "grad_norm": 0.8766583441383862, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 14112 + }, + { + "epoch": 0.14113, + "grad_norm": 0.8638268884569851, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 14113 + }, + { + "epoch": 0.14114, + "grad_norm": 0.8155943482946549, + "learning_rate": 0.003, + "loss": 4.076, + "step": 14114 + }, + { + "epoch": 0.14115, + "grad_norm": 0.8161826207286162, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 14115 + }, + { + "epoch": 0.14116, + "grad_norm": 0.8791333392131072, + "learning_rate": 0.003, + "loss": 4.077, + "step": 14116 + }, + { + "epoch": 0.14117, + "grad_norm": 1.0696105564074323, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 14117 + }, + { + "epoch": 0.14118, + "grad_norm": 0.8707842677857357, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 14118 + }, + { + "epoch": 0.14119, + "grad_norm": 0.7663916898824941, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 14119 + }, + { + "epoch": 0.1412, + "grad_norm": 0.6122112788474803, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 14120 + }, + { + "epoch": 0.14121, + "grad_norm": 0.6259906539234324, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 14121 + }, + { + "epoch": 0.14122, + "grad_norm": 0.7133645033810277, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 14122 + }, + { + "epoch": 0.14123, + "grad_norm": 0.781718056704064, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 14123 + }, + { + "epoch": 0.14124, + "grad_norm": 0.8639987865513068, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14124 + }, + { + "epoch": 0.14125, + "grad_norm": 1.0610918576022226, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 14125 + }, + { + "epoch": 0.14126, + "grad_norm": 1.1447163595017156, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 14126 + }, + { + "epoch": 0.14127, + "grad_norm": 0.8163293324630697, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 14127 + }, + { + "epoch": 0.14128, + "grad_norm": 0.7343315992884264, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14128 + }, + { + "epoch": 0.14129, + "grad_norm": 0.7480710336682936, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 14129 + }, + { + "epoch": 0.1413, + "grad_norm": 0.7368728548340789, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 14130 + }, + { + "epoch": 0.14131, + "grad_norm": 0.9105225962776399, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 14131 + }, + { + "epoch": 0.14132, + "grad_norm": 1.1385488373812027, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 14132 + }, + { + "epoch": 0.14133, + "grad_norm": 0.913119046010273, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 14133 + }, + { + "epoch": 0.14134, + "grad_norm": 0.7657609624981269, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 14134 + }, + { + "epoch": 0.14135, + "grad_norm": 0.7609010398091507, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 14135 + }, + { + "epoch": 0.14136, + "grad_norm": 0.8340366546380981, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 14136 + }, + { + "epoch": 0.14137, + "grad_norm": 0.87974034458899, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 14137 + }, + { + "epoch": 0.14138, + "grad_norm": 0.9060336588360559, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 14138 + }, + { + "epoch": 0.14139, + "grad_norm": 0.9785030316979524, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 14139 + }, + { + "epoch": 0.1414, + "grad_norm": 1.0251792333533782, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 14140 + }, + { + "epoch": 0.14141, + "grad_norm": 1.0165317489503691, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 14141 + }, + { + "epoch": 0.14142, + "grad_norm": 1.1538977045900214, + "learning_rate": 0.003, + "loss": 4.073, + "step": 14142 + }, + { + "epoch": 0.14143, + "grad_norm": 1.0788233762164792, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 14143 + }, + { + "epoch": 0.14144, + "grad_norm": 0.9010833889002156, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 14144 + }, + { + "epoch": 0.14145, + "grad_norm": 0.9613137075857177, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 14145 + }, + { + "epoch": 0.14146, + "grad_norm": 1.0433242475261193, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 14146 + }, + { + "epoch": 0.14147, + "grad_norm": 1.0346052233749885, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 14147 + }, + { + "epoch": 0.14148, + "grad_norm": 1.0047935776327652, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 14148 + }, + { + "epoch": 0.14149, + "grad_norm": 1.0070191874423993, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 14149 + }, + { + "epoch": 0.1415, + "grad_norm": 0.9403234730370085, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 14150 + }, + { + "epoch": 0.14151, + "grad_norm": 0.9340113204611356, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 14151 + }, + { + "epoch": 0.14152, + "grad_norm": 0.9095788780203912, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 14152 + }, + { + "epoch": 0.14153, + "grad_norm": 0.9235734595743118, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 14153 + }, + { + "epoch": 0.14154, + "grad_norm": 0.8878509247876392, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 14154 + }, + { + "epoch": 0.14155, + "grad_norm": 0.8436709962119316, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 14155 + }, + { + "epoch": 0.14156, + "grad_norm": 0.7725028756807933, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 14156 + }, + { + "epoch": 0.14157, + "grad_norm": 0.7607021345434494, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 14157 + }, + { + "epoch": 0.14158, + "grad_norm": 0.7837363952161858, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 14158 + }, + { + "epoch": 0.14159, + "grad_norm": 0.7377747534018623, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 14159 + }, + { + "epoch": 0.1416, + "grad_norm": 0.8685123610464288, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 14160 + }, + { + "epoch": 0.14161, + "grad_norm": 1.05678529283597, + "learning_rate": 0.003, + "loss": 4.1218, + "step": 14161 + }, + { + "epoch": 0.14162, + "grad_norm": 1.3045732595480373, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14162 + }, + { + "epoch": 0.14163, + "grad_norm": 0.6138736597121288, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 14163 + }, + { + "epoch": 0.14164, + "grad_norm": 0.6664174437649203, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 14164 + }, + { + "epoch": 0.14165, + "grad_norm": 0.7351929638196077, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 14165 + }, + { + "epoch": 0.14166, + "grad_norm": 0.6837082944117285, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 14166 + }, + { + "epoch": 0.14167, + "grad_norm": 0.6485593388343944, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 14167 + }, + { + "epoch": 0.14168, + "grad_norm": 0.6502219497216053, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 14168 + }, + { + "epoch": 0.14169, + "grad_norm": 0.6530460950303256, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 14169 + }, + { + "epoch": 0.1417, + "grad_norm": 0.5314371458766793, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14170 + }, + { + "epoch": 0.14171, + "grad_norm": 0.5397748506892242, + "learning_rate": 0.003, + "loss": 4.07, + "step": 14171 + }, + { + "epoch": 0.14172, + "grad_norm": 0.5907464608832607, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 14172 + }, + { + "epoch": 0.14173, + "grad_norm": 0.5760272288509668, + "learning_rate": 0.003, + "loss": 4.071, + "step": 14173 + }, + { + "epoch": 0.14174, + "grad_norm": 0.7023072638492589, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 14174 + }, + { + "epoch": 0.14175, + "grad_norm": 1.0348162640920575, + "learning_rate": 0.003, + "loss": 4.086, + "step": 14175 + }, + { + "epoch": 0.14176, + "grad_norm": 1.4191969408141354, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 14176 + }, + { + "epoch": 0.14177, + "grad_norm": 0.5762605622983992, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 14177 + }, + { + "epoch": 0.14178, + "grad_norm": 0.8349905059365021, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 14178 + }, + { + "epoch": 0.14179, + "grad_norm": 1.0238856329640418, + "learning_rate": 0.003, + "loss": 4.083, + "step": 14179 + }, + { + "epoch": 0.1418, + "grad_norm": 0.8958196113982968, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 14180 + }, + { + "epoch": 0.14181, + "grad_norm": 0.8488290140329743, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 14181 + }, + { + "epoch": 0.14182, + "grad_norm": 0.85594951057921, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 14182 + }, + { + "epoch": 0.14183, + "grad_norm": 0.8445824988413334, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 14183 + }, + { + "epoch": 0.14184, + "grad_norm": 1.0823833226303805, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 14184 + }, + { + "epoch": 0.14185, + "grad_norm": 1.2134570167918746, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 14185 + }, + { + "epoch": 0.14186, + "grad_norm": 0.8395744683994786, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14186 + }, + { + "epoch": 0.14187, + "grad_norm": 0.736679291662909, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 14187 + }, + { + "epoch": 0.14188, + "grad_norm": 0.8722552700424218, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 14188 + }, + { + "epoch": 0.14189, + "grad_norm": 0.7933914534797809, + "learning_rate": 0.003, + "loss": 4.065, + "step": 14189 + }, + { + "epoch": 0.1419, + "grad_norm": 0.7511860515790721, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 14190 + }, + { + "epoch": 0.14191, + "grad_norm": 0.7577322350020178, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14191 + }, + { + "epoch": 0.14192, + "grad_norm": 0.7852520407334463, + "learning_rate": 0.003, + "loss": 4.1143, + "step": 14192 + }, + { + "epoch": 0.14193, + "grad_norm": 0.8740960536673468, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 14193 + }, + { + "epoch": 0.14194, + "grad_norm": 0.9454865942734588, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 14194 + }, + { + "epoch": 0.14195, + "grad_norm": 1.0260824379078455, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 14195 + }, + { + "epoch": 0.14196, + "grad_norm": 1.128710367507716, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 14196 + }, + { + "epoch": 0.14197, + "grad_norm": 0.8702706333594432, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14197 + }, + { + "epoch": 0.14198, + "grad_norm": 0.855801841251651, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14198 + }, + { + "epoch": 0.14199, + "grad_norm": 0.79570611148028, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 14199 + }, + { + "epoch": 0.142, + "grad_norm": 0.7762153897839794, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 14200 + }, + { + "epoch": 0.14201, + "grad_norm": 0.8313138130976775, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 14201 + }, + { + "epoch": 0.14202, + "grad_norm": 0.9102058475281254, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 14202 + }, + { + "epoch": 0.14203, + "grad_norm": 0.9879444850830352, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 14203 + }, + { + "epoch": 0.14204, + "grad_norm": 0.9727375382419631, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14204 + }, + { + "epoch": 0.14205, + "grad_norm": 1.072343905396286, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 14205 + }, + { + "epoch": 0.14206, + "grad_norm": 1.0918392136913837, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 14206 + }, + { + "epoch": 0.14207, + "grad_norm": 0.9079756520513552, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 14207 + }, + { + "epoch": 0.14208, + "grad_norm": 1.0456318462186016, + "learning_rate": 0.003, + "loss": 4.068, + "step": 14208 + }, + { + "epoch": 0.14209, + "grad_norm": 0.9103258186594584, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 14209 + }, + { + "epoch": 0.1421, + "grad_norm": 0.9195259346016901, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 14210 + }, + { + "epoch": 0.14211, + "grad_norm": 0.8677025003910135, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 14211 + }, + { + "epoch": 0.14212, + "grad_norm": 0.7573509185278031, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 14212 + }, + { + "epoch": 0.14213, + "grad_norm": 0.7953171501146511, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 14213 + }, + { + "epoch": 0.14214, + "grad_norm": 0.8858347423608933, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14214 + }, + { + "epoch": 0.14215, + "grad_norm": 1.1031124908567087, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14215 + }, + { + "epoch": 0.14216, + "grad_norm": 1.131079615414483, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 14216 + }, + { + "epoch": 0.14217, + "grad_norm": 0.8741853018247859, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 14217 + }, + { + "epoch": 0.14218, + "grad_norm": 0.7734197898654148, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 14218 + }, + { + "epoch": 0.14219, + "grad_norm": 0.8727939958879936, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 14219 + }, + { + "epoch": 0.1422, + "grad_norm": 0.888259714477054, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 14220 + }, + { + "epoch": 0.14221, + "grad_norm": 0.8408869079351118, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 14221 + }, + { + "epoch": 0.14222, + "grad_norm": 0.9160714951657148, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 14222 + }, + { + "epoch": 0.14223, + "grad_norm": 0.8923006023358101, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 14223 + }, + { + "epoch": 0.14224, + "grad_norm": 0.7541129078942927, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 14224 + }, + { + "epoch": 0.14225, + "grad_norm": 0.7017399774791405, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 14225 + }, + { + "epoch": 0.14226, + "grad_norm": 0.7175844800580948, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 14226 + }, + { + "epoch": 0.14227, + "grad_norm": 0.8061323182082613, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 14227 + }, + { + "epoch": 0.14228, + "grad_norm": 0.8549019882180127, + "learning_rate": 0.003, + "loss": 4.058, + "step": 14228 + }, + { + "epoch": 0.14229, + "grad_norm": 0.7618872119689732, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 14229 + }, + { + "epoch": 0.1423, + "grad_norm": 0.748617160877437, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 14230 + }, + { + "epoch": 0.14231, + "grad_norm": 0.8537449120706244, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 14231 + }, + { + "epoch": 0.14232, + "grad_norm": 0.9538841898644234, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 14232 + }, + { + "epoch": 0.14233, + "grad_norm": 1.2323470714454452, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 14233 + }, + { + "epoch": 0.14234, + "grad_norm": 0.9252380311555533, + "learning_rate": 0.003, + "loss": 4.054, + "step": 14234 + }, + { + "epoch": 0.14235, + "grad_norm": 0.8357527733858238, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14235 + }, + { + "epoch": 0.14236, + "grad_norm": 0.7383299511457709, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 14236 + }, + { + "epoch": 0.14237, + "grad_norm": 0.7148127127044163, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 14237 + }, + { + "epoch": 0.14238, + "grad_norm": 0.6575998983357735, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 14238 + }, + { + "epoch": 0.14239, + "grad_norm": 0.7224088928415203, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 14239 + }, + { + "epoch": 0.1424, + "grad_norm": 0.7258406529257267, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 14240 + }, + { + "epoch": 0.14241, + "grad_norm": 0.6700010559315331, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 14241 + }, + { + "epoch": 0.14242, + "grad_norm": 0.7034472369150453, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 14242 + }, + { + "epoch": 0.14243, + "grad_norm": 0.9655966873187218, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 14243 + }, + { + "epoch": 0.14244, + "grad_norm": 1.3492244069241692, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 14244 + }, + { + "epoch": 0.14245, + "grad_norm": 0.787432760836881, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 14245 + }, + { + "epoch": 0.14246, + "grad_norm": 0.6589245508435077, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 14246 + }, + { + "epoch": 0.14247, + "grad_norm": 0.7176742670893751, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 14247 + }, + { + "epoch": 0.14248, + "grad_norm": 0.7004591529124673, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 14248 + }, + { + "epoch": 0.14249, + "grad_norm": 0.7872892677745946, + "learning_rate": 0.003, + "loss": 4.024, + "step": 14249 + }, + { + "epoch": 0.1425, + "grad_norm": 0.9122621574614087, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 14250 + }, + { + "epoch": 0.14251, + "grad_norm": 1.0861952597559, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14251 + }, + { + "epoch": 0.14252, + "grad_norm": 0.8040420734689884, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 14252 + }, + { + "epoch": 0.14253, + "grad_norm": 0.673228848158347, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 14253 + }, + { + "epoch": 0.14254, + "grad_norm": 0.6178610509082365, + "learning_rate": 0.003, + "loss": 4.054, + "step": 14254 + }, + { + "epoch": 0.14255, + "grad_norm": 0.7803330205829802, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 14255 + }, + { + "epoch": 0.14256, + "grad_norm": 0.8507086918819735, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 14256 + }, + { + "epoch": 0.14257, + "grad_norm": 0.837260019958009, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 14257 + }, + { + "epoch": 0.14258, + "grad_norm": 0.9369770791697932, + "learning_rate": 0.003, + "loss": 4.054, + "step": 14258 + }, + { + "epoch": 0.14259, + "grad_norm": 1.1719760621145092, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14259 + }, + { + "epoch": 0.1426, + "grad_norm": 1.0047106805433939, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 14260 + }, + { + "epoch": 0.14261, + "grad_norm": 1.0201426190948089, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14261 + }, + { + "epoch": 0.14262, + "grad_norm": 0.8794796375528614, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 14262 + }, + { + "epoch": 0.14263, + "grad_norm": 0.8599179547945105, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14263 + }, + { + "epoch": 0.14264, + "grad_norm": 0.853001400341203, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 14264 + }, + { + "epoch": 0.14265, + "grad_norm": 0.9796941134897704, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 14265 + }, + { + "epoch": 0.14266, + "grad_norm": 1.143809317255042, + "learning_rate": 0.003, + "loss": 4.098, + "step": 14266 + }, + { + "epoch": 0.14267, + "grad_norm": 1.0123655465496928, + "learning_rate": 0.003, + "loss": 4.096, + "step": 14267 + }, + { + "epoch": 0.14268, + "grad_norm": 1.156157055455667, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 14268 + }, + { + "epoch": 0.14269, + "grad_norm": 0.9417857169223783, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 14269 + }, + { + "epoch": 0.1427, + "grad_norm": 1.002625041902675, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 14270 + }, + { + "epoch": 0.14271, + "grad_norm": 1.1455257576736626, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 14271 + }, + { + "epoch": 0.14272, + "grad_norm": 0.9169999227937337, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 14272 + }, + { + "epoch": 0.14273, + "grad_norm": 1.0288608407310105, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 14273 + }, + { + "epoch": 0.14274, + "grad_norm": 0.9895127236305712, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 14274 + }, + { + "epoch": 0.14275, + "grad_norm": 1.024068444256658, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 14275 + }, + { + "epoch": 0.14276, + "grad_norm": 1.0358327615667164, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 14276 + }, + { + "epoch": 0.14277, + "grad_norm": 1.0995848603797826, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 14277 + }, + { + "epoch": 0.14278, + "grad_norm": 0.9710395555199974, + "learning_rate": 0.003, + "loss": 4.067, + "step": 14278 + }, + { + "epoch": 0.14279, + "grad_norm": 0.7935574068237154, + "learning_rate": 0.003, + "loss": 4.1085, + "step": 14279 + }, + { + "epoch": 0.1428, + "grad_norm": 0.7440767417035967, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 14280 + }, + { + "epoch": 0.14281, + "grad_norm": 0.7249925095996536, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14281 + }, + { + "epoch": 0.14282, + "grad_norm": 0.7738584216331064, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 14282 + }, + { + "epoch": 0.14283, + "grad_norm": 0.8075419416152382, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 14283 + }, + { + "epoch": 0.14284, + "grad_norm": 0.808656055655579, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 14284 + }, + { + "epoch": 0.14285, + "grad_norm": 0.7434415777020318, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 14285 + }, + { + "epoch": 0.14286, + "grad_norm": 0.782537850481025, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 14286 + }, + { + "epoch": 0.14287, + "grad_norm": 0.7398803590584546, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 14287 + }, + { + "epoch": 0.14288, + "grad_norm": 0.6498718731345486, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 14288 + }, + { + "epoch": 0.14289, + "grad_norm": 0.57167150621938, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 14289 + }, + { + "epoch": 0.1429, + "grad_norm": 0.629917071736773, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 14290 + }, + { + "epoch": 0.14291, + "grad_norm": 0.7282520539482328, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 14291 + }, + { + "epoch": 0.14292, + "grad_norm": 0.8448216364800304, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14292 + }, + { + "epoch": 0.14293, + "grad_norm": 0.9898291474286086, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 14293 + }, + { + "epoch": 0.14294, + "grad_norm": 1.384130185668138, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 14294 + }, + { + "epoch": 0.14295, + "grad_norm": 0.8132712159924654, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 14295 + }, + { + "epoch": 0.14296, + "grad_norm": 0.5878901615618249, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 14296 + }, + { + "epoch": 0.14297, + "grad_norm": 0.5235229402678793, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 14297 + }, + { + "epoch": 0.14298, + "grad_norm": 0.5940769162928273, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 14298 + }, + { + "epoch": 0.14299, + "grad_norm": 0.635954759733811, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 14299 + }, + { + "epoch": 0.143, + "grad_norm": 0.7604566779202656, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 14300 + }, + { + "epoch": 0.14301, + "grad_norm": 0.9621812171223754, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 14301 + }, + { + "epoch": 0.14302, + "grad_norm": 1.1332426046276227, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 14302 + }, + { + "epoch": 0.14303, + "grad_norm": 0.8826723697906451, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 14303 + }, + { + "epoch": 0.14304, + "grad_norm": 0.8146302717388536, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 14304 + }, + { + "epoch": 0.14305, + "grad_norm": 0.8862814720311086, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 14305 + }, + { + "epoch": 0.14306, + "grad_norm": 1.052781435999485, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14306 + }, + { + "epoch": 0.14307, + "grad_norm": 0.9466303101454425, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 14307 + }, + { + "epoch": 0.14308, + "grad_norm": 0.9569390726669585, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 14308 + }, + { + "epoch": 0.14309, + "grad_norm": 0.9767673438310612, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 14309 + }, + { + "epoch": 0.1431, + "grad_norm": 1.2068390135277705, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 14310 + }, + { + "epoch": 0.14311, + "grad_norm": 0.9118515998023901, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 14311 + }, + { + "epoch": 0.14312, + "grad_norm": 0.9234441036395075, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 14312 + }, + { + "epoch": 0.14313, + "grad_norm": 0.9379283578197763, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 14313 + }, + { + "epoch": 0.14314, + "grad_norm": 1.0481055206862089, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 14314 + }, + { + "epoch": 0.14315, + "grad_norm": 1.0170438017287065, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 14315 + }, + { + "epoch": 0.14316, + "grad_norm": 1.1314036767196796, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14316 + }, + { + "epoch": 0.14317, + "grad_norm": 0.9667708712613111, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 14317 + }, + { + "epoch": 0.14318, + "grad_norm": 1.0540721435901028, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 14318 + }, + { + "epoch": 0.14319, + "grad_norm": 0.8125763824735159, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 14319 + }, + { + "epoch": 0.1432, + "grad_norm": 0.6690389583956455, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 14320 + }, + { + "epoch": 0.14321, + "grad_norm": 0.715576186711564, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 14321 + }, + { + "epoch": 0.14322, + "grad_norm": 0.7034013270653181, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 14322 + }, + { + "epoch": 0.14323, + "grad_norm": 0.6650656285716945, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 14323 + }, + { + "epoch": 0.14324, + "grad_norm": 0.8507560154157894, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 14324 + }, + { + "epoch": 0.14325, + "grad_norm": 0.893132375990626, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 14325 + }, + { + "epoch": 0.14326, + "grad_norm": 0.8316403162519251, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 14326 + }, + { + "epoch": 0.14327, + "grad_norm": 0.8211797982823278, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 14327 + }, + { + "epoch": 0.14328, + "grad_norm": 0.854730121681877, + "learning_rate": 0.003, + "loss": 4.1114, + "step": 14328 + }, + { + "epoch": 0.14329, + "grad_norm": 0.8488314778432745, + "learning_rate": 0.003, + "loss": 4.061, + "step": 14329 + }, + { + "epoch": 0.1433, + "grad_norm": 0.8173694929630654, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 14330 + }, + { + "epoch": 0.14331, + "grad_norm": 0.8451422403651959, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 14331 + }, + { + "epoch": 0.14332, + "grad_norm": 0.953725488533161, + "learning_rate": 0.003, + "loss": 4.071, + "step": 14332 + }, + { + "epoch": 0.14333, + "grad_norm": 0.9935817251768986, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14333 + }, + { + "epoch": 0.14334, + "grad_norm": 1.0069330550136053, + "learning_rate": 0.003, + "loss": 4.077, + "step": 14334 + }, + { + "epoch": 0.14335, + "grad_norm": 1.0128321819662722, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 14335 + }, + { + "epoch": 0.14336, + "grad_norm": 0.9720243567033409, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 14336 + }, + { + "epoch": 0.14337, + "grad_norm": 1.034406502119741, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 14337 + }, + { + "epoch": 0.14338, + "grad_norm": 1.028489631883045, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 14338 + }, + { + "epoch": 0.14339, + "grad_norm": 1.0164498014126644, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 14339 + }, + { + "epoch": 0.1434, + "grad_norm": 0.9007186917592227, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 14340 + }, + { + "epoch": 0.14341, + "grad_norm": 0.9064557694182467, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 14341 + }, + { + "epoch": 0.14342, + "grad_norm": 0.9527853939982664, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14342 + }, + { + "epoch": 0.14343, + "grad_norm": 0.8255592350256148, + "learning_rate": 0.003, + "loss": 4.1136, + "step": 14343 + }, + { + "epoch": 0.14344, + "grad_norm": 0.6414147580958486, + "learning_rate": 0.003, + "loss": 4.1081, + "step": 14344 + }, + { + "epoch": 0.14345, + "grad_norm": 0.6873869318678593, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 14345 + }, + { + "epoch": 0.14346, + "grad_norm": 0.7330891013138953, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 14346 + }, + { + "epoch": 0.14347, + "grad_norm": 0.8101567143121826, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 14347 + }, + { + "epoch": 0.14348, + "grad_norm": 0.9702199997855077, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 14348 + }, + { + "epoch": 0.14349, + "grad_norm": 0.9904100138960906, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 14349 + }, + { + "epoch": 0.1435, + "grad_norm": 1.1019773919500955, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 14350 + }, + { + "epoch": 0.14351, + "grad_norm": 0.9978511333891973, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 14351 + }, + { + "epoch": 0.14352, + "grad_norm": 1.014835951520754, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 14352 + }, + { + "epoch": 0.14353, + "grad_norm": 0.9486519931027005, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 14353 + }, + { + "epoch": 0.14354, + "grad_norm": 0.8014608748984109, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 14354 + }, + { + "epoch": 0.14355, + "grad_norm": 0.6893169107280758, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 14355 + }, + { + "epoch": 0.14356, + "grad_norm": 0.7790746564890838, + "learning_rate": 0.003, + "loss": 4.068, + "step": 14356 + }, + { + "epoch": 0.14357, + "grad_norm": 0.8571150262837032, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 14357 + }, + { + "epoch": 0.14358, + "grad_norm": 0.9166295189774966, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 14358 + }, + { + "epoch": 0.14359, + "grad_norm": 0.9473566514431786, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 14359 + }, + { + "epoch": 0.1436, + "grad_norm": 0.9837585839604661, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 14360 + }, + { + "epoch": 0.14361, + "grad_norm": 1.0196945016539802, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 14361 + }, + { + "epoch": 0.14362, + "grad_norm": 1.052706176676344, + "learning_rate": 0.003, + "loss": 4.1167, + "step": 14362 + }, + { + "epoch": 0.14363, + "grad_norm": 1.0314646065600368, + "learning_rate": 0.003, + "loss": 4.07, + "step": 14363 + }, + { + "epoch": 0.14364, + "grad_norm": 0.8482256306999743, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 14364 + }, + { + "epoch": 0.14365, + "grad_norm": 0.8388761850218024, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 14365 + }, + { + "epoch": 0.14366, + "grad_norm": 0.7663169047495818, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 14366 + }, + { + "epoch": 0.14367, + "grad_norm": 0.6499443201499312, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 14367 + }, + { + "epoch": 0.14368, + "grad_norm": 0.7728910831473662, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 14368 + }, + { + "epoch": 0.14369, + "grad_norm": 0.9151885531778485, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 14369 + }, + { + "epoch": 0.1437, + "grad_norm": 1.012549276988548, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 14370 + }, + { + "epoch": 0.14371, + "grad_norm": 1.0316331881002632, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14371 + }, + { + "epoch": 0.14372, + "grad_norm": 0.9309894338228544, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 14372 + }, + { + "epoch": 0.14373, + "grad_norm": 0.8719919389914711, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 14373 + }, + { + "epoch": 0.14374, + "grad_norm": 0.8183268003383068, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14374 + }, + { + "epoch": 0.14375, + "grad_norm": 0.822230156604052, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 14375 + }, + { + "epoch": 0.14376, + "grad_norm": 0.8210409550089975, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 14376 + }, + { + "epoch": 0.14377, + "grad_norm": 0.7918484286405145, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 14377 + }, + { + "epoch": 0.14378, + "grad_norm": 0.6871245341077445, + "learning_rate": 0.003, + "loss": 4.059, + "step": 14378 + }, + { + "epoch": 0.14379, + "grad_norm": 0.61337738208206, + "learning_rate": 0.003, + "loss": 4.062, + "step": 14379 + }, + { + "epoch": 0.1438, + "grad_norm": 0.7949733746083621, + "learning_rate": 0.003, + "loss": 4.061, + "step": 14380 + }, + { + "epoch": 0.14381, + "grad_norm": 0.7966095482585109, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 14381 + }, + { + "epoch": 0.14382, + "grad_norm": 0.742684410456428, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 14382 + }, + { + "epoch": 0.14383, + "grad_norm": 0.6808568373631283, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 14383 + }, + { + "epoch": 0.14384, + "grad_norm": 0.6144707846148258, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 14384 + }, + { + "epoch": 0.14385, + "grad_norm": 0.6397262884987464, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 14385 + }, + { + "epoch": 0.14386, + "grad_norm": 0.9079018023080869, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 14386 + }, + { + "epoch": 0.14387, + "grad_norm": 1.276655150279495, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 14387 + }, + { + "epoch": 0.14388, + "grad_norm": 0.8802185657108448, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14388 + }, + { + "epoch": 0.14389, + "grad_norm": 0.7748540022902647, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 14389 + }, + { + "epoch": 0.1439, + "grad_norm": 0.7910642923199699, + "learning_rate": 0.003, + "loss": 4.066, + "step": 14390 + }, + { + "epoch": 0.14391, + "grad_norm": 0.7814914376937365, + "learning_rate": 0.003, + "loss": 4.054, + "step": 14391 + }, + { + "epoch": 0.14392, + "grad_norm": 0.711033803024645, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 14392 + }, + { + "epoch": 0.14393, + "grad_norm": 0.6689814772603192, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 14393 + }, + { + "epoch": 0.14394, + "grad_norm": 0.7743410260570905, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 14394 + }, + { + "epoch": 0.14395, + "grad_norm": 0.8267055962453908, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 14395 + }, + { + "epoch": 0.14396, + "grad_norm": 0.8280054805253843, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 14396 + }, + { + "epoch": 0.14397, + "grad_norm": 0.8410900205580703, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 14397 + }, + { + "epoch": 0.14398, + "grad_norm": 0.74892299183487, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 14398 + }, + { + "epoch": 0.14399, + "grad_norm": 0.7541148933857669, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 14399 + }, + { + "epoch": 0.144, + "grad_norm": 0.7203519769604699, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 14400 + }, + { + "epoch": 0.14401, + "grad_norm": 0.7863202506879009, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 14401 + }, + { + "epoch": 0.14402, + "grad_norm": 0.8304267299415876, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 14402 + }, + { + "epoch": 0.14403, + "grad_norm": 1.0910369478178739, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 14403 + }, + { + "epoch": 0.14404, + "grad_norm": 1.1771650605747097, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 14404 + }, + { + "epoch": 0.14405, + "grad_norm": 0.892076820447087, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 14405 + }, + { + "epoch": 0.14406, + "grad_norm": 0.8772684958664766, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 14406 + }, + { + "epoch": 0.14407, + "grad_norm": 0.9973513358644183, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 14407 + }, + { + "epoch": 0.14408, + "grad_norm": 1.1490641624873017, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 14408 + }, + { + "epoch": 0.14409, + "grad_norm": 0.815542294684307, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 14409 + }, + { + "epoch": 0.1441, + "grad_norm": 0.8081892759281243, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 14410 + }, + { + "epoch": 0.14411, + "grad_norm": 0.9192651921903972, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 14411 + }, + { + "epoch": 0.14412, + "grad_norm": 0.8645285669631384, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 14412 + }, + { + "epoch": 0.14413, + "grad_norm": 0.8281700030855166, + "learning_rate": 0.003, + "loss": 4.084, + "step": 14413 + }, + { + "epoch": 0.14414, + "grad_norm": 1.0239009006094857, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 14414 + }, + { + "epoch": 0.14415, + "grad_norm": 1.0868552660199027, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 14415 + }, + { + "epoch": 0.14416, + "grad_norm": 1.0161286840519521, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 14416 + }, + { + "epoch": 0.14417, + "grad_norm": 1.174833107721702, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 14417 + }, + { + "epoch": 0.14418, + "grad_norm": 1.091906753190101, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 14418 + }, + { + "epoch": 0.14419, + "grad_norm": 0.9390134669254883, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 14419 + }, + { + "epoch": 0.1442, + "grad_norm": 0.905316492652339, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 14420 + }, + { + "epoch": 0.14421, + "grad_norm": 0.8973630440941875, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 14421 + }, + { + "epoch": 0.14422, + "grad_norm": 0.8593800792454128, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 14422 + }, + { + "epoch": 0.14423, + "grad_norm": 0.9097836307265622, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14423 + }, + { + "epoch": 0.14424, + "grad_norm": 0.8769136095747978, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 14424 + }, + { + "epoch": 0.14425, + "grad_norm": 0.8781004531945429, + "learning_rate": 0.003, + "loss": 4.052, + "step": 14425 + }, + { + "epoch": 0.14426, + "grad_norm": 0.8523510008630767, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 14426 + }, + { + "epoch": 0.14427, + "grad_norm": 0.7697966118866878, + "learning_rate": 0.003, + "loss": 4.08, + "step": 14427 + }, + { + "epoch": 0.14428, + "grad_norm": 0.6659967765645805, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 14428 + }, + { + "epoch": 0.14429, + "grad_norm": 0.7283987451534251, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 14429 + }, + { + "epoch": 0.1443, + "grad_norm": 0.6524668161944647, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 14430 + }, + { + "epoch": 0.14431, + "grad_norm": 0.6577762341755574, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 14431 + }, + { + "epoch": 0.14432, + "grad_norm": 0.7670158304520253, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 14432 + }, + { + "epoch": 0.14433, + "grad_norm": 0.8497958660369926, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 14433 + }, + { + "epoch": 0.14434, + "grad_norm": 1.0063921227645782, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 14434 + }, + { + "epoch": 0.14435, + "grad_norm": 1.232546830404364, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 14435 + }, + { + "epoch": 0.14436, + "grad_norm": 0.7803306088988813, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 14436 + }, + { + "epoch": 0.14437, + "grad_norm": 0.6385659966701883, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 14437 + }, + { + "epoch": 0.14438, + "grad_norm": 0.7387603247427053, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 14438 + }, + { + "epoch": 0.14439, + "grad_norm": 0.8775053705168816, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 14439 + }, + { + "epoch": 0.1444, + "grad_norm": 0.9734258283263654, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 14440 + }, + { + "epoch": 0.14441, + "grad_norm": 0.9705028390071423, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 14441 + }, + { + "epoch": 0.14442, + "grad_norm": 0.9960062972690155, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14442 + }, + { + "epoch": 0.14443, + "grad_norm": 0.8818896239814513, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 14443 + }, + { + "epoch": 0.14444, + "grad_norm": 0.7289286765268309, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 14444 + }, + { + "epoch": 0.14445, + "grad_norm": 0.7740432225462396, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 14445 + }, + { + "epoch": 0.14446, + "grad_norm": 0.9291240503602122, + "learning_rate": 0.003, + "loss": 4.057, + "step": 14446 + }, + { + "epoch": 0.14447, + "grad_norm": 1.1489522686322768, + "learning_rate": 0.003, + "loss": 4.1039, + "step": 14447 + }, + { + "epoch": 0.14448, + "grad_norm": 0.9043742838406463, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 14448 + }, + { + "epoch": 0.14449, + "grad_norm": 0.8886525100096326, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 14449 + }, + { + "epoch": 0.1445, + "grad_norm": 0.9442135755576159, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 14450 + }, + { + "epoch": 0.14451, + "grad_norm": 0.88853199038061, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 14451 + }, + { + "epoch": 0.14452, + "grad_norm": 0.9187092218973602, + "learning_rate": 0.003, + "loss": 4.066, + "step": 14452 + }, + { + "epoch": 0.14453, + "grad_norm": 0.8451833134059921, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 14453 + }, + { + "epoch": 0.14454, + "grad_norm": 0.8513118210882025, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14454 + }, + { + "epoch": 0.14455, + "grad_norm": 0.7722749130937169, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 14455 + }, + { + "epoch": 0.14456, + "grad_norm": 0.7169890500245386, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 14456 + }, + { + "epoch": 0.14457, + "grad_norm": 0.6697986292702363, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 14457 + }, + { + "epoch": 0.14458, + "grad_norm": 0.7755031570144965, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 14458 + }, + { + "epoch": 0.14459, + "grad_norm": 1.0605824232348093, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 14459 + }, + { + "epoch": 0.1446, + "grad_norm": 1.2293405457221496, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 14460 + }, + { + "epoch": 0.14461, + "grad_norm": 0.7976780762548757, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 14461 + }, + { + "epoch": 0.14462, + "grad_norm": 0.7849320724391274, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14462 + }, + { + "epoch": 0.14463, + "grad_norm": 0.8266960310682302, + "learning_rate": 0.003, + "loss": 4.083, + "step": 14463 + }, + { + "epoch": 0.14464, + "grad_norm": 0.7460753583256715, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 14464 + }, + { + "epoch": 0.14465, + "grad_norm": 0.7821210362976887, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 14465 + }, + { + "epoch": 0.14466, + "grad_norm": 0.8691872038026347, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 14466 + }, + { + "epoch": 0.14467, + "grad_norm": 0.8688669781328593, + "learning_rate": 0.003, + "loss": 4.1116, + "step": 14467 + }, + { + "epoch": 0.14468, + "grad_norm": 1.0157383621401073, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 14468 + }, + { + "epoch": 0.14469, + "grad_norm": 1.0488500564055532, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 14469 + }, + { + "epoch": 0.1447, + "grad_norm": 0.8453306124392075, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 14470 + }, + { + "epoch": 0.14471, + "grad_norm": 0.7496859708795902, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 14471 + }, + { + "epoch": 0.14472, + "grad_norm": 0.7702865369170115, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 14472 + }, + { + "epoch": 0.14473, + "grad_norm": 0.7927025686268351, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 14473 + }, + { + "epoch": 0.14474, + "grad_norm": 0.7028001411167409, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 14474 + }, + { + "epoch": 0.14475, + "grad_norm": 0.6366764775222469, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 14475 + }, + { + "epoch": 0.14476, + "grad_norm": 0.7266403812115433, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 14476 + }, + { + "epoch": 0.14477, + "grad_norm": 0.8285669818610948, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 14477 + }, + { + "epoch": 0.14478, + "grad_norm": 1.0689441553197612, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 14478 + }, + { + "epoch": 0.14479, + "grad_norm": 1.2619968094331953, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 14479 + }, + { + "epoch": 0.1448, + "grad_norm": 0.681802879160255, + "learning_rate": 0.003, + "loss": 4.059, + "step": 14480 + }, + { + "epoch": 0.14481, + "grad_norm": 0.7895871815518442, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 14481 + }, + { + "epoch": 0.14482, + "grad_norm": 0.7162699951289682, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 14482 + }, + { + "epoch": 0.14483, + "grad_norm": 0.7559369806480447, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 14483 + }, + { + "epoch": 0.14484, + "grad_norm": 0.6854003499725579, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 14484 + }, + { + "epoch": 0.14485, + "grad_norm": 0.7261168346272312, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 14485 + }, + { + "epoch": 0.14486, + "grad_norm": 0.7581916674639567, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 14486 + }, + { + "epoch": 0.14487, + "grad_norm": 0.8458017056734358, + "learning_rate": 0.003, + "loss": 4.058, + "step": 14487 + }, + { + "epoch": 0.14488, + "grad_norm": 0.9407237060809456, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 14488 + }, + { + "epoch": 0.14489, + "grad_norm": 1.2838500110985553, + "learning_rate": 0.003, + "loss": 4.068, + "step": 14489 + }, + { + "epoch": 0.1449, + "grad_norm": 0.8253643573774834, + "learning_rate": 0.003, + "loss": 4.053, + "step": 14490 + }, + { + "epoch": 0.14491, + "grad_norm": 0.6833212631082725, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 14491 + }, + { + "epoch": 0.14492, + "grad_norm": 0.5907013243321121, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 14492 + }, + { + "epoch": 0.14493, + "grad_norm": 0.5477081995181327, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 14493 + }, + { + "epoch": 0.14494, + "grad_norm": 0.6138069260250624, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 14494 + }, + { + "epoch": 0.14495, + "grad_norm": 0.6810644475113934, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 14495 + }, + { + "epoch": 0.14496, + "grad_norm": 0.846006442349366, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 14496 + }, + { + "epoch": 0.14497, + "grad_norm": 0.9759374354820222, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 14497 + }, + { + "epoch": 0.14498, + "grad_norm": 1.1631680204402455, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 14498 + }, + { + "epoch": 0.14499, + "grad_norm": 0.8528800368239491, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 14499 + }, + { + "epoch": 0.145, + "grad_norm": 0.8007628472017968, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 14500 + }, + { + "epoch": 0.14501, + "grad_norm": 0.8621911835551398, + "learning_rate": 0.003, + "loss": 4.079, + "step": 14501 + }, + { + "epoch": 0.14502, + "grad_norm": 0.8392413556988816, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 14502 + }, + { + "epoch": 0.14503, + "grad_norm": 0.9062842404881979, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 14503 + }, + { + "epoch": 0.14504, + "grad_norm": 1.0475072658786253, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 14504 + }, + { + "epoch": 0.14505, + "grad_norm": 1.0771198598753233, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 14505 + }, + { + "epoch": 0.14506, + "grad_norm": 0.8522010187012606, + "learning_rate": 0.003, + "loss": 4.056, + "step": 14506 + }, + { + "epoch": 0.14507, + "grad_norm": 0.9266244073098737, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 14507 + }, + { + "epoch": 0.14508, + "grad_norm": 1.0972487087219982, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 14508 + }, + { + "epoch": 0.14509, + "grad_norm": 1.1536680843828024, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 14509 + }, + { + "epoch": 0.1451, + "grad_norm": 1.0259378881237784, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 14510 + }, + { + "epoch": 0.14511, + "grad_norm": 1.0730596033394761, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 14511 + }, + { + "epoch": 0.14512, + "grad_norm": 0.9411542390444215, + "learning_rate": 0.003, + "loss": 4.1187, + "step": 14512 + }, + { + "epoch": 0.14513, + "grad_norm": 0.8994694044524201, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 14513 + }, + { + "epoch": 0.14514, + "grad_norm": 0.8067485770896665, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 14514 + }, + { + "epoch": 0.14515, + "grad_norm": 1.01746158851945, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 14515 + }, + { + "epoch": 0.14516, + "grad_norm": 1.2153894499015556, + "learning_rate": 0.003, + "loss": 4.11, + "step": 14516 + }, + { + "epoch": 0.14517, + "grad_norm": 0.9040870044826432, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 14517 + }, + { + "epoch": 0.14518, + "grad_norm": 0.9696100920898809, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 14518 + }, + { + "epoch": 0.14519, + "grad_norm": 1.0582262440718013, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 14519 + }, + { + "epoch": 0.1452, + "grad_norm": 0.9543756723548299, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 14520 + }, + { + "epoch": 0.14521, + "grad_norm": 0.899627836617662, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 14521 + }, + { + "epoch": 0.14522, + "grad_norm": 0.7550291121578049, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 14522 + }, + { + "epoch": 0.14523, + "grad_norm": 0.7813326825471762, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 14523 + }, + { + "epoch": 0.14524, + "grad_norm": 0.694317802060896, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 14524 + }, + { + "epoch": 0.14525, + "grad_norm": 0.6915796088984916, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 14525 + }, + { + "epoch": 0.14526, + "grad_norm": 0.7253829223209967, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 14526 + }, + { + "epoch": 0.14527, + "grad_norm": 0.8022210925623505, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 14527 + }, + { + "epoch": 0.14528, + "grad_norm": 0.9031507779943743, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14528 + }, + { + "epoch": 0.14529, + "grad_norm": 0.9972110549426437, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 14529 + }, + { + "epoch": 0.1453, + "grad_norm": 1.0832360084774422, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 14530 + }, + { + "epoch": 0.14531, + "grad_norm": 1.105797035647914, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 14531 + }, + { + "epoch": 0.14532, + "grad_norm": 0.786615455076114, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 14532 + }, + { + "epoch": 0.14533, + "grad_norm": 0.7082786807676374, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 14533 + }, + { + "epoch": 0.14534, + "grad_norm": 0.8112960746828972, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 14534 + }, + { + "epoch": 0.14535, + "grad_norm": 0.9063381907497825, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 14535 + }, + { + "epoch": 0.14536, + "grad_norm": 0.8144238186101873, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 14536 + }, + { + "epoch": 0.14537, + "grad_norm": 0.7247677066233076, + "learning_rate": 0.003, + "loss": 4.05, + "step": 14537 + }, + { + "epoch": 0.14538, + "grad_norm": 0.703189835688407, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 14538 + }, + { + "epoch": 0.14539, + "grad_norm": 0.7847737719060214, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 14539 + }, + { + "epoch": 0.1454, + "grad_norm": 0.8689603553966835, + "learning_rate": 0.003, + "loss": 4.073, + "step": 14540 + }, + { + "epoch": 0.14541, + "grad_norm": 1.078177458358583, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 14541 + }, + { + "epoch": 0.14542, + "grad_norm": 1.0968398742626888, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 14542 + }, + { + "epoch": 0.14543, + "grad_norm": 1.057736346844444, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 14543 + }, + { + "epoch": 0.14544, + "grad_norm": 1.0096394014447299, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 14544 + }, + { + "epoch": 0.14545, + "grad_norm": 0.9595567075326513, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 14545 + }, + { + "epoch": 0.14546, + "grad_norm": 1.0237956273787805, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 14546 + }, + { + "epoch": 0.14547, + "grad_norm": 0.9296569813294112, + "learning_rate": 0.003, + "loss": 4.0974, + "step": 14547 + }, + { + "epoch": 0.14548, + "grad_norm": 0.8580821455270444, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14548 + }, + { + "epoch": 0.14549, + "grad_norm": 0.872113276229092, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 14549 + }, + { + "epoch": 0.1455, + "grad_norm": 0.977749134256614, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 14550 + }, + { + "epoch": 0.14551, + "grad_norm": 0.9399304829514836, + "learning_rate": 0.003, + "loss": 4.085, + "step": 14551 + }, + { + "epoch": 0.14552, + "grad_norm": 0.8663647607782625, + "learning_rate": 0.003, + "loss": 4.1033, + "step": 14552 + }, + { + "epoch": 0.14553, + "grad_norm": 0.9136647582260118, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 14553 + }, + { + "epoch": 0.14554, + "grad_norm": 0.7399310086217266, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14554 + }, + { + "epoch": 0.14555, + "grad_norm": 0.6743750311640493, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14555 + }, + { + "epoch": 0.14556, + "grad_norm": 0.7830085187202371, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 14556 + }, + { + "epoch": 0.14557, + "grad_norm": 0.8716227519681807, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 14557 + }, + { + "epoch": 0.14558, + "grad_norm": 1.0059827438311395, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 14558 + }, + { + "epoch": 0.14559, + "grad_norm": 1.3009668224997837, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 14559 + }, + { + "epoch": 0.1456, + "grad_norm": 0.776695489158605, + "learning_rate": 0.003, + "loss": 4.056, + "step": 14560 + }, + { + "epoch": 0.14561, + "grad_norm": 0.6180285588487668, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 14561 + }, + { + "epoch": 0.14562, + "grad_norm": 0.7558658696472682, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14562 + }, + { + "epoch": 0.14563, + "grad_norm": 0.8641124822104197, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 14563 + }, + { + "epoch": 0.14564, + "grad_norm": 0.9708162293916572, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 14564 + }, + { + "epoch": 0.14565, + "grad_norm": 0.9456146169123835, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 14565 + }, + { + "epoch": 0.14566, + "grad_norm": 0.8643872779025452, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 14566 + }, + { + "epoch": 0.14567, + "grad_norm": 0.8171882167001808, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14567 + }, + { + "epoch": 0.14568, + "grad_norm": 0.8279861681618138, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 14568 + }, + { + "epoch": 0.14569, + "grad_norm": 0.6954121162267392, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 14569 + }, + { + "epoch": 0.1457, + "grad_norm": 0.6350707905293368, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14570 + }, + { + "epoch": 0.14571, + "grad_norm": 0.6791557320846004, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 14571 + }, + { + "epoch": 0.14572, + "grad_norm": 0.8016430771828299, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 14572 + }, + { + "epoch": 0.14573, + "grad_norm": 1.0264268295157744, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 14573 + }, + { + "epoch": 0.14574, + "grad_norm": 1.1955488793948061, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14574 + }, + { + "epoch": 0.14575, + "grad_norm": 0.7119546441286688, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 14575 + }, + { + "epoch": 0.14576, + "grad_norm": 0.714543676774026, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 14576 + }, + { + "epoch": 0.14577, + "grad_norm": 0.6603251991861709, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 14577 + }, + { + "epoch": 0.14578, + "grad_norm": 0.7986259057033251, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 14578 + }, + { + "epoch": 0.14579, + "grad_norm": 0.9782640365302716, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 14579 + }, + { + "epoch": 0.1458, + "grad_norm": 1.1327755080474855, + "learning_rate": 0.003, + "loss": 4.079, + "step": 14580 + }, + { + "epoch": 0.14581, + "grad_norm": 0.9360040545332118, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 14581 + }, + { + "epoch": 0.14582, + "grad_norm": 0.8794183719156582, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 14582 + }, + { + "epoch": 0.14583, + "grad_norm": 0.8116502083563901, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 14583 + }, + { + "epoch": 0.14584, + "grad_norm": 0.7997145982234645, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14584 + }, + { + "epoch": 0.14585, + "grad_norm": 0.8783829350835252, + "learning_rate": 0.003, + "loss": 4.083, + "step": 14585 + }, + { + "epoch": 0.14586, + "grad_norm": 0.868254154872957, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 14586 + }, + { + "epoch": 0.14587, + "grad_norm": 0.8102510840724461, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14587 + }, + { + "epoch": 0.14588, + "grad_norm": 0.76591626897998, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 14588 + }, + { + "epoch": 0.14589, + "grad_norm": 0.8506288982616398, + "learning_rate": 0.003, + "loss": 4.032, + "step": 14589 + }, + { + "epoch": 0.1459, + "grad_norm": 0.839280358810672, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14590 + }, + { + "epoch": 0.14591, + "grad_norm": 0.8624119454127883, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 14591 + }, + { + "epoch": 0.14592, + "grad_norm": 0.8135901322358887, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 14592 + }, + { + "epoch": 0.14593, + "grad_norm": 0.8970900221202972, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 14593 + }, + { + "epoch": 0.14594, + "grad_norm": 1.073051140183195, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 14594 + }, + { + "epoch": 0.14595, + "grad_norm": 1.143921867376063, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 14595 + }, + { + "epoch": 0.14596, + "grad_norm": 0.9751246681564357, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 14596 + }, + { + "epoch": 0.14597, + "grad_norm": 0.9558439609153476, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 14597 + }, + { + "epoch": 0.14598, + "grad_norm": 0.8489551978752373, + "learning_rate": 0.003, + "loss": 4.1205, + "step": 14598 + }, + { + "epoch": 0.14599, + "grad_norm": 0.7378453945629538, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 14599 + }, + { + "epoch": 0.146, + "grad_norm": 0.7620338281678484, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 14600 + }, + { + "epoch": 0.14601, + "grad_norm": 0.7872528072739438, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 14601 + }, + { + "epoch": 0.14602, + "grad_norm": 0.8283077690042281, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 14602 + }, + { + "epoch": 0.14603, + "grad_norm": 0.8875270533641183, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 14603 + }, + { + "epoch": 0.14604, + "grad_norm": 0.9602478703917192, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14604 + }, + { + "epoch": 0.14605, + "grad_norm": 0.9356670106534075, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 14605 + }, + { + "epoch": 0.14606, + "grad_norm": 0.9591106551999335, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 14606 + }, + { + "epoch": 0.14607, + "grad_norm": 1.0597266076352092, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 14607 + }, + { + "epoch": 0.14608, + "grad_norm": 1.0148851676702153, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 14608 + }, + { + "epoch": 0.14609, + "grad_norm": 0.9344274923235639, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 14609 + }, + { + "epoch": 0.1461, + "grad_norm": 0.7351046879113694, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 14610 + }, + { + "epoch": 0.14611, + "grad_norm": 0.6541584505103268, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 14611 + }, + { + "epoch": 0.14612, + "grad_norm": 0.7163037137339126, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 14612 + }, + { + "epoch": 0.14613, + "grad_norm": 0.8797493890159305, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 14613 + }, + { + "epoch": 0.14614, + "grad_norm": 1.1123619922511716, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 14614 + }, + { + "epoch": 0.14615, + "grad_norm": 0.931610247635008, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 14615 + }, + { + "epoch": 0.14616, + "grad_norm": 0.8743518829454591, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 14616 + }, + { + "epoch": 0.14617, + "grad_norm": 0.9004877603438717, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 14617 + }, + { + "epoch": 0.14618, + "grad_norm": 0.968993029689825, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 14618 + }, + { + "epoch": 0.14619, + "grad_norm": 1.0270859579787186, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 14619 + }, + { + "epoch": 0.1462, + "grad_norm": 1.045345147298914, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 14620 + }, + { + "epoch": 0.14621, + "grad_norm": 0.8978367997055376, + "learning_rate": 0.003, + "loss": 4.1061, + "step": 14621 + }, + { + "epoch": 0.14622, + "grad_norm": 0.8714211223153865, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 14622 + }, + { + "epoch": 0.14623, + "grad_norm": 0.853544821928365, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 14623 + }, + { + "epoch": 0.14624, + "grad_norm": 0.9256017866669742, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 14624 + }, + { + "epoch": 0.14625, + "grad_norm": 1.1011178026911914, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 14625 + }, + { + "epoch": 0.14626, + "grad_norm": 0.9988850699587214, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 14626 + }, + { + "epoch": 0.14627, + "grad_norm": 0.849954492700033, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 14627 + }, + { + "epoch": 0.14628, + "grad_norm": 0.9278147679571234, + "learning_rate": 0.003, + "loss": 4.095, + "step": 14628 + }, + { + "epoch": 0.14629, + "grad_norm": 1.1390529378052798, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 14629 + }, + { + "epoch": 0.1463, + "grad_norm": 0.94019973826379, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 14630 + }, + { + "epoch": 0.14631, + "grad_norm": 0.9850923102058784, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 14631 + }, + { + "epoch": 0.14632, + "grad_norm": 0.8180294607237206, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 14632 + }, + { + "epoch": 0.14633, + "grad_norm": 0.7238626279935578, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 14633 + }, + { + "epoch": 0.14634, + "grad_norm": 0.7124310769930774, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 14634 + }, + { + "epoch": 0.14635, + "grad_norm": 0.7720391825748518, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 14635 + }, + { + "epoch": 0.14636, + "grad_norm": 0.7720467057553294, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 14636 + }, + { + "epoch": 0.14637, + "grad_norm": 0.7765657830166095, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 14637 + }, + { + "epoch": 0.14638, + "grad_norm": 0.752024170890064, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 14638 + }, + { + "epoch": 0.14639, + "grad_norm": 0.7137448304298398, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 14639 + }, + { + "epoch": 0.1464, + "grad_norm": 0.9394829416126098, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 14640 + }, + { + "epoch": 0.14641, + "grad_norm": 1.2614752983356443, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 14641 + }, + { + "epoch": 0.14642, + "grad_norm": 0.8767862521104057, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 14642 + }, + { + "epoch": 0.14643, + "grad_norm": 0.7207122124663736, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 14643 + }, + { + "epoch": 0.14644, + "grad_norm": 0.6281009563715171, + "learning_rate": 0.003, + "loss": 4.029, + "step": 14644 + }, + { + "epoch": 0.14645, + "grad_norm": 0.5840767945666202, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 14645 + }, + { + "epoch": 0.14646, + "grad_norm": 0.5869076032495534, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 14646 + }, + { + "epoch": 0.14647, + "grad_norm": 0.5245143418956627, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 14647 + }, + { + "epoch": 0.14648, + "grad_norm": 0.5789115806082246, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 14648 + }, + { + "epoch": 0.14649, + "grad_norm": 0.6069099859606367, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 14649 + }, + { + "epoch": 0.1465, + "grad_norm": 0.59759498127923, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14650 + }, + { + "epoch": 0.14651, + "grad_norm": 0.5684717975111642, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 14651 + }, + { + "epoch": 0.14652, + "grad_norm": 0.5627215808491731, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 14652 + }, + { + "epoch": 0.14653, + "grad_norm": 0.6655467461551765, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 14653 + }, + { + "epoch": 0.14654, + "grad_norm": 0.7313967000135548, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 14654 + }, + { + "epoch": 0.14655, + "grad_norm": 0.7716177143225461, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 14655 + }, + { + "epoch": 0.14656, + "grad_norm": 0.7932886713813072, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 14656 + }, + { + "epoch": 0.14657, + "grad_norm": 1.048857394901038, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 14657 + }, + { + "epoch": 0.14658, + "grad_norm": 1.3762766534991493, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 14658 + }, + { + "epoch": 0.14659, + "grad_norm": 0.7287529165877537, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 14659 + }, + { + "epoch": 0.1466, + "grad_norm": 0.7813755703515315, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 14660 + }, + { + "epoch": 0.14661, + "grad_norm": 0.8299047236815597, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14661 + }, + { + "epoch": 0.14662, + "grad_norm": 0.9433792544809567, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 14662 + }, + { + "epoch": 0.14663, + "grad_norm": 1.102204855802617, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 14663 + }, + { + "epoch": 0.14664, + "grad_norm": 1.036955760055929, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 14664 + }, + { + "epoch": 0.14665, + "grad_norm": 1.095144185653256, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 14665 + }, + { + "epoch": 0.14666, + "grad_norm": 0.9939603749475371, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 14666 + }, + { + "epoch": 0.14667, + "grad_norm": 1.0756469695185484, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 14667 + }, + { + "epoch": 0.14668, + "grad_norm": 1.001666873680944, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 14668 + }, + { + "epoch": 0.14669, + "grad_norm": 0.9468326311851436, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 14669 + }, + { + "epoch": 0.1467, + "grad_norm": 0.8864556572703375, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 14670 + }, + { + "epoch": 0.14671, + "grad_norm": 0.9495657902929036, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 14671 + }, + { + "epoch": 0.14672, + "grad_norm": 0.9735126737119133, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 14672 + }, + { + "epoch": 0.14673, + "grad_norm": 0.9317013267782335, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 14673 + }, + { + "epoch": 0.14674, + "grad_norm": 0.8267662344630435, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 14674 + }, + { + "epoch": 0.14675, + "grad_norm": 0.8465527100738274, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 14675 + }, + { + "epoch": 0.14676, + "grad_norm": 0.7975477618697624, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 14676 + }, + { + "epoch": 0.14677, + "grad_norm": 0.746888833612203, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 14677 + }, + { + "epoch": 0.14678, + "grad_norm": 0.9099746904431035, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 14678 + }, + { + "epoch": 0.14679, + "grad_norm": 0.9924469453796482, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 14679 + }, + { + "epoch": 0.1468, + "grad_norm": 1.0255541725347632, + "learning_rate": 0.003, + "loss": 4.1091, + "step": 14680 + }, + { + "epoch": 0.14681, + "grad_norm": 0.9293619106149132, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 14681 + }, + { + "epoch": 0.14682, + "grad_norm": 0.8593043718586173, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14682 + }, + { + "epoch": 0.14683, + "grad_norm": 0.9023966416140686, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14683 + }, + { + "epoch": 0.14684, + "grad_norm": 0.9075070152522987, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 14684 + }, + { + "epoch": 0.14685, + "grad_norm": 0.990139660391972, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 14685 + }, + { + "epoch": 0.14686, + "grad_norm": 1.153361119676969, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 14686 + }, + { + "epoch": 0.14687, + "grad_norm": 1.0618144659524875, + "learning_rate": 0.003, + "loss": 4.082, + "step": 14687 + }, + { + "epoch": 0.14688, + "grad_norm": 1.1146777987509127, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 14688 + }, + { + "epoch": 0.14689, + "grad_norm": 0.809168480469712, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 14689 + }, + { + "epoch": 0.1469, + "grad_norm": 0.8571241803552251, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 14690 + }, + { + "epoch": 0.14691, + "grad_norm": 0.9309826043601989, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 14691 + }, + { + "epoch": 0.14692, + "grad_norm": 0.839841465329743, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 14692 + }, + { + "epoch": 0.14693, + "grad_norm": 0.696634463907577, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14693 + }, + { + "epoch": 0.14694, + "grad_norm": 0.6860810439075222, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 14694 + }, + { + "epoch": 0.14695, + "grad_norm": 0.7375096967257634, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 14695 + }, + { + "epoch": 0.14696, + "grad_norm": 0.833826589352164, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 14696 + }, + { + "epoch": 0.14697, + "grad_norm": 0.985952008144028, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 14697 + }, + { + "epoch": 0.14698, + "grad_norm": 1.101613432328946, + "learning_rate": 0.003, + "loss": 4.057, + "step": 14698 + }, + { + "epoch": 0.14699, + "grad_norm": 0.8630794099808715, + "learning_rate": 0.003, + "loss": 4.034, + "step": 14699 + }, + { + "epoch": 0.147, + "grad_norm": 0.828157332753674, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 14700 + }, + { + "epoch": 0.14701, + "grad_norm": 0.7381105295462711, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 14701 + }, + { + "epoch": 0.14702, + "grad_norm": 0.7181426099842978, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 14702 + }, + { + "epoch": 0.14703, + "grad_norm": 0.7714457014561105, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 14703 + }, + { + "epoch": 0.14704, + "grad_norm": 0.8031825957815455, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 14704 + }, + { + "epoch": 0.14705, + "grad_norm": 0.8271898204374427, + "learning_rate": 0.003, + "loss": 4.062, + "step": 14705 + }, + { + "epoch": 0.14706, + "grad_norm": 0.7859278980344379, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 14706 + }, + { + "epoch": 0.14707, + "grad_norm": 0.9239315061288464, + "learning_rate": 0.003, + "loss": 4.079, + "step": 14707 + }, + { + "epoch": 0.14708, + "grad_norm": 1.036256286106465, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 14708 + }, + { + "epoch": 0.14709, + "grad_norm": 0.9262385110211995, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 14709 + }, + { + "epoch": 0.1471, + "grad_norm": 0.934436100353201, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 14710 + }, + { + "epoch": 0.14711, + "grad_norm": 1.0279129756159637, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 14711 + }, + { + "epoch": 0.14712, + "grad_norm": 1.0577358558193888, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 14712 + }, + { + "epoch": 0.14713, + "grad_norm": 1.0515648948673155, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 14713 + }, + { + "epoch": 0.14714, + "grad_norm": 0.8512134099767551, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 14714 + }, + { + "epoch": 0.14715, + "grad_norm": 0.7196155645320721, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 14715 + }, + { + "epoch": 0.14716, + "grad_norm": 0.6925672881321339, + "learning_rate": 0.003, + "loss": 4.098, + "step": 14716 + }, + { + "epoch": 0.14717, + "grad_norm": 0.7593293024180404, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 14717 + }, + { + "epoch": 0.14718, + "grad_norm": 0.870702469651855, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 14718 + }, + { + "epoch": 0.14719, + "grad_norm": 0.8852622342973488, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 14719 + }, + { + "epoch": 0.1472, + "grad_norm": 0.7412346900448475, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 14720 + }, + { + "epoch": 0.14721, + "grad_norm": 0.7934841697821211, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14721 + }, + { + "epoch": 0.14722, + "grad_norm": 0.8274894305948751, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 14722 + }, + { + "epoch": 0.14723, + "grad_norm": 0.8076222834921906, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 14723 + }, + { + "epoch": 0.14724, + "grad_norm": 0.7713772507344148, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 14724 + }, + { + "epoch": 0.14725, + "grad_norm": 0.7802409897582635, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 14725 + }, + { + "epoch": 0.14726, + "grad_norm": 0.9637147817610195, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 14726 + }, + { + "epoch": 0.14727, + "grad_norm": 1.3358401635138504, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14727 + }, + { + "epoch": 0.14728, + "grad_norm": 1.094002866839905, + "learning_rate": 0.003, + "loss": 4.061, + "step": 14728 + }, + { + "epoch": 0.14729, + "grad_norm": 1.064471378414021, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14729 + }, + { + "epoch": 0.1473, + "grad_norm": 0.9103974869801705, + "learning_rate": 0.003, + "loss": 4.065, + "step": 14730 + }, + { + "epoch": 0.14731, + "grad_norm": 0.8250873471989922, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 14731 + }, + { + "epoch": 0.14732, + "grad_norm": 0.7599700541878261, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 14732 + }, + { + "epoch": 0.14733, + "grad_norm": 0.6575434766956358, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 14733 + }, + { + "epoch": 0.14734, + "grad_norm": 0.6050204566231443, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 14734 + }, + { + "epoch": 0.14735, + "grad_norm": 0.6940692057028003, + "learning_rate": 0.003, + "loss": 4.082, + "step": 14735 + }, + { + "epoch": 0.14736, + "grad_norm": 0.84017104510247, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 14736 + }, + { + "epoch": 0.14737, + "grad_norm": 0.8754131892508064, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 14737 + }, + { + "epoch": 0.14738, + "grad_norm": 0.9615225558629688, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 14738 + }, + { + "epoch": 0.14739, + "grad_norm": 1.1126273279398256, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 14739 + }, + { + "epoch": 0.1474, + "grad_norm": 0.8789448851738478, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 14740 + }, + { + "epoch": 0.14741, + "grad_norm": 0.7549636461614806, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 14741 + }, + { + "epoch": 0.14742, + "grad_norm": 0.7651498938045254, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 14742 + }, + { + "epoch": 0.14743, + "grad_norm": 0.8701006594439856, + "learning_rate": 0.003, + "loss": 4.087, + "step": 14743 + }, + { + "epoch": 0.14744, + "grad_norm": 0.9835486468749542, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14744 + }, + { + "epoch": 0.14745, + "grad_norm": 1.0642331575158999, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 14745 + }, + { + "epoch": 0.14746, + "grad_norm": 1.0492294401106332, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 14746 + }, + { + "epoch": 0.14747, + "grad_norm": 0.9413981212070113, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14747 + }, + { + "epoch": 0.14748, + "grad_norm": 0.9397069742006033, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 14748 + }, + { + "epoch": 0.14749, + "grad_norm": 1.11373609411613, + "learning_rate": 0.003, + "loss": 4.08, + "step": 14749 + }, + { + "epoch": 0.1475, + "grad_norm": 0.8865431370636784, + "learning_rate": 0.003, + "loss": 4.084, + "step": 14750 + }, + { + "epoch": 0.14751, + "grad_norm": 0.5582480524303068, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 14751 + }, + { + "epoch": 0.14752, + "grad_norm": 0.6777600396655671, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 14752 + }, + { + "epoch": 0.14753, + "grad_norm": 0.7008825384856778, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 14753 + }, + { + "epoch": 0.14754, + "grad_norm": 0.6260364997682627, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 14754 + }, + { + "epoch": 0.14755, + "grad_norm": 0.6217675935186241, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 14755 + }, + { + "epoch": 0.14756, + "grad_norm": 0.7067199034422367, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 14756 + }, + { + "epoch": 0.14757, + "grad_norm": 0.8345884707638084, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 14757 + }, + { + "epoch": 0.14758, + "grad_norm": 0.9419638110211473, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14758 + }, + { + "epoch": 0.14759, + "grad_norm": 1.002513849179099, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14759 + }, + { + "epoch": 0.1476, + "grad_norm": 1.0061704996101026, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 14760 + }, + { + "epoch": 0.14761, + "grad_norm": 0.878344839961622, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 14761 + }, + { + "epoch": 0.14762, + "grad_norm": 0.786588567570064, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14762 + }, + { + "epoch": 0.14763, + "grad_norm": 0.7889398622042119, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 14763 + }, + { + "epoch": 0.14764, + "grad_norm": 0.7687087384937162, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 14764 + }, + { + "epoch": 0.14765, + "grad_norm": 0.7510031088264584, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 14765 + }, + { + "epoch": 0.14766, + "grad_norm": 0.8324646367403518, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 14766 + }, + { + "epoch": 0.14767, + "grad_norm": 0.8349003272011257, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 14767 + }, + { + "epoch": 0.14768, + "grad_norm": 0.9259044255796463, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 14768 + }, + { + "epoch": 0.14769, + "grad_norm": 1.0284923847717176, + "learning_rate": 0.003, + "loss": 4.057, + "step": 14769 + }, + { + "epoch": 0.1477, + "grad_norm": 0.976762133188089, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 14770 + }, + { + "epoch": 0.14771, + "grad_norm": 1.0032274857983992, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 14771 + }, + { + "epoch": 0.14772, + "grad_norm": 1.0694798217390225, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14772 + }, + { + "epoch": 0.14773, + "grad_norm": 0.9811266736398486, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 14773 + }, + { + "epoch": 0.14774, + "grad_norm": 0.9451081301158704, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 14774 + }, + { + "epoch": 0.14775, + "grad_norm": 0.9195920238594352, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 14775 + }, + { + "epoch": 0.14776, + "grad_norm": 1.045619909819611, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 14776 + }, + { + "epoch": 0.14777, + "grad_norm": 1.1195692749302386, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 14777 + }, + { + "epoch": 0.14778, + "grad_norm": 0.9256250140321228, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 14778 + }, + { + "epoch": 0.14779, + "grad_norm": 0.8785218642952807, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 14779 + }, + { + "epoch": 0.1478, + "grad_norm": 0.7851239946189108, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14780 + }, + { + "epoch": 0.14781, + "grad_norm": 0.78004579180118, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 14781 + }, + { + "epoch": 0.14782, + "grad_norm": 0.8543830696121397, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 14782 + }, + { + "epoch": 0.14783, + "grad_norm": 0.8838195530462661, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 14783 + }, + { + "epoch": 0.14784, + "grad_norm": 1.0217768222772443, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 14784 + }, + { + "epoch": 0.14785, + "grad_norm": 1.0514376758786688, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 14785 + }, + { + "epoch": 0.14786, + "grad_norm": 0.9606875671622199, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14786 + }, + { + "epoch": 0.14787, + "grad_norm": 0.9243655458910466, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 14787 + }, + { + "epoch": 0.14788, + "grad_norm": 0.7511045449334031, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 14788 + }, + { + "epoch": 0.14789, + "grad_norm": 0.7090628535047875, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14789 + }, + { + "epoch": 0.1479, + "grad_norm": 0.7319883043158829, + "learning_rate": 0.003, + "loss": 4.064, + "step": 14790 + }, + { + "epoch": 0.14791, + "grad_norm": 0.7145543038042341, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 14791 + }, + { + "epoch": 0.14792, + "grad_norm": 0.8122521791205854, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 14792 + }, + { + "epoch": 0.14793, + "grad_norm": 0.8448586491217989, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 14793 + }, + { + "epoch": 0.14794, + "grad_norm": 0.9455116940514773, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 14794 + }, + { + "epoch": 0.14795, + "grad_norm": 0.9706749144712972, + "learning_rate": 0.003, + "loss": 4.1183, + "step": 14795 + }, + { + "epoch": 0.14796, + "grad_norm": 0.974879979185396, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 14796 + }, + { + "epoch": 0.14797, + "grad_norm": 1.1141913255546896, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 14797 + }, + { + "epoch": 0.14798, + "grad_norm": 1.0529221422179145, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 14798 + }, + { + "epoch": 0.14799, + "grad_norm": 0.9894264927034992, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 14799 + }, + { + "epoch": 0.148, + "grad_norm": 1.0194162328251322, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 14800 + }, + { + "epoch": 0.14801, + "grad_norm": 0.9302982935422128, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 14801 + }, + { + "epoch": 0.14802, + "grad_norm": 1.0780535412079117, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 14802 + }, + { + "epoch": 0.14803, + "grad_norm": 0.9887225458379839, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 14803 + }, + { + "epoch": 0.14804, + "grad_norm": 0.9716411164392459, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 14804 + }, + { + "epoch": 0.14805, + "grad_norm": 0.8637497964270253, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14805 + }, + { + "epoch": 0.14806, + "grad_norm": 0.7903898273337917, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 14806 + }, + { + "epoch": 0.14807, + "grad_norm": 0.8490409046573257, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 14807 + }, + { + "epoch": 0.14808, + "grad_norm": 1.0034610976823193, + "learning_rate": 0.003, + "loss": 4.0977, + "step": 14808 + }, + { + "epoch": 0.14809, + "grad_norm": 1.1816087044358994, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 14809 + }, + { + "epoch": 0.1481, + "grad_norm": 0.9064930970852736, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14810 + }, + { + "epoch": 0.14811, + "grad_norm": 0.8516828867538914, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14811 + }, + { + "epoch": 0.14812, + "grad_norm": 0.8061801702923695, + "learning_rate": 0.003, + "loss": 4.027, + "step": 14812 + }, + { + "epoch": 0.14813, + "grad_norm": 0.8251069162496621, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 14813 + }, + { + "epoch": 0.14814, + "grad_norm": 0.8553441504363729, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 14814 + }, + { + "epoch": 0.14815, + "grad_norm": 1.0016757800662375, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 14815 + }, + { + "epoch": 0.14816, + "grad_norm": 1.0346234884618168, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 14816 + }, + { + "epoch": 0.14817, + "grad_norm": 0.7749126658491023, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 14817 + }, + { + "epoch": 0.14818, + "grad_norm": 0.7200310165662526, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 14818 + }, + { + "epoch": 0.14819, + "grad_norm": 0.8394696666783071, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 14819 + }, + { + "epoch": 0.1482, + "grad_norm": 0.9057421912259922, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 14820 + }, + { + "epoch": 0.14821, + "grad_norm": 0.8886738161507868, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 14821 + }, + { + "epoch": 0.14822, + "grad_norm": 1.045143250935779, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 14822 + }, + { + "epoch": 0.14823, + "grad_norm": 0.8295574752948383, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 14823 + }, + { + "epoch": 0.14824, + "grad_norm": 0.7083040585887035, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 14824 + }, + { + "epoch": 0.14825, + "grad_norm": 0.7850557599282676, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14825 + }, + { + "epoch": 0.14826, + "grad_norm": 0.7667351367600366, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 14826 + }, + { + "epoch": 0.14827, + "grad_norm": 0.8439361869003991, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 14827 + }, + { + "epoch": 0.14828, + "grad_norm": 0.9527190140415227, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 14828 + }, + { + "epoch": 0.14829, + "grad_norm": 0.9447239642238727, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 14829 + }, + { + "epoch": 0.1483, + "grad_norm": 0.8747502536455956, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 14830 + }, + { + "epoch": 0.14831, + "grad_norm": 0.8136418171167563, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 14831 + }, + { + "epoch": 0.14832, + "grad_norm": 0.7488638523264456, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 14832 + }, + { + "epoch": 0.14833, + "grad_norm": 0.7674862775383623, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 14833 + }, + { + "epoch": 0.14834, + "grad_norm": 0.8227403242309463, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14834 + }, + { + "epoch": 0.14835, + "grad_norm": 0.8244965620341915, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 14835 + }, + { + "epoch": 0.14836, + "grad_norm": 0.7292023786298032, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 14836 + }, + { + "epoch": 0.14837, + "grad_norm": 0.634328887281756, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 14837 + }, + { + "epoch": 0.14838, + "grad_norm": 0.6310999363266393, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 14838 + }, + { + "epoch": 0.14839, + "grad_norm": 0.7649104871170423, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14839 + }, + { + "epoch": 0.1484, + "grad_norm": 0.9326303923828554, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 14840 + }, + { + "epoch": 0.14841, + "grad_norm": 1.0164941161077334, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 14841 + }, + { + "epoch": 0.14842, + "grad_norm": 0.9786120246103639, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 14842 + }, + { + "epoch": 0.14843, + "grad_norm": 1.0209791174342724, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 14843 + }, + { + "epoch": 0.14844, + "grad_norm": 0.9639623705046754, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 14844 + }, + { + "epoch": 0.14845, + "grad_norm": 0.9282902973995008, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 14845 + }, + { + "epoch": 0.14846, + "grad_norm": 0.9217950902227127, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 14846 + }, + { + "epoch": 0.14847, + "grad_norm": 0.8824808881635553, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 14847 + }, + { + "epoch": 0.14848, + "grad_norm": 0.8450670052059192, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 14848 + }, + { + "epoch": 0.14849, + "grad_norm": 0.8121697588609396, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 14849 + }, + { + "epoch": 0.1485, + "grad_norm": 0.8250310663417711, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 14850 + }, + { + "epoch": 0.14851, + "grad_norm": 1.050910913970291, + "learning_rate": 0.003, + "loss": 4.071, + "step": 14851 + }, + { + "epoch": 0.14852, + "grad_norm": 0.9577614016696363, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 14852 + }, + { + "epoch": 0.14853, + "grad_norm": 0.7960357516960352, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 14853 + }, + { + "epoch": 0.14854, + "grad_norm": 0.8254845284606344, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 14854 + }, + { + "epoch": 0.14855, + "grad_norm": 0.8080456450538953, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 14855 + }, + { + "epoch": 0.14856, + "grad_norm": 0.7753398929805875, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 14856 + }, + { + "epoch": 0.14857, + "grad_norm": 0.7061527106788822, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 14857 + }, + { + "epoch": 0.14858, + "grad_norm": 0.5800571865656208, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14858 + }, + { + "epoch": 0.14859, + "grad_norm": 0.5832967010914923, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 14859 + }, + { + "epoch": 0.1486, + "grad_norm": 0.6681652966369956, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 14860 + }, + { + "epoch": 0.14861, + "grad_norm": 0.9662894337646466, + "learning_rate": 0.003, + "loss": 4.072, + "step": 14861 + }, + { + "epoch": 0.14862, + "grad_norm": 1.4940736064093287, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 14862 + }, + { + "epoch": 0.14863, + "grad_norm": 0.48684413451456354, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 14863 + }, + { + "epoch": 0.14864, + "grad_norm": 0.9214178308312404, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14864 + }, + { + "epoch": 0.14865, + "grad_norm": 1.1051662933711708, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 14865 + }, + { + "epoch": 0.14866, + "grad_norm": 0.6444855888481819, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 14866 + }, + { + "epoch": 0.14867, + "grad_norm": 0.7012195739490744, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 14867 + }, + { + "epoch": 0.14868, + "grad_norm": 0.6651698771785198, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 14868 + }, + { + "epoch": 0.14869, + "grad_norm": 0.7190387626118099, + "learning_rate": 0.003, + "loss": 4.069, + "step": 14869 + }, + { + "epoch": 0.1487, + "grad_norm": 0.7160446661129076, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 14870 + }, + { + "epoch": 0.14871, + "grad_norm": 0.7145206533123866, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 14871 + }, + { + "epoch": 0.14872, + "grad_norm": 0.8690487507205731, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 14872 + }, + { + "epoch": 0.14873, + "grad_norm": 1.0177862633302825, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 14873 + }, + { + "epoch": 0.14874, + "grad_norm": 1.04667369319081, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 14874 + }, + { + "epoch": 0.14875, + "grad_norm": 1.0012624441665312, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 14875 + }, + { + "epoch": 0.14876, + "grad_norm": 1.0680376580115962, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 14876 + }, + { + "epoch": 0.14877, + "grad_norm": 0.9653137634165351, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 14877 + }, + { + "epoch": 0.14878, + "grad_norm": 0.9771121139268824, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 14878 + }, + { + "epoch": 0.14879, + "grad_norm": 1.036955708787152, + "learning_rate": 0.003, + "loss": 4.1041, + "step": 14879 + }, + { + "epoch": 0.1488, + "grad_norm": 0.9779097238863572, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 14880 + }, + { + "epoch": 0.14881, + "grad_norm": 0.9135611922651338, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 14881 + }, + { + "epoch": 0.14882, + "grad_norm": 1.0105999223406752, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 14882 + }, + { + "epoch": 0.14883, + "grad_norm": 1.0593121256665727, + "learning_rate": 0.003, + "loss": 4.081, + "step": 14883 + }, + { + "epoch": 0.14884, + "grad_norm": 0.9045938409237391, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 14884 + }, + { + "epoch": 0.14885, + "grad_norm": 0.8188998322728103, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 14885 + }, + { + "epoch": 0.14886, + "grad_norm": 0.7203265686049506, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 14886 + }, + { + "epoch": 0.14887, + "grad_norm": 0.7621853329044047, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 14887 + }, + { + "epoch": 0.14888, + "grad_norm": 0.7428639111160718, + "learning_rate": 0.003, + "loss": 4.1072, + "step": 14888 + }, + { + "epoch": 0.14889, + "grad_norm": 0.688946974269044, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 14889 + }, + { + "epoch": 0.1489, + "grad_norm": 0.7727991382090984, + "learning_rate": 0.003, + "loss": 4.061, + "step": 14890 + }, + { + "epoch": 0.14891, + "grad_norm": 0.923764079240288, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 14891 + }, + { + "epoch": 0.14892, + "grad_norm": 1.0818850695456843, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 14892 + }, + { + "epoch": 0.14893, + "grad_norm": 0.9533515493596308, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 14893 + }, + { + "epoch": 0.14894, + "grad_norm": 0.981481341965367, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 14894 + }, + { + "epoch": 0.14895, + "grad_norm": 0.9285268392525123, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 14895 + }, + { + "epoch": 0.14896, + "grad_norm": 0.9065692112969851, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 14896 + }, + { + "epoch": 0.14897, + "grad_norm": 0.9286869561404768, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 14897 + }, + { + "epoch": 0.14898, + "grad_norm": 0.9693103568044311, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 14898 + }, + { + "epoch": 0.14899, + "grad_norm": 1.2318849337713658, + "learning_rate": 0.003, + "loss": 4.074, + "step": 14899 + }, + { + "epoch": 0.149, + "grad_norm": 0.7794046979508626, + "learning_rate": 0.003, + "loss": 4.07, + "step": 14900 + }, + { + "epoch": 0.14901, + "grad_norm": 0.8651071691592058, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 14901 + }, + { + "epoch": 0.14902, + "grad_norm": 0.9940404816213152, + "learning_rate": 0.003, + "loss": 4.094, + "step": 14902 + }, + { + "epoch": 0.14903, + "grad_norm": 0.9841659656864069, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 14903 + }, + { + "epoch": 0.14904, + "grad_norm": 0.9594197113729875, + "learning_rate": 0.003, + "loss": 4.106, + "step": 14904 + }, + { + "epoch": 0.14905, + "grad_norm": 1.0375522482259765, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 14905 + }, + { + "epoch": 0.14906, + "grad_norm": 0.9748921579520625, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 14906 + }, + { + "epoch": 0.14907, + "grad_norm": 0.9352250012513379, + "learning_rate": 0.003, + "loss": 4.085, + "step": 14907 + }, + { + "epoch": 0.14908, + "grad_norm": 0.9259155938051827, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 14908 + }, + { + "epoch": 0.14909, + "grad_norm": 0.8927050335806, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 14909 + }, + { + "epoch": 0.1491, + "grad_norm": 0.7725386609308739, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 14910 + }, + { + "epoch": 0.14911, + "grad_norm": 0.6701881124744672, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 14911 + }, + { + "epoch": 0.14912, + "grad_norm": 0.7280153131448277, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 14912 + }, + { + "epoch": 0.14913, + "grad_norm": 0.8517887732778402, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 14913 + }, + { + "epoch": 0.14914, + "grad_norm": 0.9656980157846073, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 14914 + }, + { + "epoch": 0.14915, + "grad_norm": 1.0086694536682543, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 14915 + }, + { + "epoch": 0.14916, + "grad_norm": 0.852154984227591, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 14916 + }, + { + "epoch": 0.14917, + "grad_norm": 0.7569734589829502, + "learning_rate": 0.003, + "loss": 4.1001, + "step": 14917 + }, + { + "epoch": 0.14918, + "grad_norm": 0.6979820605494047, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 14918 + }, + { + "epoch": 0.14919, + "grad_norm": 0.6633397643554763, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 14919 + }, + { + "epoch": 0.1492, + "grad_norm": 0.619787494363792, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 14920 + }, + { + "epoch": 0.14921, + "grad_norm": 0.608910457091843, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 14921 + }, + { + "epoch": 0.14922, + "grad_norm": 0.6255806418384025, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 14922 + }, + { + "epoch": 0.14923, + "grad_norm": 0.7034802121224089, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 14923 + }, + { + "epoch": 0.14924, + "grad_norm": 0.7120507732000536, + "learning_rate": 0.003, + "loss": 4.041, + "step": 14924 + }, + { + "epoch": 0.14925, + "grad_norm": 0.731759045849398, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 14925 + }, + { + "epoch": 0.14926, + "grad_norm": 0.7958735905441341, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 14926 + }, + { + "epoch": 0.14927, + "grad_norm": 0.974310767132195, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 14927 + }, + { + "epoch": 0.14928, + "grad_norm": 1.1300574818742186, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 14928 + }, + { + "epoch": 0.14929, + "grad_norm": 0.6874079550294511, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 14929 + }, + { + "epoch": 0.1493, + "grad_norm": 0.5768107042264596, + "learning_rate": 0.003, + "loss": 4.052, + "step": 14930 + }, + { + "epoch": 0.14931, + "grad_norm": 0.6451752914617287, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 14931 + }, + { + "epoch": 0.14932, + "grad_norm": 0.666473932481538, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 14932 + }, + { + "epoch": 0.14933, + "grad_norm": 0.7766091975960613, + "learning_rate": 0.003, + "loss": 4.05, + "step": 14933 + }, + { + "epoch": 0.14934, + "grad_norm": 0.8452392387063843, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 14934 + }, + { + "epoch": 0.14935, + "grad_norm": 0.9864910485720647, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 14935 + }, + { + "epoch": 0.14936, + "grad_norm": 1.0719299273206342, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 14936 + }, + { + "epoch": 0.14937, + "grad_norm": 0.9071005461209528, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 14937 + }, + { + "epoch": 0.14938, + "grad_norm": 0.8405463476567478, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 14938 + }, + { + "epoch": 0.14939, + "grad_norm": 0.8575181261962258, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 14939 + }, + { + "epoch": 0.1494, + "grad_norm": 0.8796580025409219, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 14940 + }, + { + "epoch": 0.14941, + "grad_norm": 0.8635099200038631, + "learning_rate": 0.003, + "loss": 4.105, + "step": 14941 + }, + { + "epoch": 0.14942, + "grad_norm": 0.765315575062824, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 14942 + }, + { + "epoch": 0.14943, + "grad_norm": 0.7096407636591316, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 14943 + }, + { + "epoch": 0.14944, + "grad_norm": 0.7977257960695163, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 14944 + }, + { + "epoch": 0.14945, + "grad_norm": 0.782795689153648, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 14945 + }, + { + "epoch": 0.14946, + "grad_norm": 0.7725557924540706, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 14946 + }, + { + "epoch": 0.14947, + "grad_norm": 0.8471655105920625, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 14947 + }, + { + "epoch": 0.14948, + "grad_norm": 1.031115687120366, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 14948 + }, + { + "epoch": 0.14949, + "grad_norm": 1.1187946877050938, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14949 + }, + { + "epoch": 0.1495, + "grad_norm": 0.8942213332346919, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14950 + }, + { + "epoch": 0.14951, + "grad_norm": 0.9204781667982811, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 14951 + }, + { + "epoch": 0.14952, + "grad_norm": 0.9352311395830009, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 14952 + }, + { + "epoch": 0.14953, + "grad_norm": 1.0093833778732473, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 14953 + }, + { + "epoch": 0.14954, + "grad_norm": 1.0478771341499364, + "learning_rate": 0.003, + "loss": 4.112, + "step": 14954 + }, + { + "epoch": 0.14955, + "grad_norm": 0.882340612382404, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 14955 + }, + { + "epoch": 0.14956, + "grad_norm": 0.8578629575346205, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 14956 + }, + { + "epoch": 0.14957, + "grad_norm": 0.9048449868787202, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 14957 + }, + { + "epoch": 0.14958, + "grad_norm": 0.9866701700073888, + "learning_rate": 0.003, + "loss": 4.103, + "step": 14958 + }, + { + "epoch": 0.14959, + "grad_norm": 1.0395924027785424, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 14959 + }, + { + "epoch": 0.1496, + "grad_norm": 1.0718346439433137, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 14960 + }, + { + "epoch": 0.14961, + "grad_norm": 1.017150832806336, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14961 + }, + { + "epoch": 0.14962, + "grad_norm": 1.0949210508719271, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 14962 + }, + { + "epoch": 0.14963, + "grad_norm": 0.8677949581703225, + "learning_rate": 0.003, + "loss": 4.076, + "step": 14963 + }, + { + "epoch": 0.14964, + "grad_norm": 0.8208694744407704, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 14964 + }, + { + "epoch": 0.14965, + "grad_norm": 0.7672724181788401, + "learning_rate": 0.003, + "loss": 4.067, + "step": 14965 + }, + { + "epoch": 0.14966, + "grad_norm": 0.7334087592400507, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 14966 + }, + { + "epoch": 0.14967, + "grad_norm": 0.8592385250407862, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 14967 + }, + { + "epoch": 0.14968, + "grad_norm": 0.9927538887648965, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 14968 + }, + { + "epoch": 0.14969, + "grad_norm": 0.9994553782115532, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 14969 + }, + { + "epoch": 0.1497, + "grad_norm": 1.047072417223147, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 14970 + }, + { + "epoch": 0.14971, + "grad_norm": 1.076232570183918, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 14971 + }, + { + "epoch": 0.14972, + "grad_norm": 0.9348679591768919, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 14972 + }, + { + "epoch": 0.14973, + "grad_norm": 0.8771961907491908, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 14973 + }, + { + "epoch": 0.14974, + "grad_norm": 0.9099521244080998, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 14974 + }, + { + "epoch": 0.14975, + "grad_norm": 0.9894242589418759, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 14975 + }, + { + "epoch": 0.14976, + "grad_norm": 1.1150719540092562, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 14976 + }, + { + "epoch": 0.14977, + "grad_norm": 0.9052581333802919, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 14977 + }, + { + "epoch": 0.14978, + "grad_norm": 0.8720960042061171, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 14978 + }, + { + "epoch": 0.14979, + "grad_norm": 0.9141304591155341, + "learning_rate": 0.003, + "loss": 4.075, + "step": 14979 + }, + { + "epoch": 0.1498, + "grad_norm": 0.8629320719816637, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 14980 + }, + { + "epoch": 0.14981, + "grad_norm": 0.8454396606444087, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 14981 + }, + { + "epoch": 0.14982, + "grad_norm": 0.8829463599536927, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 14982 + }, + { + "epoch": 0.14983, + "grad_norm": 0.9891816977960454, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 14983 + }, + { + "epoch": 0.14984, + "grad_norm": 0.9984058079787899, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 14984 + }, + { + "epoch": 0.14985, + "grad_norm": 0.7629448942608905, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 14985 + }, + { + "epoch": 0.14986, + "grad_norm": 0.6394039871849966, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 14986 + }, + { + "epoch": 0.14987, + "grad_norm": 0.6582736174880554, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 14987 + }, + { + "epoch": 0.14988, + "grad_norm": 0.6674299029980075, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 14988 + }, + { + "epoch": 0.14989, + "grad_norm": 0.6980416705713386, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 14989 + }, + { + "epoch": 0.1499, + "grad_norm": 0.6913745885470995, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 14990 + }, + { + "epoch": 0.14991, + "grad_norm": 0.6007846402580602, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 14991 + }, + { + "epoch": 0.14992, + "grad_norm": 0.5408813537938235, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 14992 + }, + { + "epoch": 0.14993, + "grad_norm": 0.5267819341584272, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 14993 + }, + { + "epoch": 0.14994, + "grad_norm": 0.5165303340641763, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 14994 + }, + { + "epoch": 0.14995, + "grad_norm": 0.5942808753876797, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 14995 + }, + { + "epoch": 0.14996, + "grad_norm": 0.7343045389402316, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 14996 + }, + { + "epoch": 0.14997, + "grad_norm": 0.7920155719691849, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 14997 + }, + { + "epoch": 0.14998, + "grad_norm": 0.9447915131845857, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 14998 + }, + { + "epoch": 0.14999, + "grad_norm": 1.3156869354631486, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 14999 + }, + { + "epoch": 0.15, + "grad_norm": 0.7348273108340998, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 15000 + }, + { + "epoch": 0.15001, + "grad_norm": 0.6559891082187708, + "learning_rate": 0.003, + "loss": 4.055, + "step": 15001 + }, + { + "epoch": 0.15002, + "grad_norm": 0.707503634726269, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 15002 + }, + { + "epoch": 0.15003, + "grad_norm": 0.726740953568803, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15003 + }, + { + "epoch": 0.15004, + "grad_norm": 0.926987121955849, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15004 + }, + { + "epoch": 0.15005, + "grad_norm": 1.169706646737343, + "learning_rate": 0.003, + "loss": 4.096, + "step": 15005 + }, + { + "epoch": 0.15006, + "grad_norm": 0.858629392878249, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 15006 + }, + { + "epoch": 0.15007, + "grad_norm": 0.8710255151395264, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 15007 + }, + { + "epoch": 0.15008, + "grad_norm": 0.9323288184785412, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 15008 + }, + { + "epoch": 0.15009, + "grad_norm": 0.8016143464855188, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 15009 + }, + { + "epoch": 0.1501, + "grad_norm": 0.880337039559984, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 15010 + }, + { + "epoch": 0.15011, + "grad_norm": 1.0895991072764721, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 15011 + }, + { + "epoch": 0.15012, + "grad_norm": 1.1519094860806856, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 15012 + }, + { + "epoch": 0.15013, + "grad_norm": 0.8339136951685342, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 15013 + }, + { + "epoch": 0.15014, + "grad_norm": 0.7312198192986489, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 15014 + }, + { + "epoch": 0.15015, + "grad_norm": 0.7179300708126604, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 15015 + }, + { + "epoch": 0.15016, + "grad_norm": 0.8808797372870163, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 15016 + }, + { + "epoch": 0.15017, + "grad_norm": 1.1218937776661737, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 15017 + }, + { + "epoch": 0.15018, + "grad_norm": 0.8977155710382226, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 15018 + }, + { + "epoch": 0.15019, + "grad_norm": 0.8109154105901505, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 15019 + }, + { + "epoch": 0.1502, + "grad_norm": 0.8098722962407195, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 15020 + }, + { + "epoch": 0.15021, + "grad_norm": 0.8669939347458737, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 15021 + }, + { + "epoch": 0.15022, + "grad_norm": 1.0375664805207678, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 15022 + }, + { + "epoch": 0.15023, + "grad_norm": 1.0426634912626966, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 15023 + }, + { + "epoch": 0.15024, + "grad_norm": 0.8736846127438801, + "learning_rate": 0.003, + "loss": 4.049, + "step": 15024 + }, + { + "epoch": 0.15025, + "grad_norm": 0.9437814264314455, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 15025 + }, + { + "epoch": 0.15026, + "grad_norm": 1.1321860388220588, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 15026 + }, + { + "epoch": 0.15027, + "grad_norm": 1.0450166618111094, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 15027 + }, + { + "epoch": 0.15028, + "grad_norm": 0.874551180787626, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 15028 + }, + { + "epoch": 0.15029, + "grad_norm": 0.769974405546577, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 15029 + }, + { + "epoch": 0.1503, + "grad_norm": 0.7224279390896323, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 15030 + }, + { + "epoch": 0.15031, + "grad_norm": 0.6880650816822179, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15031 + }, + { + "epoch": 0.15032, + "grad_norm": 0.6260300798186833, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 15032 + }, + { + "epoch": 0.15033, + "grad_norm": 0.6076110821161129, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 15033 + }, + { + "epoch": 0.15034, + "grad_norm": 0.727019230585347, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 15034 + }, + { + "epoch": 0.15035, + "grad_norm": 0.8451350538115437, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 15035 + }, + { + "epoch": 0.15036, + "grad_norm": 0.9391333908635195, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 15036 + }, + { + "epoch": 0.15037, + "grad_norm": 0.9568895429578911, + "learning_rate": 0.003, + "loss": 4.1032, + "step": 15037 + }, + { + "epoch": 0.15038, + "grad_norm": 1.131316642422788, + "learning_rate": 0.003, + "loss": 4.092, + "step": 15038 + }, + { + "epoch": 0.15039, + "grad_norm": 1.0820982816406701, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 15039 + }, + { + "epoch": 0.1504, + "grad_norm": 1.0106112529513311, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 15040 + }, + { + "epoch": 0.15041, + "grad_norm": 0.9521076511768249, + "learning_rate": 0.003, + "loss": 4.1155, + "step": 15041 + }, + { + "epoch": 0.15042, + "grad_norm": 0.920015805394499, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 15042 + }, + { + "epoch": 0.15043, + "grad_norm": 0.7552979883785288, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 15043 + }, + { + "epoch": 0.15044, + "grad_norm": 0.8342374556027915, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 15044 + }, + { + "epoch": 0.15045, + "grad_norm": 0.8943774965335569, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 15045 + }, + { + "epoch": 0.15046, + "grad_norm": 1.0531437113688678, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 15046 + }, + { + "epoch": 0.15047, + "grad_norm": 1.0180501141105878, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 15047 + }, + { + "epoch": 0.15048, + "grad_norm": 0.9121791262945473, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 15048 + }, + { + "epoch": 0.15049, + "grad_norm": 0.945566995431633, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 15049 + }, + { + "epoch": 0.1505, + "grad_norm": 0.8699715498075093, + "learning_rate": 0.003, + "loss": 4.081, + "step": 15050 + }, + { + "epoch": 0.15051, + "grad_norm": 0.8516562362658695, + "learning_rate": 0.003, + "loss": 4.042, + "step": 15051 + }, + { + "epoch": 0.15052, + "grad_norm": 0.847357439658091, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15052 + }, + { + "epoch": 0.15053, + "grad_norm": 0.8539737558488057, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 15053 + }, + { + "epoch": 0.15054, + "grad_norm": 1.000812720741745, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 15054 + }, + { + "epoch": 0.15055, + "grad_norm": 1.1126528375972191, + "learning_rate": 0.003, + "loss": 4.085, + "step": 15055 + }, + { + "epoch": 0.15056, + "grad_norm": 0.7977120214576124, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 15056 + }, + { + "epoch": 0.15057, + "grad_norm": 0.7419229424382847, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 15057 + }, + { + "epoch": 0.15058, + "grad_norm": 0.7838019486444135, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 15058 + }, + { + "epoch": 0.15059, + "grad_norm": 0.7332083959605218, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 15059 + }, + { + "epoch": 0.1506, + "grad_norm": 0.7455840825888586, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 15060 + }, + { + "epoch": 0.15061, + "grad_norm": 0.7721669725707283, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 15061 + }, + { + "epoch": 0.15062, + "grad_norm": 0.9315782636727437, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 15062 + }, + { + "epoch": 0.15063, + "grad_norm": 1.0829324772663194, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 15063 + }, + { + "epoch": 0.15064, + "grad_norm": 1.0719702137178482, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 15064 + }, + { + "epoch": 0.15065, + "grad_norm": 0.9590853495894157, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 15065 + }, + { + "epoch": 0.15066, + "grad_norm": 0.9782700242694625, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 15066 + }, + { + "epoch": 0.15067, + "grad_norm": 0.9117010656159933, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 15067 + }, + { + "epoch": 0.15068, + "grad_norm": 0.8235517729639226, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 15068 + }, + { + "epoch": 0.15069, + "grad_norm": 0.8441872681964596, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 15069 + }, + { + "epoch": 0.1507, + "grad_norm": 0.7525765817838392, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 15070 + }, + { + "epoch": 0.15071, + "grad_norm": 0.8453931881506186, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 15071 + }, + { + "epoch": 0.15072, + "grad_norm": 0.8636282769401511, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 15072 + }, + { + "epoch": 0.15073, + "grad_norm": 0.9829775407411315, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 15073 + }, + { + "epoch": 0.15074, + "grad_norm": 1.0059675353774253, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 15074 + }, + { + "epoch": 0.15075, + "grad_norm": 0.8957322165552001, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 15075 + }, + { + "epoch": 0.15076, + "grad_norm": 0.853476197375503, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 15076 + }, + { + "epoch": 0.15077, + "grad_norm": 0.7950408125840406, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 15077 + }, + { + "epoch": 0.15078, + "grad_norm": 0.8142917115529535, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 15078 + }, + { + "epoch": 0.15079, + "grad_norm": 0.8477090125509233, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 15079 + }, + { + "epoch": 0.1508, + "grad_norm": 0.7663104448708389, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 15080 + }, + { + "epoch": 0.15081, + "grad_norm": 0.7397684908572042, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 15081 + }, + { + "epoch": 0.15082, + "grad_norm": 0.8354558800377258, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 15082 + }, + { + "epoch": 0.15083, + "grad_norm": 0.6910626555141486, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 15083 + }, + { + "epoch": 0.15084, + "grad_norm": 0.7293989807468484, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 15084 + }, + { + "epoch": 0.15085, + "grad_norm": 0.7642813294138431, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15085 + }, + { + "epoch": 0.15086, + "grad_norm": 0.8882841442194418, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 15086 + }, + { + "epoch": 0.15087, + "grad_norm": 1.0427765998507974, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 15087 + }, + { + "epoch": 0.15088, + "grad_norm": 1.1424136012532162, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15088 + }, + { + "epoch": 0.15089, + "grad_norm": 0.9658202607279203, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 15089 + }, + { + "epoch": 0.1509, + "grad_norm": 1.0009322490904504, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 15090 + }, + { + "epoch": 0.15091, + "grad_norm": 1.0655670822265935, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 15091 + }, + { + "epoch": 0.15092, + "grad_norm": 1.0325834214140834, + "learning_rate": 0.003, + "loss": 4.1018, + "step": 15092 + }, + { + "epoch": 0.15093, + "grad_norm": 1.0279871288626194, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 15093 + }, + { + "epoch": 0.15094, + "grad_norm": 0.976515820218539, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15094 + }, + { + "epoch": 0.15095, + "grad_norm": 1.0273089457940514, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 15095 + }, + { + "epoch": 0.15096, + "grad_norm": 1.0210313870405392, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 15096 + }, + { + "epoch": 0.15097, + "grad_norm": 1.0763912056309544, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 15097 + }, + { + "epoch": 0.15098, + "grad_norm": 1.0183582632170034, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 15098 + }, + { + "epoch": 0.15099, + "grad_norm": 1.0133441682251516, + "learning_rate": 0.003, + "loss": 4.074, + "step": 15099 + }, + { + "epoch": 0.151, + "grad_norm": 1.014058941118107, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 15100 + }, + { + "epoch": 0.15101, + "grad_norm": 0.9433122305218627, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 15101 + }, + { + "epoch": 0.15102, + "grad_norm": 0.905602098977912, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 15102 + }, + { + "epoch": 0.15103, + "grad_norm": 0.7943329362958932, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 15103 + }, + { + "epoch": 0.15104, + "grad_norm": 0.9579959372806925, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 15104 + }, + { + "epoch": 0.15105, + "grad_norm": 1.0770051648263383, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 15105 + }, + { + "epoch": 0.15106, + "grad_norm": 0.9924025895609426, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 15106 + }, + { + "epoch": 0.15107, + "grad_norm": 1.021375144229543, + "learning_rate": 0.003, + "loss": 4.07, + "step": 15107 + }, + { + "epoch": 0.15108, + "grad_norm": 0.9387463115084094, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 15108 + }, + { + "epoch": 0.15109, + "grad_norm": 0.9006975014037859, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 15109 + }, + { + "epoch": 0.1511, + "grad_norm": 0.8538120284930222, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 15110 + }, + { + "epoch": 0.15111, + "grad_norm": 0.8692401582453622, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 15111 + }, + { + "epoch": 0.15112, + "grad_norm": 0.8035820239386009, + "learning_rate": 0.003, + "loss": 4.067, + "step": 15112 + }, + { + "epoch": 0.15113, + "grad_norm": 0.7560033114405544, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 15113 + }, + { + "epoch": 0.15114, + "grad_norm": 0.7324938727488074, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15114 + }, + { + "epoch": 0.15115, + "grad_norm": 0.8000574554573201, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 15115 + }, + { + "epoch": 0.15116, + "grad_norm": 0.756042649955105, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 15116 + }, + { + "epoch": 0.15117, + "grad_norm": 0.7620121160526505, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 15117 + }, + { + "epoch": 0.15118, + "grad_norm": 0.7799876604078111, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 15118 + }, + { + "epoch": 0.15119, + "grad_norm": 0.9251279569899423, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 15119 + }, + { + "epoch": 0.1512, + "grad_norm": 1.0591749364671184, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 15120 + }, + { + "epoch": 0.15121, + "grad_norm": 1.0544780767081794, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 15121 + }, + { + "epoch": 0.15122, + "grad_norm": 0.8011775350217684, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 15122 + }, + { + "epoch": 0.15123, + "grad_norm": 0.7048887305335355, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 15123 + }, + { + "epoch": 0.15124, + "grad_norm": 0.6646916324277997, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 15124 + }, + { + "epoch": 0.15125, + "grad_norm": 0.5997067875706642, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15125 + }, + { + "epoch": 0.15126, + "grad_norm": 0.5284961902011485, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 15126 + }, + { + "epoch": 0.15127, + "grad_norm": 0.49008238412079774, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 15127 + }, + { + "epoch": 0.15128, + "grad_norm": 0.4691905705157153, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 15128 + }, + { + "epoch": 0.15129, + "grad_norm": 0.4792073352967458, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 15129 + }, + { + "epoch": 0.1513, + "grad_norm": 0.5133091416230263, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 15130 + }, + { + "epoch": 0.15131, + "grad_norm": 0.48283337225191414, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 15131 + }, + { + "epoch": 0.15132, + "grad_norm": 0.5550133025090831, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 15132 + }, + { + "epoch": 0.15133, + "grad_norm": 0.7060448342252426, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 15133 + }, + { + "epoch": 0.15134, + "grad_norm": 0.9177711985373354, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 15134 + }, + { + "epoch": 0.15135, + "grad_norm": 1.2631999758061152, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 15135 + }, + { + "epoch": 0.15136, + "grad_norm": 0.7457497717023199, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 15136 + }, + { + "epoch": 0.15137, + "grad_norm": 0.7275488284905349, + "learning_rate": 0.003, + "loss": 4.044, + "step": 15137 + }, + { + "epoch": 0.15138, + "grad_norm": 0.7659938351391803, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 15138 + }, + { + "epoch": 0.15139, + "grad_norm": 0.9471665683810248, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 15139 + }, + { + "epoch": 0.1514, + "grad_norm": 1.0644411698978926, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 15140 + }, + { + "epoch": 0.15141, + "grad_norm": 0.9911485071782313, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15141 + }, + { + "epoch": 0.15142, + "grad_norm": 0.9177669091355584, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15142 + }, + { + "epoch": 0.15143, + "grad_norm": 0.9539454491336194, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 15143 + }, + { + "epoch": 0.15144, + "grad_norm": 1.0190823580872017, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 15144 + }, + { + "epoch": 0.15145, + "grad_norm": 1.1097182863703374, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 15145 + }, + { + "epoch": 0.15146, + "grad_norm": 0.902108819088088, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 15146 + }, + { + "epoch": 0.15147, + "grad_norm": 0.8116744335567405, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 15147 + }, + { + "epoch": 0.15148, + "grad_norm": 0.8015320957439042, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 15148 + }, + { + "epoch": 0.15149, + "grad_norm": 0.8378619398388899, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15149 + }, + { + "epoch": 0.1515, + "grad_norm": 0.8467095533469075, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 15150 + }, + { + "epoch": 0.15151, + "grad_norm": 0.9897372155878956, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 15151 + }, + { + "epoch": 0.15152, + "grad_norm": 0.9789423591472723, + "learning_rate": 0.003, + "loss": 4.1157, + "step": 15152 + }, + { + "epoch": 0.15153, + "grad_norm": 1.0609979997095518, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 15153 + }, + { + "epoch": 0.15154, + "grad_norm": 1.3105194125792707, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 15154 + }, + { + "epoch": 0.15155, + "grad_norm": 0.9304455871903653, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 15155 + }, + { + "epoch": 0.15156, + "grad_norm": 0.8282275353630306, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 15156 + }, + { + "epoch": 0.15157, + "grad_norm": 0.9915551834570251, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 15157 + }, + { + "epoch": 0.15158, + "grad_norm": 1.0157859647007892, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 15158 + }, + { + "epoch": 0.15159, + "grad_norm": 1.033844815366442, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 15159 + }, + { + "epoch": 0.1516, + "grad_norm": 0.956169398803398, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 15160 + }, + { + "epoch": 0.15161, + "grad_norm": 1.0076553516889835, + "learning_rate": 0.003, + "loss": 4.077, + "step": 15161 + }, + { + "epoch": 0.15162, + "grad_norm": 1.125393989377539, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 15162 + }, + { + "epoch": 0.15163, + "grad_norm": 0.8165843946971866, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 15163 + }, + { + "epoch": 0.15164, + "grad_norm": 0.7562017922440614, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 15164 + }, + { + "epoch": 0.15165, + "grad_norm": 0.7428884695099078, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 15165 + }, + { + "epoch": 0.15166, + "grad_norm": 0.8404484768990306, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 15166 + }, + { + "epoch": 0.15167, + "grad_norm": 0.8993097791945752, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 15167 + }, + { + "epoch": 0.15168, + "grad_norm": 0.9380835480998397, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 15168 + }, + { + "epoch": 0.15169, + "grad_norm": 1.0806486544246405, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 15169 + }, + { + "epoch": 0.1517, + "grad_norm": 0.9673967332511697, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 15170 + }, + { + "epoch": 0.15171, + "grad_norm": 0.7531630984905995, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 15171 + }, + { + "epoch": 0.15172, + "grad_norm": 0.7011234156390739, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 15172 + }, + { + "epoch": 0.15173, + "grad_norm": 0.6693307433636371, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 15173 + }, + { + "epoch": 0.15174, + "grad_norm": 0.7130280441231969, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15174 + }, + { + "epoch": 0.15175, + "grad_norm": 0.7215414438726487, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 15175 + }, + { + "epoch": 0.15176, + "grad_norm": 0.6033413933498546, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 15176 + }, + { + "epoch": 0.15177, + "grad_norm": 0.6508264593081992, + "learning_rate": 0.003, + "loss": 4.053, + "step": 15177 + }, + { + "epoch": 0.15178, + "grad_norm": 0.8675552582776156, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 15178 + }, + { + "epoch": 0.15179, + "grad_norm": 1.0419533953805116, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 15179 + }, + { + "epoch": 0.1518, + "grad_norm": 1.0854078659240491, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 15180 + }, + { + "epoch": 0.15181, + "grad_norm": 0.7928904816498591, + "learning_rate": 0.003, + "loss": 4.05, + "step": 15181 + }, + { + "epoch": 0.15182, + "grad_norm": 0.6830974320765402, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 15182 + }, + { + "epoch": 0.15183, + "grad_norm": 0.7399601288407891, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 15183 + }, + { + "epoch": 0.15184, + "grad_norm": 0.7855594409722747, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 15184 + }, + { + "epoch": 0.15185, + "grad_norm": 0.9248773640331247, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 15185 + }, + { + "epoch": 0.15186, + "grad_norm": 0.9862123419049738, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 15186 + }, + { + "epoch": 0.15187, + "grad_norm": 1.102582252076831, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 15187 + }, + { + "epoch": 0.15188, + "grad_norm": 0.7775029083333368, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 15188 + }, + { + "epoch": 0.15189, + "grad_norm": 0.6506700585859129, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 15189 + }, + { + "epoch": 0.1519, + "grad_norm": 0.6796661020644874, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 15190 + }, + { + "epoch": 0.15191, + "grad_norm": 0.6760083572280191, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 15191 + }, + { + "epoch": 0.15192, + "grad_norm": 0.7339459038360747, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 15192 + }, + { + "epoch": 0.15193, + "grad_norm": 0.8568832935336129, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 15193 + }, + { + "epoch": 0.15194, + "grad_norm": 1.1195764067073293, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 15194 + }, + { + "epoch": 0.15195, + "grad_norm": 1.1094324595718625, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 15195 + }, + { + "epoch": 0.15196, + "grad_norm": 0.8664847142200764, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 15196 + }, + { + "epoch": 0.15197, + "grad_norm": 0.8739867532656782, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 15197 + }, + { + "epoch": 0.15198, + "grad_norm": 0.7545970355300466, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 15198 + }, + { + "epoch": 0.15199, + "grad_norm": 0.8349321201361193, + "learning_rate": 0.003, + "loss": 4.075, + "step": 15199 + }, + { + "epoch": 0.152, + "grad_norm": 0.7981624058949888, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15200 + }, + { + "epoch": 0.15201, + "grad_norm": 0.7709086415901311, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15201 + }, + { + "epoch": 0.15202, + "grad_norm": 0.8511088125328431, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 15202 + }, + { + "epoch": 0.15203, + "grad_norm": 0.9486382759577987, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 15203 + }, + { + "epoch": 0.15204, + "grad_norm": 1.0585494769383783, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 15204 + }, + { + "epoch": 0.15205, + "grad_norm": 1.0727065049309727, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 15205 + }, + { + "epoch": 0.15206, + "grad_norm": 1.0075358371153782, + "learning_rate": 0.003, + "loss": 4.059, + "step": 15206 + }, + { + "epoch": 0.15207, + "grad_norm": 0.9114685142048955, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 15207 + }, + { + "epoch": 0.15208, + "grad_norm": 0.8427218720162823, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 15208 + }, + { + "epoch": 0.15209, + "grad_norm": 0.7520032644693868, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 15209 + }, + { + "epoch": 0.1521, + "grad_norm": 0.8301931618735977, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 15210 + }, + { + "epoch": 0.15211, + "grad_norm": 1.126954103542896, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 15211 + }, + { + "epoch": 0.15212, + "grad_norm": 1.076102076696635, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 15212 + }, + { + "epoch": 0.15213, + "grad_norm": 0.942643678682756, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 15213 + }, + { + "epoch": 0.15214, + "grad_norm": 0.9153046076681154, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 15214 + }, + { + "epoch": 0.15215, + "grad_norm": 0.8335209604456619, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 15215 + }, + { + "epoch": 0.15216, + "grad_norm": 0.6990706248717341, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 15216 + }, + { + "epoch": 0.15217, + "grad_norm": 0.6654491730735902, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 15217 + }, + { + "epoch": 0.15218, + "grad_norm": 0.7229603138423157, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 15218 + }, + { + "epoch": 0.15219, + "grad_norm": 0.8142168305882189, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 15219 + }, + { + "epoch": 0.1522, + "grad_norm": 0.9415591426472244, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15220 + }, + { + "epoch": 0.15221, + "grad_norm": 0.9994658536708769, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 15221 + }, + { + "epoch": 0.15222, + "grad_norm": 1.0465433905293442, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 15222 + }, + { + "epoch": 0.15223, + "grad_norm": 0.9197723655156945, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 15223 + }, + { + "epoch": 0.15224, + "grad_norm": 0.8735843811577606, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 15224 + }, + { + "epoch": 0.15225, + "grad_norm": 0.8178200665499099, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 15225 + }, + { + "epoch": 0.15226, + "grad_norm": 0.8322703784874306, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 15226 + }, + { + "epoch": 0.15227, + "grad_norm": 0.9952331935687299, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 15227 + }, + { + "epoch": 0.15228, + "grad_norm": 1.3104611198973144, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 15228 + }, + { + "epoch": 0.15229, + "grad_norm": 0.6856977527170464, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 15229 + }, + { + "epoch": 0.1523, + "grad_norm": 0.7001782128719186, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 15230 + }, + { + "epoch": 0.15231, + "grad_norm": 0.8286781366001644, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 15231 + }, + { + "epoch": 0.15232, + "grad_norm": 1.0706370569707482, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 15232 + }, + { + "epoch": 0.15233, + "grad_norm": 1.1556593051225217, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15233 + }, + { + "epoch": 0.15234, + "grad_norm": 0.6819453233243812, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 15234 + }, + { + "epoch": 0.15235, + "grad_norm": 0.6612352239163759, + "learning_rate": 0.003, + "loss": 4.053, + "step": 15235 + }, + { + "epoch": 0.15236, + "grad_norm": 0.773020101727993, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 15236 + }, + { + "epoch": 0.15237, + "grad_norm": 0.8525464474951641, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 15237 + }, + { + "epoch": 0.15238, + "grad_norm": 0.8523068065969795, + "learning_rate": 0.003, + "loss": 4.046, + "step": 15238 + }, + { + "epoch": 0.15239, + "grad_norm": 0.900599962097635, + "learning_rate": 0.003, + "loss": 4.054, + "step": 15239 + }, + { + "epoch": 0.1524, + "grad_norm": 0.991414851124613, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 15240 + }, + { + "epoch": 0.15241, + "grad_norm": 1.107188195389757, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 15241 + }, + { + "epoch": 0.15242, + "grad_norm": 0.8704800607026462, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 15242 + }, + { + "epoch": 0.15243, + "grad_norm": 0.7404929086328644, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 15243 + }, + { + "epoch": 0.15244, + "grad_norm": 0.7703572874716674, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 15244 + }, + { + "epoch": 0.15245, + "grad_norm": 0.7829666178006482, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15245 + }, + { + "epoch": 0.15246, + "grad_norm": 0.7415765331039469, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 15246 + }, + { + "epoch": 0.15247, + "grad_norm": 0.7924097397188576, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 15247 + }, + { + "epoch": 0.15248, + "grad_norm": 0.7777784226768076, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 15248 + }, + { + "epoch": 0.15249, + "grad_norm": 0.6998405033690062, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 15249 + }, + { + "epoch": 0.1525, + "grad_norm": 0.6404729897707369, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 15250 + }, + { + "epoch": 0.15251, + "grad_norm": 0.7452709958927278, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 15251 + }, + { + "epoch": 0.15252, + "grad_norm": 0.7940589446847499, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 15252 + }, + { + "epoch": 0.15253, + "grad_norm": 0.9481448417893192, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 15253 + }, + { + "epoch": 0.15254, + "grad_norm": 0.9766521625518498, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 15254 + }, + { + "epoch": 0.15255, + "grad_norm": 1.1123993025345629, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 15255 + }, + { + "epoch": 0.15256, + "grad_norm": 1.2492375752632872, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 15256 + }, + { + "epoch": 0.15257, + "grad_norm": 0.837986899539563, + "learning_rate": 0.003, + "loss": 4.058, + "step": 15257 + }, + { + "epoch": 0.15258, + "grad_norm": 0.7909502973323723, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 15258 + }, + { + "epoch": 0.15259, + "grad_norm": 0.7890035720589068, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15259 + }, + { + "epoch": 0.1526, + "grad_norm": 0.8224144675886454, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 15260 + }, + { + "epoch": 0.15261, + "grad_norm": 0.8060568569256621, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 15261 + }, + { + "epoch": 0.15262, + "grad_norm": 0.8176292865252134, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 15262 + }, + { + "epoch": 0.15263, + "grad_norm": 0.8369337185990806, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 15263 + }, + { + "epoch": 0.15264, + "grad_norm": 0.8485265208416527, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 15264 + }, + { + "epoch": 0.15265, + "grad_norm": 0.8305809739761414, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 15265 + }, + { + "epoch": 0.15266, + "grad_norm": 0.8639285814044848, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 15266 + }, + { + "epoch": 0.15267, + "grad_norm": 0.773095963409235, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 15267 + }, + { + "epoch": 0.15268, + "grad_norm": 0.75198856269351, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 15268 + }, + { + "epoch": 0.15269, + "grad_norm": 0.7634260830024288, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 15269 + }, + { + "epoch": 0.1527, + "grad_norm": 0.7352566458153885, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 15270 + }, + { + "epoch": 0.15271, + "grad_norm": 0.6974783227375817, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 15271 + }, + { + "epoch": 0.15272, + "grad_norm": 0.7098188681579817, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 15272 + }, + { + "epoch": 0.15273, + "grad_norm": 1.000394283896658, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 15273 + }, + { + "epoch": 0.15274, + "grad_norm": 1.3197167826645684, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 15274 + }, + { + "epoch": 0.15275, + "grad_norm": 0.7384549959415955, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 15275 + }, + { + "epoch": 0.15276, + "grad_norm": 0.7163675883747808, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 15276 + }, + { + "epoch": 0.15277, + "grad_norm": 0.7950138913049991, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 15277 + }, + { + "epoch": 0.15278, + "grad_norm": 0.8092100122272277, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 15278 + }, + { + "epoch": 0.15279, + "grad_norm": 0.8240904170313513, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 15279 + }, + { + "epoch": 0.1528, + "grad_norm": 0.8183117073174399, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 15280 + }, + { + "epoch": 0.15281, + "grad_norm": 0.9165721380388888, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 15281 + }, + { + "epoch": 0.15282, + "grad_norm": 1.0164291672477201, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 15282 + }, + { + "epoch": 0.15283, + "grad_norm": 1.07127527329947, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 15283 + }, + { + "epoch": 0.15284, + "grad_norm": 1.056796739288713, + "learning_rate": 0.003, + "loss": 4.081, + "step": 15284 + }, + { + "epoch": 0.15285, + "grad_norm": 1.0656942720152707, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 15285 + }, + { + "epoch": 0.15286, + "grad_norm": 1.0005962235424413, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 15286 + }, + { + "epoch": 0.15287, + "grad_norm": 1.0357349400924054, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 15287 + }, + { + "epoch": 0.15288, + "grad_norm": 0.9635146374979597, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 15288 + }, + { + "epoch": 0.15289, + "grad_norm": 0.8736500322014659, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 15289 + }, + { + "epoch": 0.1529, + "grad_norm": 0.9360463166067774, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 15290 + }, + { + "epoch": 0.15291, + "grad_norm": 1.1124004238748042, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 15291 + }, + { + "epoch": 0.15292, + "grad_norm": 1.2161053720096802, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 15292 + }, + { + "epoch": 0.15293, + "grad_norm": 0.7816941440634604, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 15293 + }, + { + "epoch": 0.15294, + "grad_norm": 0.7438995370025482, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 15294 + }, + { + "epoch": 0.15295, + "grad_norm": 0.8139253736925429, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 15295 + }, + { + "epoch": 0.15296, + "grad_norm": 0.7653942018142526, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 15296 + }, + { + "epoch": 0.15297, + "grad_norm": 0.8419651678229138, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 15297 + }, + { + "epoch": 0.15298, + "grad_norm": 0.9039099983876641, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 15298 + }, + { + "epoch": 0.15299, + "grad_norm": 0.9237047675124964, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 15299 + }, + { + "epoch": 0.153, + "grad_norm": 0.9831123579516327, + "learning_rate": 0.003, + "loss": 4.101, + "step": 15300 + }, + { + "epoch": 0.15301, + "grad_norm": 1.0398480708757667, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 15301 + }, + { + "epoch": 0.15302, + "grad_norm": 0.9210196022719573, + "learning_rate": 0.003, + "loss": 4.045, + "step": 15302 + }, + { + "epoch": 0.15303, + "grad_norm": 0.8574354152970236, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 15303 + }, + { + "epoch": 0.15304, + "grad_norm": 0.9279757904048951, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 15304 + }, + { + "epoch": 0.15305, + "grad_norm": 0.9907124836345076, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 15305 + }, + { + "epoch": 0.15306, + "grad_norm": 0.9772938976980835, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 15306 + }, + { + "epoch": 0.15307, + "grad_norm": 0.8269054411701829, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 15307 + }, + { + "epoch": 0.15308, + "grad_norm": 0.7824618857627942, + "learning_rate": 0.003, + "loss": 4.08, + "step": 15308 + }, + { + "epoch": 0.15309, + "grad_norm": 0.845367517646765, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 15309 + }, + { + "epoch": 0.1531, + "grad_norm": 0.9322488255292629, + "learning_rate": 0.003, + "loss": 4.09, + "step": 15310 + }, + { + "epoch": 0.15311, + "grad_norm": 0.9918762112315062, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 15311 + }, + { + "epoch": 0.15312, + "grad_norm": 0.9692327736408248, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 15312 + }, + { + "epoch": 0.15313, + "grad_norm": 1.0406673818535623, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 15313 + }, + { + "epoch": 0.15314, + "grad_norm": 0.9548435951190011, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 15314 + }, + { + "epoch": 0.15315, + "grad_norm": 0.8409777942134875, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 15315 + }, + { + "epoch": 0.15316, + "grad_norm": 0.788125148875671, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 15316 + }, + { + "epoch": 0.15317, + "grad_norm": 0.8256558652012488, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 15317 + }, + { + "epoch": 0.15318, + "grad_norm": 0.8197919997498069, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15318 + }, + { + "epoch": 0.15319, + "grad_norm": 0.8465533516485063, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 15319 + }, + { + "epoch": 0.1532, + "grad_norm": 0.8174693643961777, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 15320 + }, + { + "epoch": 0.15321, + "grad_norm": 0.7909793362987472, + "learning_rate": 0.003, + "loss": 4.071, + "step": 15321 + }, + { + "epoch": 0.15322, + "grad_norm": 0.8215613196846124, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 15322 + }, + { + "epoch": 0.15323, + "grad_norm": 0.8116288700030023, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 15323 + }, + { + "epoch": 0.15324, + "grad_norm": 0.7705645852664121, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 15324 + }, + { + "epoch": 0.15325, + "grad_norm": 0.7586459276769124, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 15325 + }, + { + "epoch": 0.15326, + "grad_norm": 0.8214815946327116, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15326 + }, + { + "epoch": 0.15327, + "grad_norm": 0.859227403114001, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 15327 + }, + { + "epoch": 0.15328, + "grad_norm": 0.9247243898742153, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 15328 + }, + { + "epoch": 0.15329, + "grad_norm": 1.0070150338670758, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 15329 + }, + { + "epoch": 0.1533, + "grad_norm": 1.048426345837903, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 15330 + }, + { + "epoch": 0.15331, + "grad_norm": 0.8691143494307864, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 15331 + }, + { + "epoch": 0.15332, + "grad_norm": 0.7418188611393264, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 15332 + }, + { + "epoch": 0.15333, + "grad_norm": 0.7223747846535831, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 15333 + }, + { + "epoch": 0.15334, + "grad_norm": 0.7181767443064082, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 15334 + }, + { + "epoch": 0.15335, + "grad_norm": 0.9082406051597949, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 15335 + }, + { + "epoch": 0.15336, + "grad_norm": 1.0659744240190578, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15336 + }, + { + "epoch": 0.15337, + "grad_norm": 0.8095698019551361, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 15337 + }, + { + "epoch": 0.15338, + "grad_norm": 0.7868786717459086, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 15338 + }, + { + "epoch": 0.15339, + "grad_norm": 0.8599956330432876, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 15339 + }, + { + "epoch": 0.1534, + "grad_norm": 0.7706506291807872, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 15340 + }, + { + "epoch": 0.15341, + "grad_norm": 0.8721086785843887, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 15341 + }, + { + "epoch": 0.15342, + "grad_norm": 1.0597943106109282, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 15342 + }, + { + "epoch": 0.15343, + "grad_norm": 1.0315648440749439, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 15343 + }, + { + "epoch": 0.15344, + "grad_norm": 1.0445752058541913, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 15344 + }, + { + "epoch": 0.15345, + "grad_norm": 1.126879153111509, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 15345 + }, + { + "epoch": 0.15346, + "grad_norm": 0.8758151127517635, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 15346 + }, + { + "epoch": 0.15347, + "grad_norm": 0.8887417297013118, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 15347 + }, + { + "epoch": 0.15348, + "grad_norm": 0.9078925324284505, + "learning_rate": 0.003, + "loss": 4.086, + "step": 15348 + }, + { + "epoch": 0.15349, + "grad_norm": 0.9437893858843688, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 15349 + }, + { + "epoch": 0.1535, + "grad_norm": 1.028551586928702, + "learning_rate": 0.003, + "loss": 4.083, + "step": 15350 + }, + { + "epoch": 0.15351, + "grad_norm": 0.9838643587197328, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 15351 + }, + { + "epoch": 0.15352, + "grad_norm": 1.0179072480536766, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 15352 + }, + { + "epoch": 0.15353, + "grad_norm": 1.0587174480745536, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 15353 + }, + { + "epoch": 0.15354, + "grad_norm": 0.7937917540579997, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 15354 + }, + { + "epoch": 0.15355, + "grad_norm": 0.7618401905739632, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 15355 + }, + { + "epoch": 0.15356, + "grad_norm": 0.7476017996224481, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 15356 + }, + { + "epoch": 0.15357, + "grad_norm": 0.7168312754858822, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 15357 + }, + { + "epoch": 0.15358, + "grad_norm": 0.6652411138469404, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 15358 + }, + { + "epoch": 0.15359, + "grad_norm": 0.5990865842852017, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 15359 + }, + { + "epoch": 0.1536, + "grad_norm": 0.610952128432108, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 15360 + }, + { + "epoch": 0.15361, + "grad_norm": 0.5850669319398033, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 15361 + }, + { + "epoch": 0.15362, + "grad_norm": 0.7146752691579432, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 15362 + }, + { + "epoch": 0.15363, + "grad_norm": 0.8886228198316868, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 15363 + }, + { + "epoch": 0.15364, + "grad_norm": 0.9598647006128489, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15364 + }, + { + "epoch": 0.15365, + "grad_norm": 0.9996920584638711, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 15365 + }, + { + "epoch": 0.15366, + "grad_norm": 1.0729749653640965, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 15366 + }, + { + "epoch": 0.15367, + "grad_norm": 0.8812989363422271, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 15367 + }, + { + "epoch": 0.15368, + "grad_norm": 0.9612399829762662, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15368 + }, + { + "epoch": 0.15369, + "grad_norm": 1.2373738936147978, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 15369 + }, + { + "epoch": 0.1537, + "grad_norm": 1.0254891506435038, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 15370 + }, + { + "epoch": 0.15371, + "grad_norm": 0.8992332799160001, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 15371 + }, + { + "epoch": 0.15372, + "grad_norm": 0.8148899378746298, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 15372 + }, + { + "epoch": 0.15373, + "grad_norm": 0.8704590629676359, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15373 + }, + { + "epoch": 0.15374, + "grad_norm": 1.0363916590663378, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 15374 + }, + { + "epoch": 0.15375, + "grad_norm": 0.8676314135519507, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 15375 + }, + { + "epoch": 0.15376, + "grad_norm": 0.7084009364673541, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 15376 + }, + { + "epoch": 0.15377, + "grad_norm": 0.698532730339605, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 15377 + }, + { + "epoch": 0.15378, + "grad_norm": 0.6100615528067144, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 15378 + }, + { + "epoch": 0.15379, + "grad_norm": 0.6477153996238283, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15379 + }, + { + "epoch": 0.1538, + "grad_norm": 0.6877543332267829, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 15380 + }, + { + "epoch": 0.15381, + "grad_norm": 0.6537584265239311, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 15381 + }, + { + "epoch": 0.15382, + "grad_norm": 0.763424128315055, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 15382 + }, + { + "epoch": 0.15383, + "grad_norm": 0.8125123084912943, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 15383 + }, + { + "epoch": 0.15384, + "grad_norm": 0.7606723703046439, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 15384 + }, + { + "epoch": 0.15385, + "grad_norm": 0.6884466700203843, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 15385 + }, + { + "epoch": 0.15386, + "grad_norm": 0.6467478719710251, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 15386 + }, + { + "epoch": 0.15387, + "grad_norm": 0.6245316876044793, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 15387 + }, + { + "epoch": 0.15388, + "grad_norm": 0.6356434538542645, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 15388 + }, + { + "epoch": 0.15389, + "grad_norm": 0.6373822402222443, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 15389 + }, + { + "epoch": 0.1539, + "grad_norm": 0.7718510561746118, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 15390 + }, + { + "epoch": 0.15391, + "grad_norm": 0.9061684310137172, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 15391 + }, + { + "epoch": 0.15392, + "grad_norm": 1.1775563525898003, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 15392 + }, + { + "epoch": 0.15393, + "grad_norm": 0.9732620045236304, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 15393 + }, + { + "epoch": 0.15394, + "grad_norm": 0.9950541024320676, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 15394 + }, + { + "epoch": 0.15395, + "grad_norm": 0.9328073183592768, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 15395 + }, + { + "epoch": 0.15396, + "grad_norm": 1.0172921681937264, + "learning_rate": 0.003, + "loss": 4.045, + "step": 15396 + }, + { + "epoch": 0.15397, + "grad_norm": 1.0314286160597368, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 15397 + }, + { + "epoch": 0.15398, + "grad_norm": 1.1131646114895066, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 15398 + }, + { + "epoch": 0.15399, + "grad_norm": 0.8061905602399027, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 15399 + }, + { + "epoch": 0.154, + "grad_norm": 0.7281692435789039, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 15400 + }, + { + "epoch": 0.15401, + "grad_norm": 0.7136606728727093, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 15401 + }, + { + "epoch": 0.15402, + "grad_norm": 0.7044475162085178, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 15402 + }, + { + "epoch": 0.15403, + "grad_norm": 0.7620517403211995, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 15403 + }, + { + "epoch": 0.15404, + "grad_norm": 0.8943543003820128, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 15404 + }, + { + "epoch": 0.15405, + "grad_norm": 1.0634772629309255, + "learning_rate": 0.003, + "loss": 4.1146, + "step": 15405 + }, + { + "epoch": 0.15406, + "grad_norm": 1.0489995344298069, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 15406 + }, + { + "epoch": 0.15407, + "grad_norm": 0.936834227930002, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 15407 + }, + { + "epoch": 0.15408, + "grad_norm": 0.8968233810962296, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 15408 + }, + { + "epoch": 0.15409, + "grad_norm": 0.8650125405473275, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 15409 + }, + { + "epoch": 0.1541, + "grad_norm": 0.8351046721298193, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 15410 + }, + { + "epoch": 0.15411, + "grad_norm": 0.8456992735792717, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 15411 + }, + { + "epoch": 0.15412, + "grad_norm": 0.9616974588856743, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 15412 + }, + { + "epoch": 0.15413, + "grad_norm": 1.228338969955763, + "learning_rate": 0.003, + "loss": 4.1184, + "step": 15413 + }, + { + "epoch": 0.15414, + "grad_norm": 0.9327203029167266, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 15414 + }, + { + "epoch": 0.15415, + "grad_norm": 1.0369311853927972, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 15415 + }, + { + "epoch": 0.15416, + "grad_norm": 1.1884505990050545, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 15416 + }, + { + "epoch": 0.15417, + "grad_norm": 0.9161430344272499, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 15417 + }, + { + "epoch": 0.15418, + "grad_norm": 1.091616793433311, + "learning_rate": 0.003, + "loss": 4.051, + "step": 15418 + }, + { + "epoch": 0.15419, + "grad_norm": 1.1469923028891025, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15419 + }, + { + "epoch": 0.1542, + "grad_norm": 0.8134990180415536, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 15420 + }, + { + "epoch": 0.15421, + "grad_norm": 0.749698599058815, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 15421 + }, + { + "epoch": 0.15422, + "grad_norm": 0.7126708577307878, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 15422 + }, + { + "epoch": 0.15423, + "grad_norm": 0.6560223291309407, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 15423 + }, + { + "epoch": 0.15424, + "grad_norm": 0.6434394591799395, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 15424 + }, + { + "epoch": 0.15425, + "grad_norm": 0.7214371154521346, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 15425 + }, + { + "epoch": 0.15426, + "grad_norm": 0.7707324717352958, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 15426 + }, + { + "epoch": 0.15427, + "grad_norm": 0.9784461671732944, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 15427 + }, + { + "epoch": 0.15428, + "grad_norm": 1.0751486596194002, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 15428 + }, + { + "epoch": 0.15429, + "grad_norm": 0.9342153187863622, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 15429 + }, + { + "epoch": 0.1543, + "grad_norm": 0.9635213215700588, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 15430 + }, + { + "epoch": 0.15431, + "grad_norm": 0.9691929750211792, + "learning_rate": 0.003, + "loss": 4.0975, + "step": 15431 + }, + { + "epoch": 0.15432, + "grad_norm": 0.857186961871027, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 15432 + }, + { + "epoch": 0.15433, + "grad_norm": 0.8988001941635466, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 15433 + }, + { + "epoch": 0.15434, + "grad_norm": 0.9092065583290543, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 15434 + }, + { + "epoch": 0.15435, + "grad_norm": 1.0763532230832278, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 15435 + }, + { + "epoch": 0.15436, + "grad_norm": 0.9015359583839574, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 15436 + }, + { + "epoch": 0.15437, + "grad_norm": 0.8159108758688398, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 15437 + }, + { + "epoch": 0.15438, + "grad_norm": 0.8669191192028983, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 15438 + }, + { + "epoch": 0.15439, + "grad_norm": 0.8622772622890306, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 15439 + }, + { + "epoch": 0.1544, + "grad_norm": 0.862024516715705, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 15440 + }, + { + "epoch": 0.15441, + "grad_norm": 1.0019341765054546, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 15441 + }, + { + "epoch": 0.15442, + "grad_norm": 1.0460953698286313, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 15442 + }, + { + "epoch": 0.15443, + "grad_norm": 0.9056468395580206, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 15443 + }, + { + "epoch": 0.15444, + "grad_norm": 0.8762762068790201, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 15444 + }, + { + "epoch": 0.15445, + "grad_norm": 0.9318923437666932, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 15445 + }, + { + "epoch": 0.15446, + "grad_norm": 1.0039644072078817, + "learning_rate": 0.003, + "loss": 4.06, + "step": 15446 + }, + { + "epoch": 0.15447, + "grad_norm": 1.1812029223095382, + "learning_rate": 0.003, + "loss": 4.083, + "step": 15447 + }, + { + "epoch": 0.15448, + "grad_norm": 0.7460119650318848, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 15448 + }, + { + "epoch": 0.15449, + "grad_norm": 0.6190852095293629, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 15449 + }, + { + "epoch": 0.1545, + "grad_norm": 0.7711414583657713, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 15450 + }, + { + "epoch": 0.15451, + "grad_norm": 0.8699766039956232, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15451 + }, + { + "epoch": 0.15452, + "grad_norm": 0.918779270026279, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 15452 + }, + { + "epoch": 0.15453, + "grad_norm": 1.0578539680005319, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 15453 + }, + { + "epoch": 0.15454, + "grad_norm": 1.1553769043442381, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 15454 + }, + { + "epoch": 0.15455, + "grad_norm": 0.8973035021789558, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 15455 + }, + { + "epoch": 0.15456, + "grad_norm": 0.8607516540225554, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 15456 + }, + { + "epoch": 0.15457, + "grad_norm": 0.8289164207339367, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 15457 + }, + { + "epoch": 0.15458, + "grad_norm": 0.7399216084926746, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 15458 + }, + { + "epoch": 0.15459, + "grad_norm": 0.6263654591934297, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 15459 + }, + { + "epoch": 0.1546, + "grad_norm": 0.6189360858960318, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 15460 + }, + { + "epoch": 0.15461, + "grad_norm": 0.6654599519087065, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 15461 + }, + { + "epoch": 0.15462, + "grad_norm": 0.6672983694893821, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 15462 + }, + { + "epoch": 0.15463, + "grad_norm": 0.7409763602890124, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 15463 + }, + { + "epoch": 0.15464, + "grad_norm": 0.7518234475164591, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 15464 + }, + { + "epoch": 0.15465, + "grad_norm": 0.8086846319741519, + "learning_rate": 0.003, + "loss": 4.046, + "step": 15465 + }, + { + "epoch": 0.15466, + "grad_norm": 0.8869002598209089, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 15466 + }, + { + "epoch": 0.15467, + "grad_norm": 1.1329410550211505, + "learning_rate": 0.003, + "loss": 4.069, + "step": 15467 + }, + { + "epoch": 0.15468, + "grad_norm": 1.0645004856880826, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 15468 + }, + { + "epoch": 0.15469, + "grad_norm": 1.0267216422241157, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 15469 + }, + { + "epoch": 0.1547, + "grad_norm": 0.9149964881198352, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 15470 + }, + { + "epoch": 0.15471, + "grad_norm": 0.8287218209214179, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 15471 + }, + { + "epoch": 0.15472, + "grad_norm": 0.7796472725168954, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 15472 + }, + { + "epoch": 0.15473, + "grad_norm": 0.7761389980881427, + "learning_rate": 0.003, + "loss": 4.084, + "step": 15473 + }, + { + "epoch": 0.15474, + "grad_norm": 0.7270120134919071, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 15474 + }, + { + "epoch": 0.15475, + "grad_norm": 0.7763197507518927, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 15475 + }, + { + "epoch": 0.15476, + "grad_norm": 0.7674069391230546, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 15476 + }, + { + "epoch": 0.15477, + "grad_norm": 0.7647874309242547, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 15477 + }, + { + "epoch": 0.15478, + "grad_norm": 0.7402283439445141, + "learning_rate": 0.003, + "loss": 4.056, + "step": 15478 + }, + { + "epoch": 0.15479, + "grad_norm": 0.7704264106087683, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 15479 + }, + { + "epoch": 0.1548, + "grad_norm": 0.8739289097868363, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 15480 + }, + { + "epoch": 0.15481, + "grad_norm": 1.1263533864300832, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15481 + }, + { + "epoch": 0.15482, + "grad_norm": 0.9525460025294208, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 15482 + }, + { + "epoch": 0.15483, + "grad_norm": 0.9333168866468142, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15483 + }, + { + "epoch": 0.15484, + "grad_norm": 0.9730324058784258, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 15484 + }, + { + "epoch": 0.15485, + "grad_norm": 0.909897383863647, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 15485 + }, + { + "epoch": 0.15486, + "grad_norm": 0.8009061464324777, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 15486 + }, + { + "epoch": 0.15487, + "grad_norm": 0.6404602622859169, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 15487 + }, + { + "epoch": 0.15488, + "grad_norm": 0.6460085282699142, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 15488 + }, + { + "epoch": 0.15489, + "grad_norm": 0.6917772168227212, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 15489 + }, + { + "epoch": 0.1549, + "grad_norm": 0.7452533445497697, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 15490 + }, + { + "epoch": 0.15491, + "grad_norm": 0.74914215694899, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 15491 + }, + { + "epoch": 0.15492, + "grad_norm": 0.6500003700792345, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 15492 + }, + { + "epoch": 0.15493, + "grad_norm": 0.7208607957872774, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 15493 + }, + { + "epoch": 0.15494, + "grad_norm": 0.7954539482905564, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 15494 + }, + { + "epoch": 0.15495, + "grad_norm": 1.1448476120062712, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 15495 + }, + { + "epoch": 0.15496, + "grad_norm": 1.0429540905268704, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 15496 + }, + { + "epoch": 0.15497, + "grad_norm": 0.9922058860462024, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 15497 + }, + { + "epoch": 0.15498, + "grad_norm": 1.0374318456690717, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 15498 + }, + { + "epoch": 0.15499, + "grad_norm": 0.850225728016009, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 15499 + }, + { + "epoch": 0.155, + "grad_norm": 0.823741912085178, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 15500 + }, + { + "epoch": 0.15501, + "grad_norm": 0.9238563261828668, + "learning_rate": 0.003, + "loss": 4.057, + "step": 15501 + }, + { + "epoch": 0.15502, + "grad_norm": 1.1054316400914197, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 15502 + }, + { + "epoch": 0.15503, + "grad_norm": 1.0008061464778677, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 15503 + }, + { + "epoch": 0.15504, + "grad_norm": 1.1001959250202507, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15504 + }, + { + "epoch": 0.15505, + "grad_norm": 0.8872338983350687, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 15505 + }, + { + "epoch": 0.15506, + "grad_norm": 0.9467325459884459, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 15506 + }, + { + "epoch": 0.15507, + "grad_norm": 1.009616961840951, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 15507 + }, + { + "epoch": 0.15508, + "grad_norm": 1.1703253045225837, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 15508 + }, + { + "epoch": 0.15509, + "grad_norm": 0.9910258078543399, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 15509 + }, + { + "epoch": 0.1551, + "grad_norm": 0.8590018006910455, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 15510 + }, + { + "epoch": 0.15511, + "grad_norm": 0.8221040719720489, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 15511 + }, + { + "epoch": 0.15512, + "grad_norm": 0.8550604390636547, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 15512 + }, + { + "epoch": 0.15513, + "grad_norm": 0.8035956560395793, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 15513 + }, + { + "epoch": 0.15514, + "grad_norm": 0.9233044007204195, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 15514 + }, + { + "epoch": 0.15515, + "grad_norm": 0.9381953022868085, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 15515 + }, + { + "epoch": 0.15516, + "grad_norm": 1.039866091398066, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 15516 + }, + { + "epoch": 0.15517, + "grad_norm": 1.170461779810142, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 15517 + }, + { + "epoch": 0.15518, + "grad_norm": 0.8593146424575652, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 15518 + }, + { + "epoch": 0.15519, + "grad_norm": 0.8535163044475705, + "learning_rate": 0.003, + "loss": 4.055, + "step": 15519 + }, + { + "epoch": 0.1552, + "grad_norm": 0.8908455444384552, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 15520 + }, + { + "epoch": 0.15521, + "grad_norm": 0.9530158401911695, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 15521 + }, + { + "epoch": 0.15522, + "grad_norm": 1.0710813249073143, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 15522 + }, + { + "epoch": 0.15523, + "grad_norm": 0.9547514449675653, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 15523 + }, + { + "epoch": 0.15524, + "grad_norm": 0.9708381670048761, + "learning_rate": 0.003, + "loss": 4.092, + "step": 15524 + }, + { + "epoch": 0.15525, + "grad_norm": 0.9281972432170034, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 15525 + }, + { + "epoch": 0.15526, + "grad_norm": 0.9452754971493649, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 15526 + }, + { + "epoch": 0.15527, + "grad_norm": 1.1336347797235296, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15527 + }, + { + "epoch": 0.15528, + "grad_norm": 0.9822485734434496, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 15528 + }, + { + "epoch": 0.15529, + "grad_norm": 0.9144468853148353, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 15529 + }, + { + "epoch": 0.1553, + "grad_norm": 0.9621912015091932, + "learning_rate": 0.003, + "loss": 4.1074, + "step": 15530 + }, + { + "epoch": 0.15531, + "grad_norm": 0.834701156342796, + "learning_rate": 0.003, + "loss": 4.064, + "step": 15531 + }, + { + "epoch": 0.15532, + "grad_norm": 0.7038072073156139, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 15532 + }, + { + "epoch": 0.15533, + "grad_norm": 0.7864258445634408, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 15533 + }, + { + "epoch": 0.15534, + "grad_norm": 0.8498937190027228, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15534 + }, + { + "epoch": 0.15535, + "grad_norm": 0.9988490320319162, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 15535 + }, + { + "epoch": 0.15536, + "grad_norm": 1.104970485487583, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15536 + }, + { + "epoch": 0.15537, + "grad_norm": 0.8122962187221906, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 15537 + }, + { + "epoch": 0.15538, + "grad_norm": 0.6829837254445589, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 15538 + }, + { + "epoch": 0.15539, + "grad_norm": 0.6239711604895152, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 15539 + }, + { + "epoch": 0.1554, + "grad_norm": 0.5897056435055278, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 15540 + }, + { + "epoch": 0.15541, + "grad_norm": 0.5551094437362032, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15541 + }, + { + "epoch": 0.15542, + "grad_norm": 0.56511733964809, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 15542 + }, + { + "epoch": 0.15543, + "grad_norm": 0.6117786371457542, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 15543 + }, + { + "epoch": 0.15544, + "grad_norm": 0.7046444160464634, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 15544 + }, + { + "epoch": 0.15545, + "grad_norm": 0.9654374717196152, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 15545 + }, + { + "epoch": 0.15546, + "grad_norm": 1.2749271054755769, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 15546 + }, + { + "epoch": 0.15547, + "grad_norm": 0.745483960991596, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 15547 + }, + { + "epoch": 0.15548, + "grad_norm": 0.7855719930691469, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 15548 + }, + { + "epoch": 0.15549, + "grad_norm": 0.7601840785983824, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15549 + }, + { + "epoch": 0.1555, + "grad_norm": 0.8142037516037431, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 15550 + }, + { + "epoch": 0.15551, + "grad_norm": 0.7949631614947104, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 15551 + }, + { + "epoch": 0.15552, + "grad_norm": 0.716858353732648, + "learning_rate": 0.003, + "loss": 4.046, + "step": 15552 + }, + { + "epoch": 0.15553, + "grad_norm": 0.8449742704018046, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 15553 + }, + { + "epoch": 0.15554, + "grad_norm": 0.9028674042410417, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 15554 + }, + { + "epoch": 0.15555, + "grad_norm": 1.0332359239049087, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 15555 + }, + { + "epoch": 0.15556, + "grad_norm": 1.3285988413244725, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 15556 + }, + { + "epoch": 0.15557, + "grad_norm": 0.82007769013011, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 15557 + }, + { + "epoch": 0.15558, + "grad_norm": 0.6894776810610859, + "learning_rate": 0.003, + "loss": 4.065, + "step": 15558 + }, + { + "epoch": 0.15559, + "grad_norm": 0.6316581580258641, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 15559 + }, + { + "epoch": 0.1556, + "grad_norm": 0.7116632678924181, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 15560 + }, + { + "epoch": 0.15561, + "grad_norm": 0.82837303506694, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 15561 + }, + { + "epoch": 0.15562, + "grad_norm": 0.9611383598681128, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15562 + }, + { + "epoch": 0.15563, + "grad_norm": 1.092475113649714, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 15563 + }, + { + "epoch": 0.15564, + "grad_norm": 1.060393024021851, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15564 + }, + { + "epoch": 0.15565, + "grad_norm": 1.0487472071725026, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 15565 + }, + { + "epoch": 0.15566, + "grad_norm": 0.9463545538800499, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 15566 + }, + { + "epoch": 0.15567, + "grad_norm": 0.7501415029738309, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15567 + }, + { + "epoch": 0.15568, + "grad_norm": 0.6219431825249087, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 15568 + }, + { + "epoch": 0.15569, + "grad_norm": 0.6918215471451595, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 15569 + }, + { + "epoch": 0.1557, + "grad_norm": 0.8086146257580983, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 15570 + }, + { + "epoch": 0.15571, + "grad_norm": 0.8533713854330912, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 15571 + }, + { + "epoch": 0.15572, + "grad_norm": 0.8780257870236586, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 15572 + }, + { + "epoch": 0.15573, + "grad_norm": 0.8496333543849813, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 15573 + }, + { + "epoch": 0.15574, + "grad_norm": 0.9110526708886625, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 15574 + }, + { + "epoch": 0.15575, + "grad_norm": 1.015019738431502, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15575 + }, + { + "epoch": 0.15576, + "grad_norm": 0.969549567680602, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 15576 + }, + { + "epoch": 0.15577, + "grad_norm": 0.9051056955696452, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 15577 + }, + { + "epoch": 0.15578, + "grad_norm": 0.964665349585722, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 15578 + }, + { + "epoch": 0.15579, + "grad_norm": 1.1679976408997053, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15579 + }, + { + "epoch": 0.1558, + "grad_norm": 1.102628850605355, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 15580 + }, + { + "epoch": 0.15581, + "grad_norm": 0.9956977239385475, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 15581 + }, + { + "epoch": 0.15582, + "grad_norm": 1.097055256038559, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 15582 + }, + { + "epoch": 0.15583, + "grad_norm": 1.0646531800168875, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 15583 + }, + { + "epoch": 0.15584, + "grad_norm": 0.8760851156942279, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 15584 + }, + { + "epoch": 0.15585, + "grad_norm": 0.8943881993935535, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 15585 + }, + { + "epoch": 0.15586, + "grad_norm": 0.9190415541194136, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 15586 + }, + { + "epoch": 0.15587, + "grad_norm": 0.8556050544661058, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 15587 + }, + { + "epoch": 0.15588, + "grad_norm": 0.6945857287160594, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 15588 + }, + { + "epoch": 0.15589, + "grad_norm": 0.6471982189951477, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 15589 + }, + { + "epoch": 0.1559, + "grad_norm": 0.6662768618836037, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 15590 + }, + { + "epoch": 0.15591, + "grad_norm": 0.7599561337701207, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 15591 + }, + { + "epoch": 0.15592, + "grad_norm": 0.775076448494908, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 15592 + }, + { + "epoch": 0.15593, + "grad_norm": 0.9174076140493366, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 15593 + }, + { + "epoch": 0.15594, + "grad_norm": 1.2169025906499393, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 15594 + }, + { + "epoch": 0.15595, + "grad_norm": 0.9000408502979979, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 15595 + }, + { + "epoch": 0.15596, + "grad_norm": 0.6406303812655746, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 15596 + }, + { + "epoch": 0.15597, + "grad_norm": 0.7788255556972971, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 15597 + }, + { + "epoch": 0.15598, + "grad_norm": 1.0347369074474213, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 15598 + }, + { + "epoch": 0.15599, + "grad_norm": 1.035064376849458, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 15599 + }, + { + "epoch": 0.156, + "grad_norm": 0.9692133530209425, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 15600 + }, + { + "epoch": 0.15601, + "grad_norm": 1.0450323767106415, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 15601 + }, + { + "epoch": 0.15602, + "grad_norm": 1.0699803348981658, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 15602 + }, + { + "epoch": 0.15603, + "grad_norm": 1.0875426637442756, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 15603 + }, + { + "epoch": 0.15604, + "grad_norm": 0.8694194830067608, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 15604 + }, + { + "epoch": 0.15605, + "grad_norm": 0.8734698103010141, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 15605 + }, + { + "epoch": 0.15606, + "grad_norm": 0.9072438647082046, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 15606 + }, + { + "epoch": 0.15607, + "grad_norm": 0.8103647416641858, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 15607 + }, + { + "epoch": 0.15608, + "grad_norm": 0.7938669168603442, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 15608 + }, + { + "epoch": 0.15609, + "grad_norm": 0.751983038353267, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 15609 + }, + { + "epoch": 0.1561, + "grad_norm": 0.8291569967019005, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 15610 + }, + { + "epoch": 0.15611, + "grad_norm": 0.9380251030634215, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 15611 + }, + { + "epoch": 0.15612, + "grad_norm": 0.9915590564103034, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 15612 + }, + { + "epoch": 0.15613, + "grad_norm": 1.1092022208759647, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 15613 + }, + { + "epoch": 0.15614, + "grad_norm": 0.7807231605511251, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 15614 + }, + { + "epoch": 0.15615, + "grad_norm": 0.7924724999912204, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 15615 + }, + { + "epoch": 0.15616, + "grad_norm": 0.8434872336299037, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 15616 + }, + { + "epoch": 0.15617, + "grad_norm": 0.8657148855571658, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 15617 + }, + { + "epoch": 0.15618, + "grad_norm": 0.8000624026317716, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 15618 + }, + { + "epoch": 0.15619, + "grad_norm": 0.7572274328551856, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 15619 + }, + { + "epoch": 0.1562, + "grad_norm": 0.8103320823809439, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 15620 + }, + { + "epoch": 0.15621, + "grad_norm": 0.7878389534066891, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 15621 + }, + { + "epoch": 0.15622, + "grad_norm": 0.7443935066134619, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 15622 + }, + { + "epoch": 0.15623, + "grad_norm": 0.6766944161961428, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 15623 + }, + { + "epoch": 0.15624, + "grad_norm": 0.697385615345915, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 15624 + }, + { + "epoch": 0.15625, + "grad_norm": 0.8204428082539018, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15625 + }, + { + "epoch": 0.15626, + "grad_norm": 0.8759598190282629, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 15626 + }, + { + "epoch": 0.15627, + "grad_norm": 0.9387044096573821, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 15627 + }, + { + "epoch": 0.15628, + "grad_norm": 1.3407919288523218, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 15628 + }, + { + "epoch": 0.15629, + "grad_norm": 0.8778475783479913, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15629 + }, + { + "epoch": 0.1563, + "grad_norm": 0.8032525466694488, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 15630 + }, + { + "epoch": 0.15631, + "grad_norm": 0.7706529729065191, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 15631 + }, + { + "epoch": 0.15632, + "grad_norm": 0.777650827857467, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15632 + }, + { + "epoch": 0.15633, + "grad_norm": 0.7507406733082125, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 15633 + }, + { + "epoch": 0.15634, + "grad_norm": 0.7526007500523764, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 15634 + }, + { + "epoch": 0.15635, + "grad_norm": 0.873807783654368, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 15635 + }, + { + "epoch": 0.15636, + "grad_norm": 1.082259618076844, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 15636 + }, + { + "epoch": 0.15637, + "grad_norm": 1.2133033360040355, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 15637 + }, + { + "epoch": 0.15638, + "grad_norm": 0.7981373134230838, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 15638 + }, + { + "epoch": 0.15639, + "grad_norm": 0.7395655181129475, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 15639 + }, + { + "epoch": 0.1564, + "grad_norm": 0.7191230219618001, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 15640 + }, + { + "epoch": 0.15641, + "grad_norm": 0.8915858040822598, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 15641 + }, + { + "epoch": 0.15642, + "grad_norm": 1.049270737940305, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 15642 + }, + { + "epoch": 0.15643, + "grad_norm": 1.0616572559274393, + "learning_rate": 0.003, + "loss": 4.092, + "step": 15643 + }, + { + "epoch": 0.15644, + "grad_norm": 1.1734717817211473, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 15644 + }, + { + "epoch": 0.15645, + "grad_norm": 0.908989516417746, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 15645 + }, + { + "epoch": 0.15646, + "grad_norm": 0.7782227095912202, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 15646 + }, + { + "epoch": 0.15647, + "grad_norm": 0.7433683006151823, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 15647 + }, + { + "epoch": 0.15648, + "grad_norm": 0.7168902425258895, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 15648 + }, + { + "epoch": 0.15649, + "grad_norm": 0.8293715696958915, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 15649 + }, + { + "epoch": 0.1565, + "grad_norm": 1.121938905967932, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 15650 + }, + { + "epoch": 0.15651, + "grad_norm": 1.1008889824817694, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 15651 + }, + { + "epoch": 0.15652, + "grad_norm": 0.9041408844798634, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 15652 + }, + { + "epoch": 0.15653, + "grad_norm": 0.9356524254061378, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 15653 + }, + { + "epoch": 0.15654, + "grad_norm": 1.017812118603101, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 15654 + }, + { + "epoch": 0.15655, + "grad_norm": 0.8849690125000396, + "learning_rate": 0.003, + "loss": 4.1142, + "step": 15655 + }, + { + "epoch": 0.15656, + "grad_norm": 0.80066641325765, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 15656 + }, + { + "epoch": 0.15657, + "grad_norm": 0.7720592171862735, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 15657 + }, + { + "epoch": 0.15658, + "grad_norm": 0.8310448609370841, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 15658 + }, + { + "epoch": 0.15659, + "grad_norm": 0.8911564350069221, + "learning_rate": 0.003, + "loss": 4.089, + "step": 15659 + }, + { + "epoch": 0.1566, + "grad_norm": 0.9011121850370314, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 15660 + }, + { + "epoch": 0.15661, + "grad_norm": 0.8308295448043017, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 15661 + }, + { + "epoch": 0.15662, + "grad_norm": 0.8220893814376936, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 15662 + }, + { + "epoch": 0.15663, + "grad_norm": 0.8508278807804223, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 15663 + }, + { + "epoch": 0.15664, + "grad_norm": 0.8810822277320196, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 15664 + }, + { + "epoch": 0.15665, + "grad_norm": 0.8979826756292933, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 15665 + }, + { + "epoch": 0.15666, + "grad_norm": 1.0794416018717115, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 15666 + }, + { + "epoch": 0.15667, + "grad_norm": 1.1323057680415858, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 15667 + }, + { + "epoch": 0.15668, + "grad_norm": 0.9457768178013904, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 15668 + }, + { + "epoch": 0.15669, + "grad_norm": 0.822780495101964, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 15669 + }, + { + "epoch": 0.1567, + "grad_norm": 0.7925062459807524, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 15670 + }, + { + "epoch": 0.15671, + "grad_norm": 0.7373144217051462, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 15671 + }, + { + "epoch": 0.15672, + "grad_norm": 0.6886291315707866, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 15672 + }, + { + "epoch": 0.15673, + "grad_norm": 0.744018490301524, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 15673 + }, + { + "epoch": 0.15674, + "grad_norm": 0.898881703571009, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 15674 + }, + { + "epoch": 0.15675, + "grad_norm": 1.0405910871431263, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 15675 + }, + { + "epoch": 0.15676, + "grad_norm": 1.096964710533686, + "learning_rate": 0.003, + "loss": 4.091, + "step": 15676 + }, + { + "epoch": 0.15677, + "grad_norm": 0.9826366381431736, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 15677 + }, + { + "epoch": 0.15678, + "grad_norm": 0.8258066792600829, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15678 + }, + { + "epoch": 0.15679, + "grad_norm": 0.6680673033501574, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15679 + }, + { + "epoch": 0.1568, + "grad_norm": 0.6398973115746407, + "learning_rate": 0.003, + "loss": 4.052, + "step": 15680 + }, + { + "epoch": 0.15681, + "grad_norm": 0.7222595822300055, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 15681 + }, + { + "epoch": 0.15682, + "grad_norm": 0.8086635202275088, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 15682 + }, + { + "epoch": 0.15683, + "grad_norm": 0.977112135109623, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 15683 + }, + { + "epoch": 0.15684, + "grad_norm": 1.109455056486782, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 15684 + }, + { + "epoch": 0.15685, + "grad_norm": 0.9090749241980511, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 15685 + }, + { + "epoch": 0.15686, + "grad_norm": 0.9628260634567355, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 15686 + }, + { + "epoch": 0.15687, + "grad_norm": 0.9642946528879622, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 15687 + }, + { + "epoch": 0.15688, + "grad_norm": 0.854095914822299, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 15688 + }, + { + "epoch": 0.15689, + "grad_norm": 0.8479285272959167, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 15689 + }, + { + "epoch": 0.1569, + "grad_norm": 0.8781520499741814, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 15690 + }, + { + "epoch": 0.15691, + "grad_norm": 0.9017737170528728, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 15691 + }, + { + "epoch": 0.15692, + "grad_norm": 1.0384170734104432, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 15692 + }, + { + "epoch": 0.15693, + "grad_norm": 1.0401987567002433, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 15693 + }, + { + "epoch": 0.15694, + "grad_norm": 1.0079068238266227, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 15694 + }, + { + "epoch": 0.15695, + "grad_norm": 1.0256911911475113, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 15695 + }, + { + "epoch": 0.15696, + "grad_norm": 0.9743371047297614, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15696 + }, + { + "epoch": 0.15697, + "grad_norm": 0.9372429085578399, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 15697 + }, + { + "epoch": 0.15698, + "grad_norm": 0.8421375798506733, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 15698 + }, + { + "epoch": 0.15699, + "grad_norm": 0.8581421722938858, + "learning_rate": 0.003, + "loss": 4.1093, + "step": 15699 + }, + { + "epoch": 0.157, + "grad_norm": 0.9545500654908698, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 15700 + }, + { + "epoch": 0.15701, + "grad_norm": 0.9543262188975177, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 15701 + }, + { + "epoch": 0.15702, + "grad_norm": 0.9627815404535414, + "learning_rate": 0.003, + "loss": 4.069, + "step": 15702 + }, + { + "epoch": 0.15703, + "grad_norm": 0.9918419255125808, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15703 + }, + { + "epoch": 0.15704, + "grad_norm": 1.0131857783053495, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15704 + }, + { + "epoch": 0.15705, + "grad_norm": 1.0257031076667074, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 15705 + }, + { + "epoch": 0.15706, + "grad_norm": 1.0286957803718653, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 15706 + }, + { + "epoch": 0.15707, + "grad_norm": 0.9186980426032404, + "learning_rate": 0.003, + "loss": 4.076, + "step": 15707 + }, + { + "epoch": 0.15708, + "grad_norm": 0.7730894632026283, + "learning_rate": 0.003, + "loss": 4.077, + "step": 15708 + }, + { + "epoch": 0.15709, + "grad_norm": 0.7607058204213273, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 15709 + }, + { + "epoch": 0.1571, + "grad_norm": 0.847158509950108, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 15710 + }, + { + "epoch": 0.15711, + "grad_norm": 0.7855540977503699, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 15711 + }, + { + "epoch": 0.15712, + "grad_norm": 0.7590691475382797, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 15712 + }, + { + "epoch": 0.15713, + "grad_norm": 0.7515180294257787, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 15713 + }, + { + "epoch": 0.15714, + "grad_norm": 0.9504277167206577, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 15714 + }, + { + "epoch": 0.15715, + "grad_norm": 1.198440609131435, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15715 + }, + { + "epoch": 0.15716, + "grad_norm": 1.0858578296053583, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 15716 + }, + { + "epoch": 0.15717, + "grad_norm": 1.0199720058816553, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 15717 + }, + { + "epoch": 0.15718, + "grad_norm": 0.8520616767350729, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 15718 + }, + { + "epoch": 0.15719, + "grad_norm": 0.8426834502301647, + "learning_rate": 0.003, + "loss": 4.074, + "step": 15719 + }, + { + "epoch": 0.1572, + "grad_norm": 0.7956666337344602, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 15720 + }, + { + "epoch": 0.15721, + "grad_norm": 0.8129533212501311, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 15721 + }, + { + "epoch": 0.15722, + "grad_norm": 0.7928362352001691, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 15722 + }, + { + "epoch": 0.15723, + "grad_norm": 0.829214299157527, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 15723 + }, + { + "epoch": 0.15724, + "grad_norm": 0.919715823163542, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 15724 + }, + { + "epoch": 0.15725, + "grad_norm": 1.1444190031629045, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 15725 + }, + { + "epoch": 0.15726, + "grad_norm": 0.9073740489717936, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 15726 + }, + { + "epoch": 0.15727, + "grad_norm": 0.7513967054772299, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 15727 + }, + { + "epoch": 0.15728, + "grad_norm": 0.6879061506877912, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 15728 + }, + { + "epoch": 0.15729, + "grad_norm": 0.7177779412606914, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 15729 + }, + { + "epoch": 0.1573, + "grad_norm": 0.694528917822137, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 15730 + }, + { + "epoch": 0.15731, + "grad_norm": 0.5816306084682613, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 15731 + }, + { + "epoch": 0.15732, + "grad_norm": 0.6184272707278754, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 15732 + }, + { + "epoch": 0.15733, + "grad_norm": 0.6447521953877523, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 15733 + }, + { + "epoch": 0.15734, + "grad_norm": 0.7368951937763425, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15734 + }, + { + "epoch": 0.15735, + "grad_norm": 0.8355992550878152, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 15735 + }, + { + "epoch": 0.15736, + "grad_norm": 1.0353058417685639, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 15736 + }, + { + "epoch": 0.15737, + "grad_norm": 0.9629005530656198, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 15737 + }, + { + "epoch": 0.15738, + "grad_norm": 1.1452608606326131, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 15738 + }, + { + "epoch": 0.15739, + "grad_norm": 0.8067676323285322, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 15739 + }, + { + "epoch": 0.1574, + "grad_norm": 0.6876944333280933, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 15740 + }, + { + "epoch": 0.15741, + "grad_norm": 0.6487259660436814, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 15741 + }, + { + "epoch": 0.15742, + "grad_norm": 0.6728003567964077, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 15742 + }, + { + "epoch": 0.15743, + "grad_norm": 0.7232384826137596, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 15743 + }, + { + "epoch": 0.15744, + "grad_norm": 0.7748727714315641, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 15744 + }, + { + "epoch": 0.15745, + "grad_norm": 0.9137185643541321, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 15745 + }, + { + "epoch": 0.15746, + "grad_norm": 1.0152304776651153, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 15746 + }, + { + "epoch": 0.15747, + "grad_norm": 1.2635504910008515, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15747 + }, + { + "epoch": 0.15748, + "grad_norm": 0.9544033390249997, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 15748 + }, + { + "epoch": 0.15749, + "grad_norm": 0.8535703853345411, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 15749 + }, + { + "epoch": 0.1575, + "grad_norm": 0.9144994248160686, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15750 + }, + { + "epoch": 0.15751, + "grad_norm": 1.1130684787630263, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 15751 + }, + { + "epoch": 0.15752, + "grad_norm": 0.9559660859466867, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 15752 + }, + { + "epoch": 0.15753, + "grad_norm": 0.9439558528154831, + "learning_rate": 0.003, + "loss": 4.085, + "step": 15753 + }, + { + "epoch": 0.15754, + "grad_norm": 1.0810191752234264, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 15754 + }, + { + "epoch": 0.15755, + "grad_norm": 0.9101250314218337, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 15755 + }, + { + "epoch": 0.15756, + "grad_norm": 0.80897549279076, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 15756 + }, + { + "epoch": 0.15757, + "grad_norm": 0.6965237862948986, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 15757 + }, + { + "epoch": 0.15758, + "grad_norm": 0.8162714537084819, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 15758 + }, + { + "epoch": 0.15759, + "grad_norm": 0.9764982859149774, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 15759 + }, + { + "epoch": 0.1576, + "grad_norm": 1.0017412746249308, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 15760 + }, + { + "epoch": 0.15761, + "grad_norm": 0.8578906445848595, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 15761 + }, + { + "epoch": 0.15762, + "grad_norm": 0.7995798485272619, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 15762 + }, + { + "epoch": 0.15763, + "grad_norm": 0.8079100884732389, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 15763 + }, + { + "epoch": 0.15764, + "grad_norm": 0.8170771717330821, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 15764 + }, + { + "epoch": 0.15765, + "grad_norm": 0.8462133152211715, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 15765 + }, + { + "epoch": 0.15766, + "grad_norm": 0.9314144890134789, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 15766 + }, + { + "epoch": 0.15767, + "grad_norm": 0.9572502108046494, + "learning_rate": 0.003, + "loss": 4.096, + "step": 15767 + }, + { + "epoch": 0.15768, + "grad_norm": 1.1736672094140246, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 15768 + }, + { + "epoch": 0.15769, + "grad_norm": 0.9717898579580005, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 15769 + }, + { + "epoch": 0.1577, + "grad_norm": 0.9281000936883017, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 15770 + }, + { + "epoch": 0.15771, + "grad_norm": 0.9756625332243586, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15771 + }, + { + "epoch": 0.15772, + "grad_norm": 1.0761356894686833, + "learning_rate": 0.003, + "loss": 4.1099, + "step": 15772 + }, + { + "epoch": 0.15773, + "grad_norm": 1.0718172437686129, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 15773 + }, + { + "epoch": 0.15774, + "grad_norm": 0.8564932899366225, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 15774 + }, + { + "epoch": 0.15775, + "grad_norm": 0.8429720031221283, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 15775 + }, + { + "epoch": 0.15776, + "grad_norm": 0.8145436415841807, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 15776 + }, + { + "epoch": 0.15777, + "grad_norm": 0.7495825811763993, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 15777 + }, + { + "epoch": 0.15778, + "grad_norm": 0.66056685707336, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 15778 + }, + { + "epoch": 0.15779, + "grad_norm": 0.7236335334459462, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 15779 + }, + { + "epoch": 0.1578, + "grad_norm": 0.7928322617373862, + "learning_rate": 0.003, + "loss": 4.078, + "step": 15780 + }, + { + "epoch": 0.15781, + "grad_norm": 0.9811309359272324, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 15781 + }, + { + "epoch": 0.15782, + "grad_norm": 1.2922646757724578, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 15782 + }, + { + "epoch": 0.15783, + "grad_norm": 0.613175412425735, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 15783 + }, + { + "epoch": 0.15784, + "grad_norm": 0.8101068988943512, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 15784 + }, + { + "epoch": 0.15785, + "grad_norm": 1.0459800872368754, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 15785 + }, + { + "epoch": 0.15786, + "grad_norm": 0.9335789478468234, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 15786 + }, + { + "epoch": 0.15787, + "grad_norm": 0.8549098445919725, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 15787 + }, + { + "epoch": 0.15788, + "grad_norm": 0.6652901808347063, + "learning_rate": 0.003, + "loss": 4.086, + "step": 15788 + }, + { + "epoch": 0.15789, + "grad_norm": 0.7645271509465255, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 15789 + }, + { + "epoch": 0.1579, + "grad_norm": 0.881282108531678, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 15790 + }, + { + "epoch": 0.15791, + "grad_norm": 0.8842507702863724, + "learning_rate": 0.003, + "loss": 4.079, + "step": 15791 + }, + { + "epoch": 0.15792, + "grad_norm": 0.9161298484082427, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 15792 + }, + { + "epoch": 0.15793, + "grad_norm": 0.8918615565759648, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 15793 + }, + { + "epoch": 0.15794, + "grad_norm": 0.8461781028666253, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 15794 + }, + { + "epoch": 0.15795, + "grad_norm": 0.796053179876573, + "learning_rate": 0.003, + "loss": 4.082, + "step": 15795 + }, + { + "epoch": 0.15796, + "grad_norm": 0.7884645267478809, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 15796 + }, + { + "epoch": 0.15797, + "grad_norm": 0.8402589525212326, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 15797 + }, + { + "epoch": 0.15798, + "grad_norm": 0.9791280961884749, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 15798 + }, + { + "epoch": 0.15799, + "grad_norm": 1.0466519313257117, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15799 + }, + { + "epoch": 0.158, + "grad_norm": 0.9546207591549601, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 15800 + }, + { + "epoch": 0.15801, + "grad_norm": 0.9295367269240018, + "learning_rate": 0.003, + "loss": 4.066, + "step": 15801 + }, + { + "epoch": 0.15802, + "grad_norm": 0.8491558503792144, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 15802 + }, + { + "epoch": 0.15803, + "grad_norm": 0.7909596313425348, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 15803 + }, + { + "epoch": 0.15804, + "grad_norm": 0.8561291185647388, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 15804 + }, + { + "epoch": 0.15805, + "grad_norm": 0.9291004483511431, + "learning_rate": 0.003, + "loss": 4.09, + "step": 15805 + }, + { + "epoch": 0.15806, + "grad_norm": 1.0798417303303218, + "learning_rate": 0.003, + "loss": 4.09, + "step": 15806 + }, + { + "epoch": 0.15807, + "grad_norm": 0.9130461523318819, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 15807 + }, + { + "epoch": 0.15808, + "grad_norm": 0.7153070233852723, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 15808 + }, + { + "epoch": 0.15809, + "grad_norm": 0.6466685309472603, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15809 + }, + { + "epoch": 0.1581, + "grad_norm": 0.7137648139929387, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 15810 + }, + { + "epoch": 0.15811, + "grad_norm": 0.7371865634872482, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 15811 + }, + { + "epoch": 0.15812, + "grad_norm": 0.666837145811955, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 15812 + }, + { + "epoch": 0.15813, + "grad_norm": 0.6849383084673003, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 15813 + }, + { + "epoch": 0.15814, + "grad_norm": 0.7690573260358415, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 15814 + }, + { + "epoch": 0.15815, + "grad_norm": 0.9014056808320806, + "learning_rate": 0.003, + "loss": 4.06, + "step": 15815 + }, + { + "epoch": 0.15816, + "grad_norm": 0.9974616340619585, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15816 + }, + { + "epoch": 0.15817, + "grad_norm": 1.0018362625024821, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 15817 + }, + { + "epoch": 0.15818, + "grad_norm": 1.0045306907978533, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 15818 + }, + { + "epoch": 0.15819, + "grad_norm": 1.044796734807557, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 15819 + }, + { + "epoch": 0.1582, + "grad_norm": 1.1060677062990496, + "learning_rate": 0.003, + "loss": 4.1162, + "step": 15820 + }, + { + "epoch": 0.15821, + "grad_norm": 0.9625014805197428, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 15821 + }, + { + "epoch": 0.15822, + "grad_norm": 1.1276907425219156, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 15822 + }, + { + "epoch": 0.15823, + "grad_norm": 0.9231495786036672, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 15823 + }, + { + "epoch": 0.15824, + "grad_norm": 0.7210559162759669, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15824 + }, + { + "epoch": 0.15825, + "grad_norm": 0.6689758088881779, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 15825 + }, + { + "epoch": 0.15826, + "grad_norm": 0.7991481396230212, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 15826 + }, + { + "epoch": 0.15827, + "grad_norm": 0.9195265416903654, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 15827 + }, + { + "epoch": 0.15828, + "grad_norm": 0.9227451049506976, + "learning_rate": 0.003, + "loss": 4.05, + "step": 15828 + }, + { + "epoch": 0.15829, + "grad_norm": 1.183853321591796, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 15829 + }, + { + "epoch": 0.1583, + "grad_norm": 1.116908744750503, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 15830 + }, + { + "epoch": 0.15831, + "grad_norm": 0.9280804561427755, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15831 + }, + { + "epoch": 0.15832, + "grad_norm": 0.6993523895866569, + "learning_rate": 0.003, + "loss": 4.069, + "step": 15832 + }, + { + "epoch": 0.15833, + "grad_norm": 0.6778153881642555, + "learning_rate": 0.003, + "loss": 4.041, + "step": 15833 + }, + { + "epoch": 0.15834, + "grad_norm": 0.6710842473104999, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 15834 + }, + { + "epoch": 0.15835, + "grad_norm": 0.7940880968918053, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 15835 + }, + { + "epoch": 0.15836, + "grad_norm": 0.9486913042238366, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 15836 + }, + { + "epoch": 0.15837, + "grad_norm": 0.9574558816711672, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 15837 + }, + { + "epoch": 0.15838, + "grad_norm": 0.8016829926384446, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 15838 + }, + { + "epoch": 0.15839, + "grad_norm": 0.8767245454791264, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 15839 + }, + { + "epoch": 0.1584, + "grad_norm": 0.9753319003274548, + "learning_rate": 0.003, + "loss": 4.061, + "step": 15840 + }, + { + "epoch": 0.15841, + "grad_norm": 0.935985741070316, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 15841 + }, + { + "epoch": 0.15842, + "grad_norm": 0.9731181664130097, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 15842 + }, + { + "epoch": 0.15843, + "grad_norm": 0.9810573764018233, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15843 + }, + { + "epoch": 0.15844, + "grad_norm": 1.0527692285822723, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 15844 + }, + { + "epoch": 0.15845, + "grad_norm": 0.9724371067157719, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 15845 + }, + { + "epoch": 0.15846, + "grad_norm": 0.9355618632943368, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 15846 + }, + { + "epoch": 0.15847, + "grad_norm": 0.8765585917398276, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 15847 + }, + { + "epoch": 0.15848, + "grad_norm": 0.8960885793948482, + "learning_rate": 0.003, + "loss": 4.059, + "step": 15848 + }, + { + "epoch": 0.15849, + "grad_norm": 0.9078083133473381, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 15849 + }, + { + "epoch": 0.1585, + "grad_norm": 0.9588368589979, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 15850 + }, + { + "epoch": 0.15851, + "grad_norm": 1.0538067692342266, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 15851 + }, + { + "epoch": 0.15852, + "grad_norm": 0.8705415590237465, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 15852 + }, + { + "epoch": 0.15853, + "grad_norm": 0.7556140365323665, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 15853 + }, + { + "epoch": 0.15854, + "grad_norm": 0.708190257785594, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 15854 + }, + { + "epoch": 0.15855, + "grad_norm": 0.7239228060794102, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 15855 + }, + { + "epoch": 0.15856, + "grad_norm": 0.7071674787914626, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 15856 + }, + { + "epoch": 0.15857, + "grad_norm": 0.9379712638299085, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15857 + }, + { + "epoch": 0.15858, + "grad_norm": 1.1165791795695725, + "learning_rate": 0.003, + "loss": 4.095, + "step": 15858 + }, + { + "epoch": 0.15859, + "grad_norm": 0.8722408835796578, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 15859 + }, + { + "epoch": 0.1586, + "grad_norm": 0.7530543252965523, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 15860 + }, + { + "epoch": 0.15861, + "grad_norm": 0.6665608961709756, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 15861 + }, + { + "epoch": 0.15862, + "grad_norm": 0.730624445103437, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 15862 + }, + { + "epoch": 0.15863, + "grad_norm": 0.8559718685431023, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 15863 + }, + { + "epoch": 0.15864, + "grad_norm": 1.0446524778821076, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 15864 + }, + { + "epoch": 0.15865, + "grad_norm": 0.9897685739575387, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 15865 + }, + { + "epoch": 0.15866, + "grad_norm": 0.8793218934252126, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 15866 + }, + { + "epoch": 0.15867, + "grad_norm": 0.8602972215958646, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 15867 + }, + { + "epoch": 0.15868, + "grad_norm": 0.8087072696062023, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 15868 + }, + { + "epoch": 0.15869, + "grad_norm": 0.7744091911306118, + "learning_rate": 0.003, + "loss": 4.035, + "step": 15869 + }, + { + "epoch": 0.1587, + "grad_norm": 0.8032899597741882, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 15870 + }, + { + "epoch": 0.15871, + "grad_norm": 0.8212848129479703, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 15871 + }, + { + "epoch": 0.15872, + "grad_norm": 0.8098673310288629, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 15872 + }, + { + "epoch": 0.15873, + "grad_norm": 0.8344842706362949, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 15873 + }, + { + "epoch": 0.15874, + "grad_norm": 0.9609025124157962, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 15874 + }, + { + "epoch": 0.15875, + "grad_norm": 1.166264056059148, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 15875 + }, + { + "epoch": 0.15876, + "grad_norm": 0.9426802309816715, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 15876 + }, + { + "epoch": 0.15877, + "grad_norm": 0.9748216070848362, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 15877 + }, + { + "epoch": 0.15878, + "grad_norm": 0.9652460088843033, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 15878 + }, + { + "epoch": 0.15879, + "grad_norm": 0.9470132798977262, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 15879 + }, + { + "epoch": 0.1588, + "grad_norm": 0.8330772838015335, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 15880 + }, + { + "epoch": 0.15881, + "grad_norm": 0.8834610872328756, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 15881 + }, + { + "epoch": 0.15882, + "grad_norm": 0.9353522346926292, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 15882 + }, + { + "epoch": 0.15883, + "grad_norm": 0.8955176576822342, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 15883 + }, + { + "epoch": 0.15884, + "grad_norm": 0.9101327685911393, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 15884 + }, + { + "epoch": 0.15885, + "grad_norm": 0.8630667318408559, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 15885 + }, + { + "epoch": 0.15886, + "grad_norm": 0.8895435355974495, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 15886 + }, + { + "epoch": 0.15887, + "grad_norm": 1.02701068155811, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 15887 + }, + { + "epoch": 0.15888, + "grad_norm": 1.1528507805179966, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 15888 + }, + { + "epoch": 0.15889, + "grad_norm": 0.8222781462504181, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 15889 + }, + { + "epoch": 0.1589, + "grad_norm": 0.6987602441668954, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 15890 + }, + { + "epoch": 0.15891, + "grad_norm": 0.7600823317276932, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 15891 + }, + { + "epoch": 0.15892, + "grad_norm": 0.8391905920734278, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 15892 + }, + { + "epoch": 0.15893, + "grad_norm": 0.8387386380485324, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 15893 + }, + { + "epoch": 0.15894, + "grad_norm": 0.7891534314815928, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 15894 + }, + { + "epoch": 0.15895, + "grad_norm": 0.810900494614915, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 15895 + }, + { + "epoch": 0.15896, + "grad_norm": 0.9182939036553969, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 15896 + }, + { + "epoch": 0.15897, + "grad_norm": 0.9616215424728389, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 15897 + }, + { + "epoch": 0.15898, + "grad_norm": 0.8999451087939887, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 15898 + }, + { + "epoch": 0.15899, + "grad_norm": 0.9174561922257051, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 15899 + }, + { + "epoch": 0.159, + "grad_norm": 1.0073397840502547, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 15900 + }, + { + "epoch": 0.15901, + "grad_norm": 0.8570122026857621, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15901 + }, + { + "epoch": 0.15902, + "grad_norm": 0.7256120389509747, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 15902 + }, + { + "epoch": 0.15903, + "grad_norm": 0.77451479463265, + "learning_rate": 0.003, + "loss": 4.07, + "step": 15903 + }, + { + "epoch": 0.15904, + "grad_norm": 0.8142437498841877, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 15904 + }, + { + "epoch": 0.15905, + "grad_norm": 0.8886496045970114, + "learning_rate": 0.003, + "loss": 4.058, + "step": 15905 + }, + { + "epoch": 0.15906, + "grad_norm": 0.7823787993194367, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 15906 + }, + { + "epoch": 0.15907, + "grad_norm": 0.8576597012670907, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 15907 + }, + { + "epoch": 0.15908, + "grad_norm": 1.0226556653999712, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 15908 + }, + { + "epoch": 0.15909, + "grad_norm": 1.228440531695168, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 15909 + }, + { + "epoch": 0.1591, + "grad_norm": 0.8072323047436248, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 15910 + }, + { + "epoch": 0.15911, + "grad_norm": 0.6625593171708525, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 15911 + }, + { + "epoch": 0.15912, + "grad_norm": 0.7641361854965768, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 15912 + }, + { + "epoch": 0.15913, + "grad_norm": 0.8955766167997712, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 15913 + }, + { + "epoch": 0.15914, + "grad_norm": 0.9284289402262057, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 15914 + }, + { + "epoch": 0.15915, + "grad_norm": 0.8998828107354967, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 15915 + }, + { + "epoch": 0.15916, + "grad_norm": 0.9402371663310323, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 15916 + }, + { + "epoch": 0.15917, + "grad_norm": 0.9902722681553752, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 15917 + }, + { + "epoch": 0.15918, + "grad_norm": 1.0974782386344044, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 15918 + }, + { + "epoch": 0.15919, + "grad_norm": 0.8974947705604499, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 15919 + }, + { + "epoch": 0.1592, + "grad_norm": 0.8323694012148697, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 15920 + }, + { + "epoch": 0.15921, + "grad_norm": 0.8507533872744127, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 15921 + }, + { + "epoch": 0.15922, + "grad_norm": 1.0006896458398264, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 15922 + }, + { + "epoch": 0.15923, + "grad_norm": 1.0863181557188903, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 15923 + }, + { + "epoch": 0.15924, + "grad_norm": 0.9521719258873104, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 15924 + }, + { + "epoch": 0.15925, + "grad_norm": 0.944414005035015, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 15925 + }, + { + "epoch": 0.15926, + "grad_norm": 0.9296021380224746, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 15926 + }, + { + "epoch": 0.15927, + "grad_norm": 0.8613107161757301, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 15927 + }, + { + "epoch": 0.15928, + "grad_norm": 0.9249367893175384, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 15928 + }, + { + "epoch": 0.15929, + "grad_norm": 0.8878680131531572, + "learning_rate": 0.003, + "loss": 4.086, + "step": 15929 + }, + { + "epoch": 0.1593, + "grad_norm": 1.013715090911043, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 15930 + }, + { + "epoch": 0.15931, + "grad_norm": 0.8510340941979475, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 15931 + }, + { + "epoch": 0.15932, + "grad_norm": 0.815676870885383, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 15932 + }, + { + "epoch": 0.15933, + "grad_norm": 0.7743675386553625, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 15933 + }, + { + "epoch": 0.15934, + "grad_norm": 0.7403873546053269, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 15934 + }, + { + "epoch": 0.15935, + "grad_norm": 0.7858125515042295, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 15935 + }, + { + "epoch": 0.15936, + "grad_norm": 0.8930344673960163, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 15936 + }, + { + "epoch": 0.15937, + "grad_norm": 0.9316602155800011, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 15937 + }, + { + "epoch": 0.15938, + "grad_norm": 0.8816423475685182, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 15938 + }, + { + "epoch": 0.15939, + "grad_norm": 0.8023319702925781, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 15939 + }, + { + "epoch": 0.1594, + "grad_norm": 0.8651903663949028, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 15940 + }, + { + "epoch": 0.15941, + "grad_norm": 0.8852802210463416, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 15941 + }, + { + "epoch": 0.15942, + "grad_norm": 0.8149679450595194, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 15942 + }, + { + "epoch": 0.15943, + "grad_norm": 0.7056486815572971, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 15943 + }, + { + "epoch": 0.15944, + "grad_norm": 0.7494177680373358, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 15944 + }, + { + "epoch": 0.15945, + "grad_norm": 0.8463757146270932, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15945 + }, + { + "epoch": 0.15946, + "grad_norm": 0.9484762575642697, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 15946 + }, + { + "epoch": 0.15947, + "grad_norm": 1.2333563410789112, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 15947 + }, + { + "epoch": 0.15948, + "grad_norm": 0.7909947284917025, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 15948 + }, + { + "epoch": 0.15949, + "grad_norm": 0.7306097375896525, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 15949 + }, + { + "epoch": 0.1595, + "grad_norm": 0.7554572014212151, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 15950 + }, + { + "epoch": 0.15951, + "grad_norm": 0.8045348607371821, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 15951 + }, + { + "epoch": 0.15952, + "grad_norm": 0.8632792409610709, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 15952 + }, + { + "epoch": 0.15953, + "grad_norm": 0.8832684523003634, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 15953 + }, + { + "epoch": 0.15954, + "grad_norm": 0.9299209913713404, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 15954 + }, + { + "epoch": 0.15955, + "grad_norm": 1.032021770746047, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 15955 + }, + { + "epoch": 0.15956, + "grad_norm": 0.9818817360177301, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 15956 + }, + { + "epoch": 0.15957, + "grad_norm": 0.9918576095944156, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 15957 + }, + { + "epoch": 0.15958, + "grad_norm": 0.9072398848104601, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 15958 + }, + { + "epoch": 0.15959, + "grad_norm": 0.8848674987600716, + "learning_rate": 0.003, + "loss": 4.079, + "step": 15959 + }, + { + "epoch": 0.1596, + "grad_norm": 0.8117353518274283, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 15960 + }, + { + "epoch": 0.15961, + "grad_norm": 0.813328243786487, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 15961 + }, + { + "epoch": 0.15962, + "grad_norm": 0.977835910106764, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 15962 + }, + { + "epoch": 0.15963, + "grad_norm": 1.2892851496837776, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 15963 + }, + { + "epoch": 0.15964, + "grad_norm": 0.6903747880207948, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 15964 + }, + { + "epoch": 0.15965, + "grad_norm": 0.6844003685678792, + "learning_rate": 0.003, + "loss": 4.075, + "step": 15965 + }, + { + "epoch": 0.15966, + "grad_norm": 0.664731853915502, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 15966 + }, + { + "epoch": 0.15967, + "grad_norm": 0.7169235637818511, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 15967 + }, + { + "epoch": 0.15968, + "grad_norm": 0.8060018350301563, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 15968 + }, + { + "epoch": 0.15969, + "grad_norm": 0.9091242557631898, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 15969 + }, + { + "epoch": 0.1597, + "grad_norm": 1.070008204035234, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 15970 + }, + { + "epoch": 0.15971, + "grad_norm": 1.2103233997348608, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 15971 + }, + { + "epoch": 0.15972, + "grad_norm": 0.8094656173431708, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 15972 + }, + { + "epoch": 0.15973, + "grad_norm": 0.8478586830339004, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 15973 + }, + { + "epoch": 0.15974, + "grad_norm": 0.8653546240953361, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 15974 + }, + { + "epoch": 0.15975, + "grad_norm": 0.8860408958322518, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 15975 + }, + { + "epoch": 0.15976, + "grad_norm": 0.9102600963263386, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 15976 + }, + { + "epoch": 0.15977, + "grad_norm": 0.8727421657068137, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 15977 + }, + { + "epoch": 0.15978, + "grad_norm": 0.9107200803049118, + "learning_rate": 0.003, + "loss": 4.074, + "step": 15978 + }, + { + "epoch": 0.15979, + "grad_norm": 0.9341700720119386, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 15979 + }, + { + "epoch": 0.1598, + "grad_norm": 0.8318930244131669, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 15980 + }, + { + "epoch": 0.15981, + "grad_norm": 0.7454439507132611, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 15981 + }, + { + "epoch": 0.15982, + "grad_norm": 0.9058637695553168, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 15982 + }, + { + "epoch": 0.15983, + "grad_norm": 1.2318091116124625, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 15983 + }, + { + "epoch": 0.15984, + "grad_norm": 0.928161210640171, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 15984 + }, + { + "epoch": 0.15985, + "grad_norm": 0.8825323891365938, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 15985 + }, + { + "epoch": 0.15986, + "grad_norm": 0.9331902724573091, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 15986 + }, + { + "epoch": 0.15987, + "grad_norm": 0.9173291854109743, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 15987 + }, + { + "epoch": 0.15988, + "grad_norm": 0.6923402307071086, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 15988 + }, + { + "epoch": 0.15989, + "grad_norm": 0.6881565806819077, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 15989 + }, + { + "epoch": 0.1599, + "grad_norm": 0.6415710035119405, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 15990 + }, + { + "epoch": 0.15991, + "grad_norm": 0.5702418106814733, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 15991 + }, + { + "epoch": 0.15992, + "grad_norm": 0.5454637420160554, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 15992 + }, + { + "epoch": 0.15993, + "grad_norm": 0.6275972535879244, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 15993 + }, + { + "epoch": 0.15994, + "grad_norm": 0.6948256041813252, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 15994 + }, + { + "epoch": 0.15995, + "grad_norm": 0.7618122762233865, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 15995 + }, + { + "epoch": 0.15996, + "grad_norm": 0.7733683885650522, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 15996 + }, + { + "epoch": 0.15997, + "grad_norm": 0.9205548246120419, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 15997 + }, + { + "epoch": 0.15998, + "grad_norm": 1.0012476550778662, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 15998 + }, + { + "epoch": 0.15999, + "grad_norm": 0.8926619011650007, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 15999 + }, + { + "epoch": 0.16, + "grad_norm": 1.04034913772308, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 16000 + }, + { + "epoch": 0.16001, + "grad_norm": 0.9875749945099409, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 16001 + }, + { + "epoch": 0.16002, + "grad_norm": 1.0522854332233935, + "learning_rate": 0.003, + "loss": 4.07, + "step": 16002 + }, + { + "epoch": 0.16003, + "grad_norm": 0.7770874892608689, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 16003 + }, + { + "epoch": 0.16004, + "grad_norm": 0.7574820985938998, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16004 + }, + { + "epoch": 0.16005, + "grad_norm": 0.7585189135995564, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 16005 + }, + { + "epoch": 0.16006, + "grad_norm": 0.6546507720652547, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 16006 + }, + { + "epoch": 0.16007, + "grad_norm": 0.7581247263003393, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16007 + }, + { + "epoch": 0.16008, + "grad_norm": 0.8826284331558714, + "learning_rate": 0.003, + "loss": 4.087, + "step": 16008 + }, + { + "epoch": 0.16009, + "grad_norm": 1.240712531102964, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 16009 + }, + { + "epoch": 0.1601, + "grad_norm": 0.9538344035686259, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 16010 + }, + { + "epoch": 0.16011, + "grad_norm": 1.033857866287827, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 16011 + }, + { + "epoch": 0.16012, + "grad_norm": 0.9311632932211753, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 16012 + }, + { + "epoch": 0.16013, + "grad_norm": 1.0491791126642829, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 16013 + }, + { + "epoch": 0.16014, + "grad_norm": 0.9839521570660503, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 16014 + }, + { + "epoch": 0.16015, + "grad_norm": 0.8702868304503665, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 16015 + }, + { + "epoch": 0.16016, + "grad_norm": 0.7530861667542754, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 16016 + }, + { + "epoch": 0.16017, + "grad_norm": 0.7147199273034064, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16017 + }, + { + "epoch": 0.16018, + "grad_norm": 0.6433067677650978, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 16018 + }, + { + "epoch": 0.16019, + "grad_norm": 0.7594542328789683, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 16019 + }, + { + "epoch": 0.1602, + "grad_norm": 0.8970739684818545, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 16020 + }, + { + "epoch": 0.16021, + "grad_norm": 0.8490208137519246, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 16021 + }, + { + "epoch": 0.16022, + "grad_norm": 0.9183771077368851, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 16022 + }, + { + "epoch": 0.16023, + "grad_norm": 1.0917793172404378, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 16023 + }, + { + "epoch": 0.16024, + "grad_norm": 0.9644313592043875, + "learning_rate": 0.003, + "loss": 4.079, + "step": 16024 + }, + { + "epoch": 0.16025, + "grad_norm": 0.9459752941651064, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 16025 + }, + { + "epoch": 0.16026, + "grad_norm": 1.025321156474423, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 16026 + }, + { + "epoch": 0.16027, + "grad_norm": 1.1053338433748303, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 16027 + }, + { + "epoch": 0.16028, + "grad_norm": 0.807135351221388, + "learning_rate": 0.003, + "loss": 4.081, + "step": 16028 + }, + { + "epoch": 0.16029, + "grad_norm": 0.6682553235156985, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 16029 + }, + { + "epoch": 0.1603, + "grad_norm": 0.5598976813828049, + "learning_rate": 0.003, + "loss": 4.053, + "step": 16030 + }, + { + "epoch": 0.16031, + "grad_norm": 0.6408595513391798, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 16031 + }, + { + "epoch": 0.16032, + "grad_norm": 0.6611918144827753, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 16032 + }, + { + "epoch": 0.16033, + "grad_norm": 0.7211422238082564, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 16033 + }, + { + "epoch": 0.16034, + "grad_norm": 0.7644557015482096, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 16034 + }, + { + "epoch": 0.16035, + "grad_norm": 0.8034077903207645, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16035 + }, + { + "epoch": 0.16036, + "grad_norm": 0.8301832588548755, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 16036 + }, + { + "epoch": 0.16037, + "grad_norm": 0.8829561319755103, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 16037 + }, + { + "epoch": 0.16038, + "grad_norm": 0.9062856939981632, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 16038 + }, + { + "epoch": 0.16039, + "grad_norm": 1.0435050301640487, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16039 + }, + { + "epoch": 0.1604, + "grad_norm": 1.161696773773414, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 16040 + }, + { + "epoch": 0.16041, + "grad_norm": 1.049776658985168, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 16041 + }, + { + "epoch": 0.16042, + "grad_norm": 1.0095540675816592, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 16042 + }, + { + "epoch": 0.16043, + "grad_norm": 1.0389961580110167, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 16043 + }, + { + "epoch": 0.16044, + "grad_norm": 1.1219833074583003, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 16044 + }, + { + "epoch": 0.16045, + "grad_norm": 0.9769731729552488, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 16045 + }, + { + "epoch": 0.16046, + "grad_norm": 0.9821796117147715, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 16046 + }, + { + "epoch": 0.16047, + "grad_norm": 0.9522902886577452, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 16047 + }, + { + "epoch": 0.16048, + "grad_norm": 0.9445776198863215, + "learning_rate": 0.003, + "loss": 4.069, + "step": 16048 + }, + { + "epoch": 0.16049, + "grad_norm": 0.9887852799914124, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 16049 + }, + { + "epoch": 0.1605, + "grad_norm": 0.9282871554540153, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 16050 + }, + { + "epoch": 0.16051, + "grad_norm": 0.8937769659977602, + "learning_rate": 0.003, + "loss": 4.073, + "step": 16051 + }, + { + "epoch": 0.16052, + "grad_norm": 0.8438392490479742, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 16052 + }, + { + "epoch": 0.16053, + "grad_norm": 0.8548799870414077, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 16053 + }, + { + "epoch": 0.16054, + "grad_norm": 0.8207204685878131, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 16054 + }, + { + "epoch": 0.16055, + "grad_norm": 0.9141771764995624, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 16055 + }, + { + "epoch": 0.16056, + "grad_norm": 0.9888082350106229, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 16056 + }, + { + "epoch": 0.16057, + "grad_norm": 1.0735077625299911, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 16057 + }, + { + "epoch": 0.16058, + "grad_norm": 1.0721273987403106, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 16058 + }, + { + "epoch": 0.16059, + "grad_norm": 1.1176266745551053, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 16059 + }, + { + "epoch": 0.1606, + "grad_norm": 0.969389072573312, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 16060 + }, + { + "epoch": 0.16061, + "grad_norm": 0.8224590607768597, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 16061 + }, + { + "epoch": 0.16062, + "grad_norm": 0.6720431701334851, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 16062 + }, + { + "epoch": 0.16063, + "grad_norm": 0.6818729632002152, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16063 + }, + { + "epoch": 0.16064, + "grad_norm": 0.8985636731615916, + "learning_rate": 0.003, + "loss": 4.094, + "step": 16064 + }, + { + "epoch": 0.16065, + "grad_norm": 1.0579805261802582, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 16065 + }, + { + "epoch": 0.16066, + "grad_norm": 1.0907066243086723, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 16066 + }, + { + "epoch": 0.16067, + "grad_norm": 0.7791578863985551, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 16067 + }, + { + "epoch": 0.16068, + "grad_norm": 0.5591335499491482, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 16068 + }, + { + "epoch": 0.16069, + "grad_norm": 0.591376055821003, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 16069 + }, + { + "epoch": 0.1607, + "grad_norm": 0.6512621136286194, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 16070 + }, + { + "epoch": 0.16071, + "grad_norm": 0.6576654985720259, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 16071 + }, + { + "epoch": 0.16072, + "grad_norm": 0.674312163956186, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 16072 + }, + { + "epoch": 0.16073, + "grad_norm": 0.6518324043878526, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 16073 + }, + { + "epoch": 0.16074, + "grad_norm": 0.6953964500504463, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 16074 + }, + { + "epoch": 0.16075, + "grad_norm": 0.7318374110395267, + "learning_rate": 0.003, + "loss": 4.078, + "step": 16075 + }, + { + "epoch": 0.16076, + "grad_norm": 0.738169566379988, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 16076 + }, + { + "epoch": 0.16077, + "grad_norm": 0.6507978114501953, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 16077 + }, + { + "epoch": 0.16078, + "grad_norm": 0.6444506288956432, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 16078 + }, + { + "epoch": 0.16079, + "grad_norm": 0.7626020296482732, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 16079 + }, + { + "epoch": 0.1608, + "grad_norm": 0.9860731495646577, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 16080 + }, + { + "epoch": 0.16081, + "grad_norm": 1.0900955865738118, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 16081 + }, + { + "epoch": 0.16082, + "grad_norm": 1.053783767638632, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 16082 + }, + { + "epoch": 0.16083, + "grad_norm": 1.101311764095707, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 16083 + }, + { + "epoch": 0.16084, + "grad_norm": 1.225144577378712, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 16084 + }, + { + "epoch": 0.16085, + "grad_norm": 1.0988604664404609, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 16085 + }, + { + "epoch": 0.16086, + "grad_norm": 1.0143867562647895, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 16086 + }, + { + "epoch": 0.16087, + "grad_norm": 0.9728535251670195, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16087 + }, + { + "epoch": 0.16088, + "grad_norm": 1.1536823978110107, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 16088 + }, + { + "epoch": 0.16089, + "grad_norm": 0.8434130155671425, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 16089 + }, + { + "epoch": 0.1609, + "grad_norm": 0.6284904218935911, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 16090 + }, + { + "epoch": 0.16091, + "grad_norm": 0.6057989632237925, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 16091 + }, + { + "epoch": 0.16092, + "grad_norm": 0.7277031337734261, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 16092 + }, + { + "epoch": 0.16093, + "grad_norm": 0.8613467424478359, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 16093 + }, + { + "epoch": 0.16094, + "grad_norm": 0.992589572106217, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 16094 + }, + { + "epoch": 0.16095, + "grad_norm": 1.0328894614323847, + "learning_rate": 0.003, + "loss": 4.047, + "step": 16095 + }, + { + "epoch": 0.16096, + "grad_norm": 1.0775067185238585, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16096 + }, + { + "epoch": 0.16097, + "grad_norm": 0.9481167996653196, + "learning_rate": 0.003, + "loss": 4.045, + "step": 16097 + }, + { + "epoch": 0.16098, + "grad_norm": 0.8310343218773544, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16098 + }, + { + "epoch": 0.16099, + "grad_norm": 0.7432804786692967, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 16099 + }, + { + "epoch": 0.161, + "grad_norm": 0.7651025363039288, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 16100 + }, + { + "epoch": 0.16101, + "grad_norm": 0.8010871738088194, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 16101 + }, + { + "epoch": 0.16102, + "grad_norm": 0.9549984349511024, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 16102 + }, + { + "epoch": 0.16103, + "grad_norm": 0.8802420693876036, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 16103 + }, + { + "epoch": 0.16104, + "grad_norm": 0.7570305416904907, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 16104 + }, + { + "epoch": 0.16105, + "grad_norm": 0.6899773517005993, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 16105 + }, + { + "epoch": 0.16106, + "grad_norm": 0.6658011982893003, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 16106 + }, + { + "epoch": 0.16107, + "grad_norm": 0.7506078945169913, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 16107 + }, + { + "epoch": 0.16108, + "grad_norm": 0.7424462464592597, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 16108 + }, + { + "epoch": 0.16109, + "grad_norm": 0.7785698489989537, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 16109 + }, + { + "epoch": 0.1611, + "grad_norm": 0.9224817582034468, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 16110 + }, + { + "epoch": 0.16111, + "grad_norm": 0.888232403600876, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 16111 + }, + { + "epoch": 0.16112, + "grad_norm": 0.8482124293727162, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 16112 + }, + { + "epoch": 0.16113, + "grad_norm": 0.7901927949141969, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16113 + }, + { + "epoch": 0.16114, + "grad_norm": 0.9037691266050597, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 16114 + }, + { + "epoch": 0.16115, + "grad_norm": 0.9356281618274787, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16115 + }, + { + "epoch": 0.16116, + "grad_norm": 0.8696043376245159, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 16116 + }, + { + "epoch": 0.16117, + "grad_norm": 0.954833084830157, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 16117 + }, + { + "epoch": 0.16118, + "grad_norm": 0.9819884195188994, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 16118 + }, + { + "epoch": 0.16119, + "grad_norm": 1.1803300259419853, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 16119 + }, + { + "epoch": 0.1612, + "grad_norm": 1.042483615683666, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16120 + }, + { + "epoch": 0.16121, + "grad_norm": 1.0832236766831873, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 16121 + }, + { + "epoch": 0.16122, + "grad_norm": 1.1181535977453274, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 16122 + }, + { + "epoch": 0.16123, + "grad_norm": 0.852356323752036, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 16123 + }, + { + "epoch": 0.16124, + "grad_norm": 0.7660958063238035, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 16124 + }, + { + "epoch": 0.16125, + "grad_norm": 0.8917390529772087, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 16125 + }, + { + "epoch": 0.16126, + "grad_norm": 1.0485061599011214, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 16126 + }, + { + "epoch": 0.16127, + "grad_norm": 1.1550677872719373, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 16127 + }, + { + "epoch": 0.16128, + "grad_norm": 1.2033352556382457, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 16128 + }, + { + "epoch": 0.16129, + "grad_norm": 0.9479689526076787, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 16129 + }, + { + "epoch": 0.1613, + "grad_norm": 1.00085432316174, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 16130 + }, + { + "epoch": 0.16131, + "grad_norm": 1.0426525513597849, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 16131 + }, + { + "epoch": 0.16132, + "grad_norm": 0.9472206142456094, + "learning_rate": 0.003, + "loss": 4.08, + "step": 16132 + }, + { + "epoch": 0.16133, + "grad_norm": 0.8465359997349694, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 16133 + }, + { + "epoch": 0.16134, + "grad_norm": 0.8444626961435644, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16134 + }, + { + "epoch": 0.16135, + "grad_norm": 0.7612020993107949, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 16135 + }, + { + "epoch": 0.16136, + "grad_norm": 0.8235281739406003, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 16136 + }, + { + "epoch": 0.16137, + "grad_norm": 0.8727770642878132, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 16137 + }, + { + "epoch": 0.16138, + "grad_norm": 1.0650874817500309, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 16138 + }, + { + "epoch": 0.16139, + "grad_norm": 0.997039583973425, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 16139 + }, + { + "epoch": 0.1614, + "grad_norm": 0.9335305364102751, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 16140 + }, + { + "epoch": 0.16141, + "grad_norm": 0.8764782582415178, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16141 + }, + { + "epoch": 0.16142, + "grad_norm": 0.8602530505109808, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 16142 + }, + { + "epoch": 0.16143, + "grad_norm": 0.8516346837710956, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 16143 + }, + { + "epoch": 0.16144, + "grad_norm": 0.8040861469272487, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 16144 + }, + { + "epoch": 0.16145, + "grad_norm": 0.8534615396130611, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 16145 + }, + { + "epoch": 0.16146, + "grad_norm": 0.9310365006212695, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16146 + }, + { + "epoch": 0.16147, + "grad_norm": 0.9047052432143592, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 16147 + }, + { + "epoch": 0.16148, + "grad_norm": 0.8300437671265596, + "learning_rate": 0.003, + "loss": 4.063, + "step": 16148 + }, + { + "epoch": 0.16149, + "grad_norm": 0.796095988427987, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 16149 + }, + { + "epoch": 0.1615, + "grad_norm": 0.6754321803911064, + "learning_rate": 0.003, + "loss": 4.081, + "step": 16150 + }, + { + "epoch": 0.16151, + "grad_norm": 0.5722295073166785, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 16151 + }, + { + "epoch": 0.16152, + "grad_norm": 0.6519936639812628, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16152 + }, + { + "epoch": 0.16153, + "grad_norm": 0.8071980548334277, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 16153 + }, + { + "epoch": 0.16154, + "grad_norm": 0.9527683328836535, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16154 + }, + { + "epoch": 0.16155, + "grad_norm": 1.0365543089232858, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 16155 + }, + { + "epoch": 0.16156, + "grad_norm": 0.866323296095971, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16156 + }, + { + "epoch": 0.16157, + "grad_norm": 0.7802931145542943, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16157 + }, + { + "epoch": 0.16158, + "grad_norm": 0.9022402072547404, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 16158 + }, + { + "epoch": 0.16159, + "grad_norm": 0.9185278921751916, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 16159 + }, + { + "epoch": 0.1616, + "grad_norm": 0.7779480960036026, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 16160 + }, + { + "epoch": 0.16161, + "grad_norm": 0.8540115917124532, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 16161 + }, + { + "epoch": 0.16162, + "grad_norm": 0.8710301392729666, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 16162 + }, + { + "epoch": 0.16163, + "grad_norm": 0.9185723429298399, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 16163 + }, + { + "epoch": 0.16164, + "grad_norm": 0.870149263886394, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 16164 + }, + { + "epoch": 0.16165, + "grad_norm": 0.9590652445806417, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 16165 + }, + { + "epoch": 0.16166, + "grad_norm": 0.9705187919682385, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 16166 + }, + { + "epoch": 0.16167, + "grad_norm": 0.9354814546160687, + "learning_rate": 0.003, + "loss": 4.074, + "step": 16167 + }, + { + "epoch": 0.16168, + "grad_norm": 1.0353083177703744, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16168 + }, + { + "epoch": 0.16169, + "grad_norm": 1.139545834970642, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16169 + }, + { + "epoch": 0.1617, + "grad_norm": 0.9189672899307466, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 16170 + }, + { + "epoch": 0.16171, + "grad_norm": 0.9529781619985794, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 16171 + }, + { + "epoch": 0.16172, + "grad_norm": 1.0523261081221165, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 16172 + }, + { + "epoch": 0.16173, + "grad_norm": 0.9449027969526601, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 16173 + }, + { + "epoch": 0.16174, + "grad_norm": 0.9196952296341383, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 16174 + }, + { + "epoch": 0.16175, + "grad_norm": 0.868460902537329, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 16175 + }, + { + "epoch": 0.16176, + "grad_norm": 0.7378373702584629, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16176 + }, + { + "epoch": 0.16177, + "grad_norm": 0.7264787377825251, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 16177 + }, + { + "epoch": 0.16178, + "grad_norm": 0.754651947342741, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 16178 + }, + { + "epoch": 0.16179, + "grad_norm": 0.8981779971221018, + "learning_rate": 0.003, + "loss": 4.058, + "step": 16179 + }, + { + "epoch": 0.1618, + "grad_norm": 0.8890031517849919, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 16180 + }, + { + "epoch": 0.16181, + "grad_norm": 0.7993844784574181, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 16181 + }, + { + "epoch": 0.16182, + "grad_norm": 0.8383713944082801, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 16182 + }, + { + "epoch": 0.16183, + "grad_norm": 0.741935624659754, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 16183 + }, + { + "epoch": 0.16184, + "grad_norm": 0.9150947015891732, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 16184 + }, + { + "epoch": 0.16185, + "grad_norm": 1.14201213309905, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 16185 + }, + { + "epoch": 0.16186, + "grad_norm": 0.8169479642039301, + "learning_rate": 0.003, + "loss": 4.045, + "step": 16186 + }, + { + "epoch": 0.16187, + "grad_norm": 0.6175338474335003, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16187 + }, + { + "epoch": 0.16188, + "grad_norm": 0.5067716276572606, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 16188 + }, + { + "epoch": 0.16189, + "grad_norm": 0.630339607808424, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 16189 + }, + { + "epoch": 0.1619, + "grad_norm": 0.7737387391209762, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 16190 + }, + { + "epoch": 0.16191, + "grad_norm": 0.9102922482934057, + "learning_rate": 0.003, + "loss": 4.065, + "step": 16191 + }, + { + "epoch": 0.16192, + "grad_norm": 1.0551965593201214, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 16192 + }, + { + "epoch": 0.16193, + "grad_norm": 0.8780697635294824, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16193 + }, + { + "epoch": 0.16194, + "grad_norm": 0.7980214042007789, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 16194 + }, + { + "epoch": 0.16195, + "grad_norm": 0.8988465824796351, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 16195 + }, + { + "epoch": 0.16196, + "grad_norm": 0.8228969895381656, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16196 + }, + { + "epoch": 0.16197, + "grad_norm": 0.8983174915373545, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 16197 + }, + { + "epoch": 0.16198, + "grad_norm": 1.1260559439354179, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16198 + }, + { + "epoch": 0.16199, + "grad_norm": 0.9390686499553168, + "learning_rate": 0.003, + "loss": 4.064, + "step": 16199 + }, + { + "epoch": 0.162, + "grad_norm": 0.8152299144993529, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 16200 + }, + { + "epoch": 0.16201, + "grad_norm": 0.7360914535721991, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16201 + }, + { + "epoch": 0.16202, + "grad_norm": 0.7622137232192114, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 16202 + }, + { + "epoch": 0.16203, + "grad_norm": 0.7413943410115538, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 16203 + }, + { + "epoch": 0.16204, + "grad_norm": 0.7977514074970313, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 16204 + }, + { + "epoch": 0.16205, + "grad_norm": 0.7892478107568978, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16205 + }, + { + "epoch": 0.16206, + "grad_norm": 0.720041606861047, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 16206 + }, + { + "epoch": 0.16207, + "grad_norm": 0.8672112136682307, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16207 + }, + { + "epoch": 0.16208, + "grad_norm": 1.0494020699201347, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 16208 + }, + { + "epoch": 0.16209, + "grad_norm": 1.1249674854929692, + "learning_rate": 0.003, + "loss": 4.079, + "step": 16209 + }, + { + "epoch": 0.1621, + "grad_norm": 1.024550523762066, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 16210 + }, + { + "epoch": 0.16211, + "grad_norm": 1.0137782212236954, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16211 + }, + { + "epoch": 0.16212, + "grad_norm": 0.9786722594139657, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16212 + }, + { + "epoch": 0.16213, + "grad_norm": 0.9969853986796966, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16213 + }, + { + "epoch": 0.16214, + "grad_norm": 1.0754979535674585, + "learning_rate": 0.003, + "loss": 4.1058, + "step": 16214 + }, + { + "epoch": 0.16215, + "grad_norm": 0.9568793780351604, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 16215 + }, + { + "epoch": 0.16216, + "grad_norm": 1.1455624548738463, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16216 + }, + { + "epoch": 0.16217, + "grad_norm": 1.0490097896393278, + "learning_rate": 0.003, + "loss": 4.1056, + "step": 16217 + }, + { + "epoch": 0.16218, + "grad_norm": 1.0202644948426063, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 16218 + }, + { + "epoch": 0.16219, + "grad_norm": 1.0276796887123196, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 16219 + }, + { + "epoch": 0.1622, + "grad_norm": 1.0593292616804686, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16220 + }, + { + "epoch": 0.16221, + "grad_norm": 1.0020651740485054, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 16221 + }, + { + "epoch": 0.16222, + "grad_norm": 1.1298337838433747, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 16222 + }, + { + "epoch": 0.16223, + "grad_norm": 1.0897648396658126, + "learning_rate": 0.003, + "loss": 4.103, + "step": 16223 + }, + { + "epoch": 0.16224, + "grad_norm": 0.9519637682676307, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16224 + }, + { + "epoch": 0.16225, + "grad_norm": 0.953671025969703, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 16225 + }, + { + "epoch": 0.16226, + "grad_norm": 0.9896970202695674, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16226 + }, + { + "epoch": 0.16227, + "grad_norm": 1.0032716113127798, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 16227 + }, + { + "epoch": 0.16228, + "grad_norm": 0.9738652448770629, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 16228 + }, + { + "epoch": 0.16229, + "grad_norm": 0.7674792332968209, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 16229 + }, + { + "epoch": 0.1623, + "grad_norm": 0.7193398489790191, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 16230 + }, + { + "epoch": 0.16231, + "grad_norm": 0.7309786942515929, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 16231 + }, + { + "epoch": 0.16232, + "grad_norm": 0.7492285412052875, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 16232 + }, + { + "epoch": 0.16233, + "grad_norm": 0.8021556523508814, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 16233 + }, + { + "epoch": 0.16234, + "grad_norm": 1.0145900379495671, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 16234 + }, + { + "epoch": 0.16235, + "grad_norm": 1.1075616915131692, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 16235 + }, + { + "epoch": 0.16236, + "grad_norm": 0.8213397231729862, + "learning_rate": 0.003, + "loss": 4.066, + "step": 16236 + }, + { + "epoch": 0.16237, + "grad_norm": 0.6768504042704822, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 16237 + }, + { + "epoch": 0.16238, + "grad_norm": 0.722693548204655, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 16238 + }, + { + "epoch": 0.16239, + "grad_norm": 0.6765664675242864, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 16239 + }, + { + "epoch": 0.1624, + "grad_norm": 0.629076717424327, + "learning_rate": 0.003, + "loss": 4.059, + "step": 16240 + }, + { + "epoch": 0.16241, + "grad_norm": 0.6743156519209437, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 16241 + }, + { + "epoch": 0.16242, + "grad_norm": 0.8544287885854247, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 16242 + }, + { + "epoch": 0.16243, + "grad_norm": 1.104586476829831, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 16243 + }, + { + "epoch": 0.16244, + "grad_norm": 1.059421360909398, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 16244 + }, + { + "epoch": 0.16245, + "grad_norm": 0.8718112501399637, + "learning_rate": 0.003, + "loss": 4.051, + "step": 16245 + }, + { + "epoch": 0.16246, + "grad_norm": 0.7612164093055407, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 16246 + }, + { + "epoch": 0.16247, + "grad_norm": 0.7014308272271207, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 16247 + }, + { + "epoch": 0.16248, + "grad_norm": 0.804493152182718, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 16248 + }, + { + "epoch": 0.16249, + "grad_norm": 0.8753722381900808, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 16249 + }, + { + "epoch": 0.1625, + "grad_norm": 0.9105179582557639, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 16250 + }, + { + "epoch": 0.16251, + "grad_norm": 0.9568872467193253, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16251 + }, + { + "epoch": 0.16252, + "grad_norm": 0.9229114089817099, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 16252 + }, + { + "epoch": 0.16253, + "grad_norm": 0.8563003071325918, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16253 + }, + { + "epoch": 0.16254, + "grad_norm": 0.8947039093540049, + "learning_rate": 0.003, + "loss": 4.041, + "step": 16254 + }, + { + "epoch": 0.16255, + "grad_norm": 0.8562350552382546, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16255 + }, + { + "epoch": 0.16256, + "grad_norm": 0.7873504536371866, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16256 + }, + { + "epoch": 0.16257, + "grad_norm": 0.7461171081796859, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 16257 + }, + { + "epoch": 0.16258, + "grad_norm": 0.674715451156988, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 16258 + }, + { + "epoch": 0.16259, + "grad_norm": 0.7035220419556732, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16259 + }, + { + "epoch": 0.1626, + "grad_norm": 0.7835199425538674, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 16260 + }, + { + "epoch": 0.16261, + "grad_norm": 0.9023162600106548, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 16261 + }, + { + "epoch": 0.16262, + "grad_norm": 0.9670145461010528, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16262 + }, + { + "epoch": 0.16263, + "grad_norm": 0.9734628038161928, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16263 + }, + { + "epoch": 0.16264, + "grad_norm": 0.9188835579076302, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16264 + }, + { + "epoch": 0.16265, + "grad_norm": 0.8256702747886675, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16265 + }, + { + "epoch": 0.16266, + "grad_norm": 0.7592208986754003, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16266 + }, + { + "epoch": 0.16267, + "grad_norm": 0.8039117002212741, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 16267 + }, + { + "epoch": 0.16268, + "grad_norm": 0.8947686350506168, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 16268 + }, + { + "epoch": 0.16269, + "grad_norm": 0.8243573580472494, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 16269 + }, + { + "epoch": 0.1627, + "grad_norm": 0.6575010590174963, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 16270 + }, + { + "epoch": 0.16271, + "grad_norm": 0.7610139261035422, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 16271 + }, + { + "epoch": 0.16272, + "grad_norm": 0.8343058454375321, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 16272 + }, + { + "epoch": 0.16273, + "grad_norm": 0.8281436786429534, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 16273 + }, + { + "epoch": 0.16274, + "grad_norm": 0.7678649443323914, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 16274 + }, + { + "epoch": 0.16275, + "grad_norm": 0.7551691228694758, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 16275 + }, + { + "epoch": 0.16276, + "grad_norm": 0.8269508965204964, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 16276 + }, + { + "epoch": 0.16277, + "grad_norm": 0.8912629174403209, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 16277 + }, + { + "epoch": 0.16278, + "grad_norm": 0.9975366156148245, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 16278 + }, + { + "epoch": 0.16279, + "grad_norm": 1.1451233189077932, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 16279 + }, + { + "epoch": 0.1628, + "grad_norm": 1.0487951683788876, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 16280 + }, + { + "epoch": 0.16281, + "grad_norm": 1.0172643716337553, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 16281 + }, + { + "epoch": 0.16282, + "grad_norm": 1.0773619764096158, + "learning_rate": 0.003, + "loss": 4.053, + "step": 16282 + }, + { + "epoch": 0.16283, + "grad_norm": 0.8691246985842682, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 16283 + }, + { + "epoch": 0.16284, + "grad_norm": 0.968909349697707, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 16284 + }, + { + "epoch": 0.16285, + "grad_norm": 1.1473100685823328, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 16285 + }, + { + "epoch": 0.16286, + "grad_norm": 0.8849533566376779, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 16286 + }, + { + "epoch": 0.16287, + "grad_norm": 0.9774613453573878, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 16287 + }, + { + "epoch": 0.16288, + "grad_norm": 1.0977809366380906, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 16288 + }, + { + "epoch": 0.16289, + "grad_norm": 0.8561584769481401, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 16289 + }, + { + "epoch": 0.1629, + "grad_norm": 0.7848931043950328, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 16290 + }, + { + "epoch": 0.16291, + "grad_norm": 0.7802439977245792, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 16291 + }, + { + "epoch": 0.16292, + "grad_norm": 0.7651178311141256, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 16292 + }, + { + "epoch": 0.16293, + "grad_norm": 0.8685065968678868, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16293 + }, + { + "epoch": 0.16294, + "grad_norm": 0.9237856864310275, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 16294 + }, + { + "epoch": 0.16295, + "grad_norm": 1.1278124372935312, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 16295 + }, + { + "epoch": 0.16296, + "grad_norm": 0.9891264765467419, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 16296 + }, + { + "epoch": 0.16297, + "grad_norm": 0.966613287257267, + "learning_rate": 0.003, + "loss": 4.072, + "step": 16297 + }, + { + "epoch": 0.16298, + "grad_norm": 0.710801390925547, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 16298 + }, + { + "epoch": 0.16299, + "grad_norm": 0.6152415750940189, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 16299 + }, + { + "epoch": 0.163, + "grad_norm": 0.7568654652124776, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 16300 + }, + { + "epoch": 0.16301, + "grad_norm": 0.9004162565321375, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 16301 + }, + { + "epoch": 0.16302, + "grad_norm": 0.9466994903295309, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 16302 + }, + { + "epoch": 0.16303, + "grad_norm": 0.9583379791654986, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16303 + }, + { + "epoch": 0.16304, + "grad_norm": 0.8963929816095652, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 16304 + }, + { + "epoch": 0.16305, + "grad_norm": 0.853690250397017, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 16305 + }, + { + "epoch": 0.16306, + "grad_norm": 0.8474023517120366, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 16306 + }, + { + "epoch": 0.16307, + "grad_norm": 0.8141426006933553, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 16307 + }, + { + "epoch": 0.16308, + "grad_norm": 0.7883105319317605, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 16308 + }, + { + "epoch": 0.16309, + "grad_norm": 0.623205437017482, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 16309 + }, + { + "epoch": 0.1631, + "grad_norm": 0.6984950064714907, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16310 + }, + { + "epoch": 0.16311, + "grad_norm": 0.7218842752055934, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 16311 + }, + { + "epoch": 0.16312, + "grad_norm": 0.7694544368850195, + "learning_rate": 0.003, + "loss": 4.1067, + "step": 16312 + }, + { + "epoch": 0.16313, + "grad_norm": 0.8263370653136152, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 16313 + }, + { + "epoch": 0.16314, + "grad_norm": 0.8317574056320102, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 16314 + }, + { + "epoch": 0.16315, + "grad_norm": 0.7535022241153165, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 16315 + }, + { + "epoch": 0.16316, + "grad_norm": 0.8417341764302408, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 16316 + }, + { + "epoch": 0.16317, + "grad_norm": 0.9901599491024954, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 16317 + }, + { + "epoch": 0.16318, + "grad_norm": 1.0004399101085264, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 16318 + }, + { + "epoch": 0.16319, + "grad_norm": 1.0692991592092003, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 16319 + }, + { + "epoch": 0.1632, + "grad_norm": 0.9561987828264448, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16320 + }, + { + "epoch": 0.16321, + "grad_norm": 1.0251712944716842, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 16321 + }, + { + "epoch": 0.16322, + "grad_norm": 0.9816010445769573, + "learning_rate": 0.003, + "loss": 4.077, + "step": 16322 + }, + { + "epoch": 0.16323, + "grad_norm": 1.101057119290071, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 16323 + }, + { + "epoch": 0.16324, + "grad_norm": 1.0524633263040601, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 16324 + }, + { + "epoch": 0.16325, + "grad_norm": 0.8864279589063383, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 16325 + }, + { + "epoch": 0.16326, + "grad_norm": 0.8902968713402165, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 16326 + }, + { + "epoch": 0.16327, + "grad_norm": 1.0234400798281675, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 16327 + }, + { + "epoch": 0.16328, + "grad_norm": 0.9621135258676727, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 16328 + }, + { + "epoch": 0.16329, + "grad_norm": 1.0055178417417585, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 16329 + }, + { + "epoch": 0.1633, + "grad_norm": 1.0265496718395943, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 16330 + }, + { + "epoch": 0.16331, + "grad_norm": 0.7396114122702113, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 16331 + }, + { + "epoch": 0.16332, + "grad_norm": 0.6812936739431915, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 16332 + }, + { + "epoch": 0.16333, + "grad_norm": 0.6706419805801713, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 16333 + }, + { + "epoch": 0.16334, + "grad_norm": 0.7073675785764628, + "learning_rate": 0.003, + "loss": 4.082, + "step": 16334 + }, + { + "epoch": 0.16335, + "grad_norm": 0.7459792886555676, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 16335 + }, + { + "epoch": 0.16336, + "grad_norm": 0.9629007161107034, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 16336 + }, + { + "epoch": 0.16337, + "grad_norm": 1.0902010043538788, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 16337 + }, + { + "epoch": 0.16338, + "grad_norm": 0.8025447207521035, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 16338 + }, + { + "epoch": 0.16339, + "grad_norm": 0.7361805815115818, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16339 + }, + { + "epoch": 0.1634, + "grad_norm": 0.6540008185315194, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 16340 + }, + { + "epoch": 0.16341, + "grad_norm": 0.688424768221228, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 16341 + }, + { + "epoch": 0.16342, + "grad_norm": 0.797647322547362, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 16342 + }, + { + "epoch": 0.16343, + "grad_norm": 0.8786609196121857, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 16343 + }, + { + "epoch": 0.16344, + "grad_norm": 0.9689555421782381, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 16344 + }, + { + "epoch": 0.16345, + "grad_norm": 1.010957931909869, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16345 + }, + { + "epoch": 0.16346, + "grad_norm": 1.1043381808748678, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 16346 + }, + { + "epoch": 0.16347, + "grad_norm": 0.8627539680852754, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 16347 + }, + { + "epoch": 0.16348, + "grad_norm": 0.9415344862017608, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 16348 + }, + { + "epoch": 0.16349, + "grad_norm": 0.9722106862140105, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 16349 + }, + { + "epoch": 0.1635, + "grad_norm": 0.9225391759167083, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 16350 + }, + { + "epoch": 0.16351, + "grad_norm": 0.9050708322400156, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 16351 + }, + { + "epoch": 0.16352, + "grad_norm": 0.9648755421215274, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 16352 + }, + { + "epoch": 0.16353, + "grad_norm": 0.94978974355883, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 16353 + }, + { + "epoch": 0.16354, + "grad_norm": 0.8414682214556534, + "learning_rate": 0.003, + "loss": 4.097, + "step": 16354 + }, + { + "epoch": 0.16355, + "grad_norm": 0.8776397789694375, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16355 + }, + { + "epoch": 0.16356, + "grad_norm": 0.9422987811838714, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 16356 + }, + { + "epoch": 0.16357, + "grad_norm": 0.8873496308807577, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16357 + }, + { + "epoch": 0.16358, + "grad_norm": 0.9517019081662924, + "learning_rate": 0.003, + "loss": 4.093, + "step": 16358 + }, + { + "epoch": 0.16359, + "grad_norm": 0.8816314640473634, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 16359 + }, + { + "epoch": 0.1636, + "grad_norm": 0.9012518179501146, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16360 + }, + { + "epoch": 0.16361, + "grad_norm": 1.072329376855215, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 16361 + }, + { + "epoch": 0.16362, + "grad_norm": 1.1132639228417447, + "learning_rate": 0.003, + "loss": 4.101, + "step": 16362 + }, + { + "epoch": 0.16363, + "grad_norm": 0.9357152587625169, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 16363 + }, + { + "epoch": 0.16364, + "grad_norm": 0.86584581619107, + "learning_rate": 0.003, + "loss": 4.089, + "step": 16364 + }, + { + "epoch": 0.16365, + "grad_norm": 0.8856370055503388, + "learning_rate": 0.003, + "loss": 4.082, + "step": 16365 + }, + { + "epoch": 0.16366, + "grad_norm": 0.9123103930623118, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16366 + }, + { + "epoch": 0.16367, + "grad_norm": 0.8428889183502868, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 16367 + }, + { + "epoch": 0.16368, + "grad_norm": 0.9180145944409419, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16368 + }, + { + "epoch": 0.16369, + "grad_norm": 0.829519571789251, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 16369 + }, + { + "epoch": 0.1637, + "grad_norm": 0.7900204959878786, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 16370 + }, + { + "epoch": 0.16371, + "grad_norm": 0.6797779972401826, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 16371 + }, + { + "epoch": 0.16372, + "grad_norm": 0.7262520028616694, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 16372 + }, + { + "epoch": 0.16373, + "grad_norm": 0.6769507847507134, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 16373 + }, + { + "epoch": 0.16374, + "grad_norm": 0.6338324807541841, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 16374 + }, + { + "epoch": 0.16375, + "grad_norm": 0.5978557858805862, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 16375 + }, + { + "epoch": 0.16376, + "grad_norm": 0.6375385979274388, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 16376 + }, + { + "epoch": 0.16377, + "grad_norm": 0.7618874770471668, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 16377 + }, + { + "epoch": 0.16378, + "grad_norm": 0.960624593465279, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 16378 + }, + { + "epoch": 0.16379, + "grad_norm": 1.307498950894558, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 16379 + }, + { + "epoch": 0.1638, + "grad_norm": 0.6842670158933465, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 16380 + }, + { + "epoch": 0.16381, + "grad_norm": 0.6605379375265492, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 16381 + }, + { + "epoch": 0.16382, + "grad_norm": 0.7131883422986482, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 16382 + }, + { + "epoch": 0.16383, + "grad_norm": 0.7352761940986067, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 16383 + }, + { + "epoch": 0.16384, + "grad_norm": 0.8802239132267842, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 16384 + }, + { + "epoch": 0.16385, + "grad_norm": 1.2980147396968764, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 16385 + }, + { + "epoch": 0.16386, + "grad_norm": 0.8717943810458164, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 16386 + }, + { + "epoch": 0.16387, + "grad_norm": 0.9473289171900946, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16387 + }, + { + "epoch": 0.16388, + "grad_norm": 1.1518247231894105, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 16388 + }, + { + "epoch": 0.16389, + "grad_norm": 0.9102925832056002, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 16389 + }, + { + "epoch": 0.1639, + "grad_norm": 0.865928627298715, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 16390 + }, + { + "epoch": 0.16391, + "grad_norm": 0.8432577037598782, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 16391 + }, + { + "epoch": 0.16392, + "grad_norm": 0.8210471714324126, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 16392 + }, + { + "epoch": 0.16393, + "grad_norm": 0.7105093895533194, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 16393 + }, + { + "epoch": 0.16394, + "grad_norm": 0.6653408636561565, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 16394 + }, + { + "epoch": 0.16395, + "grad_norm": 0.642852993617922, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 16395 + }, + { + "epoch": 0.16396, + "grad_norm": 0.6657922469312689, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 16396 + }, + { + "epoch": 0.16397, + "grad_norm": 0.734892109340266, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 16397 + }, + { + "epoch": 0.16398, + "grad_norm": 0.9900340614929757, + "learning_rate": 0.003, + "loss": 4.064, + "step": 16398 + }, + { + "epoch": 0.16399, + "grad_norm": 1.281382027096473, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 16399 + }, + { + "epoch": 0.164, + "grad_norm": 0.7489538583502504, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16400 + }, + { + "epoch": 0.16401, + "grad_norm": 0.819926182966586, + "learning_rate": 0.003, + "loss": 4.05, + "step": 16401 + }, + { + "epoch": 0.16402, + "grad_norm": 0.7900644421362657, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 16402 + }, + { + "epoch": 0.16403, + "grad_norm": 0.8795568912895309, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 16403 + }, + { + "epoch": 0.16404, + "grad_norm": 0.9187639580508348, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 16404 + }, + { + "epoch": 0.16405, + "grad_norm": 1.0592065825007824, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 16405 + }, + { + "epoch": 0.16406, + "grad_norm": 1.0370700965993263, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16406 + }, + { + "epoch": 0.16407, + "grad_norm": 0.8660839214754755, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 16407 + }, + { + "epoch": 0.16408, + "grad_norm": 0.7866820033002769, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 16408 + }, + { + "epoch": 0.16409, + "grad_norm": 0.9302742716268162, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 16409 + }, + { + "epoch": 0.1641, + "grad_norm": 1.10096361603536, + "learning_rate": 0.003, + "loss": 4.075, + "step": 16410 + }, + { + "epoch": 0.16411, + "grad_norm": 0.8598337818949621, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 16411 + }, + { + "epoch": 0.16412, + "grad_norm": 0.8344572187111701, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 16412 + }, + { + "epoch": 0.16413, + "grad_norm": 0.935291715553534, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16413 + }, + { + "epoch": 0.16414, + "grad_norm": 0.9773771759755003, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 16414 + }, + { + "epoch": 0.16415, + "grad_norm": 1.0974982570234062, + "learning_rate": 0.003, + "loss": 4.062, + "step": 16415 + }, + { + "epoch": 0.16416, + "grad_norm": 0.9382359630702094, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 16416 + }, + { + "epoch": 0.16417, + "grad_norm": 0.8753993968104263, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16417 + }, + { + "epoch": 0.16418, + "grad_norm": 0.8890932886780114, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 16418 + }, + { + "epoch": 0.16419, + "grad_norm": 0.9773397196278301, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16419 + }, + { + "epoch": 0.1642, + "grad_norm": 0.9569700811156342, + "learning_rate": 0.003, + "loss": 4.1154, + "step": 16420 + }, + { + "epoch": 0.16421, + "grad_norm": 0.9914231867676052, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 16421 + }, + { + "epoch": 0.16422, + "grad_norm": 1.061614469418064, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 16422 + }, + { + "epoch": 0.16423, + "grad_norm": 0.9802931770326155, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 16423 + }, + { + "epoch": 0.16424, + "grad_norm": 1.0521374393850662, + "learning_rate": 0.003, + "loss": 4.079, + "step": 16424 + }, + { + "epoch": 0.16425, + "grad_norm": 0.8627856733774925, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 16425 + }, + { + "epoch": 0.16426, + "grad_norm": 0.7240879568833387, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 16426 + }, + { + "epoch": 0.16427, + "grad_norm": 0.6263409993198499, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 16427 + }, + { + "epoch": 0.16428, + "grad_norm": 0.5612375917314804, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 16428 + }, + { + "epoch": 0.16429, + "grad_norm": 0.5676625695669967, + "learning_rate": 0.003, + "loss": 4.063, + "step": 16429 + }, + { + "epoch": 0.1643, + "grad_norm": 0.635340784985962, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 16430 + }, + { + "epoch": 0.16431, + "grad_norm": 0.7579785372543869, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16431 + }, + { + "epoch": 0.16432, + "grad_norm": 0.854104037492739, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 16432 + }, + { + "epoch": 0.16433, + "grad_norm": 0.9854975460081391, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 16433 + }, + { + "epoch": 0.16434, + "grad_norm": 1.054683157363344, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 16434 + }, + { + "epoch": 0.16435, + "grad_norm": 0.9408684826274759, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 16435 + }, + { + "epoch": 0.16436, + "grad_norm": 0.9383415831827594, + "learning_rate": 0.003, + "loss": 4.067, + "step": 16436 + }, + { + "epoch": 0.16437, + "grad_norm": 0.8808857016465607, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 16437 + }, + { + "epoch": 0.16438, + "grad_norm": 0.7824759991232686, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 16438 + }, + { + "epoch": 0.16439, + "grad_norm": 0.7433967595371673, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16439 + }, + { + "epoch": 0.1644, + "grad_norm": 0.8026972361328236, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 16440 + }, + { + "epoch": 0.16441, + "grad_norm": 0.8636161760661323, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 16441 + }, + { + "epoch": 0.16442, + "grad_norm": 0.9149443557754856, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 16442 + }, + { + "epoch": 0.16443, + "grad_norm": 0.9370834323878338, + "learning_rate": 0.003, + "loss": 4.098, + "step": 16443 + }, + { + "epoch": 0.16444, + "grad_norm": 0.9478938788650303, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 16444 + }, + { + "epoch": 0.16445, + "grad_norm": 0.92959502518127, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 16445 + }, + { + "epoch": 0.16446, + "grad_norm": 0.8715990239886671, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16446 + }, + { + "epoch": 0.16447, + "grad_norm": 0.9400935955138296, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 16447 + }, + { + "epoch": 0.16448, + "grad_norm": 0.9811240436203806, + "learning_rate": 0.003, + "loss": 4.092, + "step": 16448 + }, + { + "epoch": 0.16449, + "grad_norm": 0.8918189523580856, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 16449 + }, + { + "epoch": 0.1645, + "grad_norm": 0.7194053286802724, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 16450 + }, + { + "epoch": 0.16451, + "grad_norm": 0.8774922168689883, + "learning_rate": 0.003, + "loss": 4.076, + "step": 16451 + }, + { + "epoch": 0.16452, + "grad_norm": 1.0458245173342429, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 16452 + }, + { + "epoch": 0.16453, + "grad_norm": 1.0098898686793907, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 16453 + }, + { + "epoch": 0.16454, + "grad_norm": 1.048560219790088, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 16454 + }, + { + "epoch": 0.16455, + "grad_norm": 0.92618933646125, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 16455 + }, + { + "epoch": 0.16456, + "grad_norm": 0.8927600222337699, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 16456 + }, + { + "epoch": 0.16457, + "grad_norm": 0.8178539590392163, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 16457 + }, + { + "epoch": 0.16458, + "grad_norm": 0.7201786957361529, + "learning_rate": 0.003, + "loss": 4.055, + "step": 16458 + }, + { + "epoch": 0.16459, + "grad_norm": 0.8265961160008374, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16459 + }, + { + "epoch": 0.1646, + "grad_norm": 0.8666697955312822, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 16460 + }, + { + "epoch": 0.16461, + "grad_norm": 0.87034742661441, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 16461 + }, + { + "epoch": 0.16462, + "grad_norm": 1.1384608015954887, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 16462 + }, + { + "epoch": 0.16463, + "grad_norm": 0.9153887291272492, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 16463 + }, + { + "epoch": 0.16464, + "grad_norm": 0.8115386706398814, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 16464 + }, + { + "epoch": 0.16465, + "grad_norm": 0.8466232506914497, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 16465 + }, + { + "epoch": 0.16466, + "grad_norm": 0.861983793500373, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 16466 + }, + { + "epoch": 0.16467, + "grad_norm": 0.8624133882028252, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 16467 + }, + { + "epoch": 0.16468, + "grad_norm": 0.9831016147187888, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 16468 + }, + { + "epoch": 0.16469, + "grad_norm": 1.086376722450923, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 16469 + }, + { + "epoch": 0.1647, + "grad_norm": 0.9782884286370237, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16470 + }, + { + "epoch": 0.16471, + "grad_norm": 1.0671488756490388, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 16471 + }, + { + "epoch": 0.16472, + "grad_norm": 0.9754565797535374, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 16472 + }, + { + "epoch": 0.16473, + "grad_norm": 0.9847542732824185, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 16473 + }, + { + "epoch": 0.16474, + "grad_norm": 0.9742436486561515, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 16474 + }, + { + "epoch": 0.16475, + "grad_norm": 0.9689392722162178, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 16475 + }, + { + "epoch": 0.16476, + "grad_norm": 1.095996390726339, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 16476 + }, + { + "epoch": 0.16477, + "grad_norm": 0.9124930058545174, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 16477 + }, + { + "epoch": 0.16478, + "grad_norm": 0.9138302018141485, + "learning_rate": 0.003, + "loss": 4.1073, + "step": 16478 + }, + { + "epoch": 0.16479, + "grad_norm": 0.8989537665410646, + "learning_rate": 0.003, + "loss": 4.08, + "step": 16479 + }, + { + "epoch": 0.1648, + "grad_norm": 0.9419921335711516, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 16480 + }, + { + "epoch": 0.16481, + "grad_norm": 0.9818098036597075, + "learning_rate": 0.003, + "loss": 4.08, + "step": 16481 + }, + { + "epoch": 0.16482, + "grad_norm": 0.9857383085844478, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 16482 + }, + { + "epoch": 0.16483, + "grad_norm": 0.9670746894077116, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 16483 + }, + { + "epoch": 0.16484, + "grad_norm": 1.1323020032533249, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 16484 + }, + { + "epoch": 0.16485, + "grad_norm": 0.9834520956577053, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 16485 + }, + { + "epoch": 0.16486, + "grad_norm": 0.8026751155382573, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 16486 + }, + { + "epoch": 0.16487, + "grad_norm": 0.6228904546696008, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 16487 + }, + { + "epoch": 0.16488, + "grad_norm": 0.6050614777062014, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 16488 + }, + { + "epoch": 0.16489, + "grad_norm": 0.6695305780666493, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 16489 + }, + { + "epoch": 0.1649, + "grad_norm": 0.7154645577664862, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 16490 + }, + { + "epoch": 0.16491, + "grad_norm": 0.7377130840357033, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 16491 + }, + { + "epoch": 0.16492, + "grad_norm": 0.7203077541945648, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 16492 + }, + { + "epoch": 0.16493, + "grad_norm": 0.7365393404657381, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 16493 + }, + { + "epoch": 0.16494, + "grad_norm": 0.8644012481872585, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 16494 + }, + { + "epoch": 0.16495, + "grad_norm": 1.0341330029439595, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 16495 + }, + { + "epoch": 0.16496, + "grad_norm": 1.0165424018268916, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16496 + }, + { + "epoch": 0.16497, + "grad_norm": 0.8879322560066397, + "learning_rate": 0.003, + "loss": 4.025, + "step": 16497 + }, + { + "epoch": 0.16498, + "grad_norm": 0.7631123782889997, + "learning_rate": 0.003, + "loss": 4.014, + "step": 16498 + }, + { + "epoch": 0.16499, + "grad_norm": 0.5842517604191956, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 16499 + }, + { + "epoch": 0.165, + "grad_norm": 0.5699230875663656, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 16500 + }, + { + "epoch": 0.16501, + "grad_norm": 0.5382001439361487, + "learning_rate": 0.003, + "loss": 4.069, + "step": 16501 + }, + { + "epoch": 0.16502, + "grad_norm": 0.5982219845821781, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 16502 + }, + { + "epoch": 0.16503, + "grad_norm": 0.6010589571428604, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 16503 + }, + { + "epoch": 0.16504, + "grad_norm": 0.5939698677680427, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 16504 + }, + { + "epoch": 0.16505, + "grad_norm": 0.6108774309732249, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16505 + }, + { + "epoch": 0.16506, + "grad_norm": 0.6375858425394414, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 16506 + }, + { + "epoch": 0.16507, + "grad_norm": 0.6959426724754599, + "learning_rate": 0.003, + "loss": 4.061, + "step": 16507 + }, + { + "epoch": 0.16508, + "grad_norm": 0.8847577219107231, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 16508 + }, + { + "epoch": 0.16509, + "grad_norm": 1.190871093756928, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 16509 + }, + { + "epoch": 0.1651, + "grad_norm": 0.9961046853051119, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 16510 + }, + { + "epoch": 0.16511, + "grad_norm": 1.0788096121797417, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 16511 + }, + { + "epoch": 0.16512, + "grad_norm": 0.8699285198446157, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 16512 + }, + { + "epoch": 0.16513, + "grad_norm": 0.7673644941699407, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 16513 + }, + { + "epoch": 0.16514, + "grad_norm": 0.6915244832844373, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 16514 + }, + { + "epoch": 0.16515, + "grad_norm": 0.6743873426586414, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 16515 + }, + { + "epoch": 0.16516, + "grad_norm": 0.7454244375026999, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 16516 + }, + { + "epoch": 0.16517, + "grad_norm": 0.9299992427701205, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 16517 + }, + { + "epoch": 0.16518, + "grad_norm": 1.0993407224898688, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16518 + }, + { + "epoch": 0.16519, + "grad_norm": 0.9677895313461976, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16519 + }, + { + "epoch": 0.1652, + "grad_norm": 0.9507837271180546, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 16520 + }, + { + "epoch": 0.16521, + "grad_norm": 0.7637014412033272, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 16521 + }, + { + "epoch": 0.16522, + "grad_norm": 0.7530902468779367, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 16522 + }, + { + "epoch": 0.16523, + "grad_norm": 0.7588941082123793, + "learning_rate": 0.003, + "loss": 4.076, + "step": 16523 + }, + { + "epoch": 0.16524, + "grad_norm": 0.7409911017180301, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 16524 + }, + { + "epoch": 0.16525, + "grad_norm": 0.7187504248398197, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 16525 + }, + { + "epoch": 0.16526, + "grad_norm": 0.6484130371471625, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 16526 + }, + { + "epoch": 0.16527, + "grad_norm": 0.7458645932079909, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16527 + }, + { + "epoch": 0.16528, + "grad_norm": 0.926064666970427, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 16528 + }, + { + "epoch": 0.16529, + "grad_norm": 1.2064446265719213, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 16529 + }, + { + "epoch": 0.1653, + "grad_norm": 0.9259164416529385, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 16530 + }, + { + "epoch": 0.16531, + "grad_norm": 0.8681080010290746, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 16531 + }, + { + "epoch": 0.16532, + "grad_norm": 0.8127777297072766, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 16532 + }, + { + "epoch": 0.16533, + "grad_norm": 0.8678429048267168, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 16533 + }, + { + "epoch": 0.16534, + "grad_norm": 0.8985386617651252, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16534 + }, + { + "epoch": 0.16535, + "grad_norm": 1.0167292675159916, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 16535 + }, + { + "epoch": 0.16536, + "grad_norm": 1.0698354837164155, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 16536 + }, + { + "epoch": 0.16537, + "grad_norm": 0.9633811963508263, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 16537 + }, + { + "epoch": 0.16538, + "grad_norm": 0.9497531805329531, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 16538 + }, + { + "epoch": 0.16539, + "grad_norm": 1.0383203381577468, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 16539 + }, + { + "epoch": 0.1654, + "grad_norm": 1.0997067760078798, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 16540 + }, + { + "epoch": 0.16541, + "grad_norm": 1.0736621453658377, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 16541 + }, + { + "epoch": 0.16542, + "grad_norm": 0.9040243208029298, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16542 + }, + { + "epoch": 0.16543, + "grad_norm": 0.8621822194244528, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 16543 + }, + { + "epoch": 0.16544, + "grad_norm": 0.9084383717664533, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 16544 + }, + { + "epoch": 0.16545, + "grad_norm": 0.9375564496584654, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 16545 + }, + { + "epoch": 0.16546, + "grad_norm": 0.9801135091236702, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 16546 + }, + { + "epoch": 0.16547, + "grad_norm": 1.0640556603633693, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 16547 + }, + { + "epoch": 0.16548, + "grad_norm": 1.1697631645113367, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 16548 + }, + { + "epoch": 0.16549, + "grad_norm": 1.040338745911738, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16549 + }, + { + "epoch": 0.1655, + "grad_norm": 1.152265149590593, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 16550 + }, + { + "epoch": 0.16551, + "grad_norm": 0.9438970192293612, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 16551 + }, + { + "epoch": 0.16552, + "grad_norm": 0.9472087474276947, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16552 + }, + { + "epoch": 0.16553, + "grad_norm": 0.8642983983001351, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 16553 + }, + { + "epoch": 0.16554, + "grad_norm": 0.7846420637771692, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 16554 + }, + { + "epoch": 0.16555, + "grad_norm": 0.8344787520798734, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 16555 + }, + { + "epoch": 0.16556, + "grad_norm": 0.9895997187897928, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 16556 + }, + { + "epoch": 0.16557, + "grad_norm": 1.2059192307357727, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 16557 + }, + { + "epoch": 0.16558, + "grad_norm": 0.8108602019844887, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 16558 + }, + { + "epoch": 0.16559, + "grad_norm": 0.8079073457719984, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 16559 + }, + { + "epoch": 0.1656, + "grad_norm": 0.9325977252728038, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 16560 + }, + { + "epoch": 0.16561, + "grad_norm": 1.0383802288301007, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 16561 + }, + { + "epoch": 0.16562, + "grad_norm": 1.0914868529759103, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 16562 + }, + { + "epoch": 0.16563, + "grad_norm": 0.8461510347991466, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 16563 + }, + { + "epoch": 0.16564, + "grad_norm": 0.7545415344715569, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 16564 + }, + { + "epoch": 0.16565, + "grad_norm": 0.6071515416738734, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 16565 + }, + { + "epoch": 0.16566, + "grad_norm": 0.6269742992012324, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 16566 + }, + { + "epoch": 0.16567, + "grad_norm": 0.7214176988230578, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 16567 + }, + { + "epoch": 0.16568, + "grad_norm": 0.793749521150152, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 16568 + }, + { + "epoch": 0.16569, + "grad_norm": 0.8544473789887507, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16569 + }, + { + "epoch": 0.1657, + "grad_norm": 0.8478055298450039, + "learning_rate": 0.003, + "loss": 4.058, + "step": 16570 + }, + { + "epoch": 0.16571, + "grad_norm": 0.8715598001196075, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 16571 + }, + { + "epoch": 0.16572, + "grad_norm": 1.035496007750827, + "learning_rate": 0.003, + "loss": 4.08, + "step": 16572 + }, + { + "epoch": 0.16573, + "grad_norm": 1.1043299237619633, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 16573 + }, + { + "epoch": 0.16574, + "grad_norm": 0.8472709139237138, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 16574 + }, + { + "epoch": 0.16575, + "grad_norm": 0.8707354926263987, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 16575 + }, + { + "epoch": 0.16576, + "grad_norm": 0.9025740564326497, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 16576 + }, + { + "epoch": 0.16577, + "grad_norm": 0.9134741559472201, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16577 + }, + { + "epoch": 0.16578, + "grad_norm": 0.9904958243395321, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 16578 + }, + { + "epoch": 0.16579, + "grad_norm": 1.0378807489332245, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16579 + }, + { + "epoch": 0.1658, + "grad_norm": 1.0165662059122322, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 16580 + }, + { + "epoch": 0.16581, + "grad_norm": 1.1727087192109356, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 16581 + }, + { + "epoch": 0.16582, + "grad_norm": 0.9098497869999458, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 16582 + }, + { + "epoch": 0.16583, + "grad_norm": 1.1607073860592807, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 16583 + }, + { + "epoch": 0.16584, + "grad_norm": 0.98421831903711, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 16584 + }, + { + "epoch": 0.16585, + "grad_norm": 1.0352122742152954, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 16585 + }, + { + "epoch": 0.16586, + "grad_norm": 1.0343020941022698, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 16586 + }, + { + "epoch": 0.16587, + "grad_norm": 0.9208642439125199, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 16587 + }, + { + "epoch": 0.16588, + "grad_norm": 0.988283427584848, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 16588 + }, + { + "epoch": 0.16589, + "grad_norm": 1.0723578577185815, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 16589 + }, + { + "epoch": 0.1659, + "grad_norm": 1.0865388838071601, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16590 + }, + { + "epoch": 0.16591, + "grad_norm": 0.7835732754712912, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 16591 + }, + { + "epoch": 0.16592, + "grad_norm": 0.7080299673094684, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 16592 + }, + { + "epoch": 0.16593, + "grad_norm": 0.7952753455467056, + "learning_rate": 0.003, + "loss": 4.07, + "step": 16593 + }, + { + "epoch": 0.16594, + "grad_norm": 0.8025023491675204, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 16594 + }, + { + "epoch": 0.16595, + "grad_norm": 0.8434752063860484, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16595 + }, + { + "epoch": 0.16596, + "grad_norm": 0.7576039699436765, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 16596 + }, + { + "epoch": 0.16597, + "grad_norm": 0.8040176359188829, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 16597 + }, + { + "epoch": 0.16598, + "grad_norm": 0.8108078582877047, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 16598 + }, + { + "epoch": 0.16599, + "grad_norm": 0.9098138069546619, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 16599 + }, + { + "epoch": 0.166, + "grad_norm": 1.1119201512737737, + "learning_rate": 0.003, + "loss": 4.035, + "step": 16600 + }, + { + "epoch": 0.16601, + "grad_norm": 1.1905088097032037, + "learning_rate": 0.003, + "loss": 4.1055, + "step": 16601 + }, + { + "epoch": 0.16602, + "grad_norm": 0.8303069600930817, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 16602 + }, + { + "epoch": 0.16603, + "grad_norm": 0.7689642488159598, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 16603 + }, + { + "epoch": 0.16604, + "grad_norm": 0.8280858237676337, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 16604 + }, + { + "epoch": 0.16605, + "grad_norm": 0.8886330843711158, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 16605 + }, + { + "epoch": 0.16606, + "grad_norm": 1.0974928983661676, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 16606 + }, + { + "epoch": 0.16607, + "grad_norm": 0.9574499041192838, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 16607 + }, + { + "epoch": 0.16608, + "grad_norm": 0.9010072595434921, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 16608 + }, + { + "epoch": 0.16609, + "grad_norm": 0.9643310481457389, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 16609 + }, + { + "epoch": 0.1661, + "grad_norm": 1.0793575735585432, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 16610 + }, + { + "epoch": 0.16611, + "grad_norm": 1.1066751960261507, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 16611 + }, + { + "epoch": 0.16612, + "grad_norm": 0.8653315805591723, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 16612 + }, + { + "epoch": 0.16613, + "grad_norm": 0.7261582929574493, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 16613 + }, + { + "epoch": 0.16614, + "grad_norm": 0.6829087825872896, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 16614 + }, + { + "epoch": 0.16615, + "grad_norm": 0.7054220761382889, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 16615 + }, + { + "epoch": 0.16616, + "grad_norm": 0.7378934716762852, + "learning_rate": 0.003, + "loss": 4.1173, + "step": 16616 + }, + { + "epoch": 0.16617, + "grad_norm": 0.785770852990258, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 16617 + }, + { + "epoch": 0.16618, + "grad_norm": 0.8678169089350518, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16618 + }, + { + "epoch": 0.16619, + "grad_norm": 1.0215330215473912, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 16619 + }, + { + "epoch": 0.1662, + "grad_norm": 1.1641532489683266, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 16620 + }, + { + "epoch": 0.16621, + "grad_norm": 0.9076208909961722, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 16621 + }, + { + "epoch": 0.16622, + "grad_norm": 0.8762027267478063, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 16622 + }, + { + "epoch": 0.16623, + "grad_norm": 0.7619567616157618, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 16623 + }, + { + "epoch": 0.16624, + "grad_norm": 0.7046215411103308, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 16624 + }, + { + "epoch": 0.16625, + "grad_norm": 0.7429471799885354, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 16625 + }, + { + "epoch": 0.16626, + "grad_norm": 0.878992974495442, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 16626 + }, + { + "epoch": 0.16627, + "grad_norm": 0.9455512526890147, + "learning_rate": 0.003, + "loss": 4.076, + "step": 16627 + }, + { + "epoch": 0.16628, + "grad_norm": 1.0127633391896531, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16628 + }, + { + "epoch": 0.16629, + "grad_norm": 0.9397277193198198, + "learning_rate": 0.003, + "loss": 4.1036, + "step": 16629 + }, + { + "epoch": 0.1663, + "grad_norm": 0.9040521564205963, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 16630 + }, + { + "epoch": 0.16631, + "grad_norm": 0.8983255252938565, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16631 + }, + { + "epoch": 0.16632, + "grad_norm": 0.8984472253596174, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 16632 + }, + { + "epoch": 0.16633, + "grad_norm": 1.0058179192726155, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16633 + }, + { + "epoch": 0.16634, + "grad_norm": 1.0815490574704374, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 16634 + }, + { + "epoch": 0.16635, + "grad_norm": 0.8688003355942993, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 16635 + }, + { + "epoch": 0.16636, + "grad_norm": 0.7486544256271357, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 16636 + }, + { + "epoch": 0.16637, + "grad_norm": 0.6631626591427247, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 16637 + }, + { + "epoch": 0.16638, + "grad_norm": 0.6639624215462988, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 16638 + }, + { + "epoch": 0.16639, + "grad_norm": 0.7204267843529744, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16639 + }, + { + "epoch": 0.1664, + "grad_norm": 0.7674416371335842, + "learning_rate": 0.003, + "loss": 4.088, + "step": 16640 + }, + { + "epoch": 0.16641, + "grad_norm": 0.7348282416012347, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 16641 + }, + { + "epoch": 0.16642, + "grad_norm": 0.6580299164326259, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16642 + }, + { + "epoch": 0.16643, + "grad_norm": 0.7486574461277886, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 16643 + }, + { + "epoch": 0.16644, + "grad_norm": 0.8410891945812161, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 16644 + }, + { + "epoch": 0.16645, + "grad_norm": 0.9015567725756612, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 16645 + }, + { + "epoch": 0.16646, + "grad_norm": 0.9497849783573655, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 16646 + }, + { + "epoch": 0.16647, + "grad_norm": 1.044784560631695, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 16647 + }, + { + "epoch": 0.16648, + "grad_norm": 1.2408840364808897, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 16648 + }, + { + "epoch": 0.16649, + "grad_norm": 0.9118155901804015, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16649 + }, + { + "epoch": 0.1665, + "grad_norm": 0.8503313840325569, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 16650 + }, + { + "epoch": 0.16651, + "grad_norm": 0.8852096406645081, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 16651 + }, + { + "epoch": 0.16652, + "grad_norm": 0.8274706247869366, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16652 + }, + { + "epoch": 0.16653, + "grad_norm": 0.8336659895344166, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 16653 + }, + { + "epoch": 0.16654, + "grad_norm": 0.7964732585715956, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 16654 + }, + { + "epoch": 0.16655, + "grad_norm": 0.713936537676871, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 16655 + }, + { + "epoch": 0.16656, + "grad_norm": 0.7636968286517395, + "learning_rate": 0.003, + "loss": 4.046, + "step": 16656 + }, + { + "epoch": 0.16657, + "grad_norm": 0.9199190974969985, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 16657 + }, + { + "epoch": 0.16658, + "grad_norm": 1.1881161215787814, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 16658 + }, + { + "epoch": 0.16659, + "grad_norm": 0.8359390921559996, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 16659 + }, + { + "epoch": 0.1666, + "grad_norm": 0.6908175104092933, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 16660 + }, + { + "epoch": 0.16661, + "grad_norm": 0.7125203353404347, + "learning_rate": 0.003, + "loss": 4.04, + "step": 16661 + }, + { + "epoch": 0.16662, + "grad_norm": 0.7957518128079261, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 16662 + }, + { + "epoch": 0.16663, + "grad_norm": 0.8126516335374254, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 16663 + }, + { + "epoch": 0.16664, + "grad_norm": 0.8981446676842118, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 16664 + }, + { + "epoch": 0.16665, + "grad_norm": 0.9207710774603982, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 16665 + }, + { + "epoch": 0.16666, + "grad_norm": 1.0670631481523027, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 16666 + }, + { + "epoch": 0.16667, + "grad_norm": 1.1159190925384723, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16667 + }, + { + "epoch": 0.16668, + "grad_norm": 0.8755316547655639, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16668 + }, + { + "epoch": 0.16669, + "grad_norm": 0.823923889411584, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 16669 + }, + { + "epoch": 0.1667, + "grad_norm": 0.9523476313505719, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 16670 + }, + { + "epoch": 0.16671, + "grad_norm": 1.1044233384045328, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 16671 + }, + { + "epoch": 0.16672, + "grad_norm": 1.0050174126244413, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 16672 + }, + { + "epoch": 0.16673, + "grad_norm": 0.9198252547094358, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 16673 + }, + { + "epoch": 0.16674, + "grad_norm": 1.0250224458877255, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16674 + }, + { + "epoch": 0.16675, + "grad_norm": 1.0674086051971232, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 16675 + }, + { + "epoch": 0.16676, + "grad_norm": 1.034548655186563, + "learning_rate": 0.003, + "loss": 4.059, + "step": 16676 + }, + { + "epoch": 0.16677, + "grad_norm": 0.8363754754531006, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 16677 + }, + { + "epoch": 0.16678, + "grad_norm": 1.0221453849310609, + "learning_rate": 0.003, + "loss": 4.085, + "step": 16678 + }, + { + "epoch": 0.16679, + "grad_norm": 1.2720385853628837, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 16679 + }, + { + "epoch": 0.1668, + "grad_norm": 0.8660987181035079, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 16680 + }, + { + "epoch": 0.16681, + "grad_norm": 0.8373537570517481, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 16681 + }, + { + "epoch": 0.16682, + "grad_norm": 0.7504840831491315, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 16682 + }, + { + "epoch": 0.16683, + "grad_norm": 0.8274429077104438, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16683 + }, + { + "epoch": 0.16684, + "grad_norm": 0.8174868885434718, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 16684 + }, + { + "epoch": 0.16685, + "grad_norm": 0.6824686537776473, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 16685 + }, + { + "epoch": 0.16686, + "grad_norm": 0.6726549298968327, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 16686 + }, + { + "epoch": 0.16687, + "grad_norm": 0.7826364611423184, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 16687 + }, + { + "epoch": 0.16688, + "grad_norm": 0.8458249616171628, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 16688 + }, + { + "epoch": 0.16689, + "grad_norm": 0.892481267185803, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 16689 + }, + { + "epoch": 0.1669, + "grad_norm": 1.119182684290395, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 16690 + }, + { + "epoch": 0.16691, + "grad_norm": 1.0885676419308759, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 16691 + }, + { + "epoch": 0.16692, + "grad_norm": 0.8871868582661631, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 16692 + }, + { + "epoch": 0.16693, + "grad_norm": 0.9075311293445458, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 16693 + }, + { + "epoch": 0.16694, + "grad_norm": 0.8219362385852115, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 16694 + }, + { + "epoch": 0.16695, + "grad_norm": 0.7217350868604105, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 16695 + }, + { + "epoch": 0.16696, + "grad_norm": 0.6635000555575337, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 16696 + }, + { + "epoch": 0.16697, + "grad_norm": 0.7468625388147383, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 16697 + }, + { + "epoch": 0.16698, + "grad_norm": 0.7569472862038678, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 16698 + }, + { + "epoch": 0.16699, + "grad_norm": 0.688775075962298, + "learning_rate": 0.003, + "loss": 4.041, + "step": 16699 + }, + { + "epoch": 0.167, + "grad_norm": 0.7881719998264674, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 16700 + }, + { + "epoch": 0.16701, + "grad_norm": 0.9828491392111073, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 16701 + }, + { + "epoch": 0.16702, + "grad_norm": 1.1219184881620372, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 16702 + }, + { + "epoch": 0.16703, + "grad_norm": 1.030981913210529, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 16703 + }, + { + "epoch": 0.16704, + "grad_norm": 0.9494470020091834, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 16704 + }, + { + "epoch": 0.16705, + "grad_norm": 1.1174629570337802, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 16705 + }, + { + "epoch": 0.16706, + "grad_norm": 1.0142969666302373, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 16706 + }, + { + "epoch": 0.16707, + "grad_norm": 0.7572277314580239, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 16707 + }, + { + "epoch": 0.16708, + "grad_norm": 0.7004178767027441, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 16708 + }, + { + "epoch": 0.16709, + "grad_norm": 0.6831002573420436, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 16709 + }, + { + "epoch": 0.1671, + "grad_norm": 0.7516793822840889, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 16710 + }, + { + "epoch": 0.16711, + "grad_norm": 0.7018536133780285, + "learning_rate": 0.003, + "loss": 4.034, + "step": 16711 + }, + { + "epoch": 0.16712, + "grad_norm": 0.6719485607798802, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 16712 + }, + { + "epoch": 0.16713, + "grad_norm": 0.6965262460342196, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 16713 + }, + { + "epoch": 0.16714, + "grad_norm": 0.6724165567767315, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 16714 + }, + { + "epoch": 0.16715, + "grad_norm": 0.5872136714796434, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 16715 + }, + { + "epoch": 0.16716, + "grad_norm": 0.5840214977574147, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 16716 + }, + { + "epoch": 0.16717, + "grad_norm": 0.7265828835126165, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 16717 + }, + { + "epoch": 0.16718, + "grad_norm": 1.0963337524529386, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 16718 + }, + { + "epoch": 0.16719, + "grad_norm": 1.1380126288081818, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 16719 + }, + { + "epoch": 0.1672, + "grad_norm": 0.8385039946950212, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 16720 + }, + { + "epoch": 0.16721, + "grad_norm": 0.680709102532998, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 16721 + }, + { + "epoch": 0.16722, + "grad_norm": 0.653449249403669, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 16722 + }, + { + "epoch": 0.16723, + "grad_norm": 0.716150777958729, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 16723 + }, + { + "epoch": 0.16724, + "grad_norm": 0.7306702448232821, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 16724 + }, + { + "epoch": 0.16725, + "grad_norm": 0.9621919836828025, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 16725 + }, + { + "epoch": 0.16726, + "grad_norm": 1.1186628933956517, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 16726 + }, + { + "epoch": 0.16727, + "grad_norm": 0.7780106037705267, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 16727 + }, + { + "epoch": 0.16728, + "grad_norm": 0.853706730233926, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16728 + }, + { + "epoch": 0.16729, + "grad_norm": 0.9424718190049373, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 16729 + }, + { + "epoch": 0.1673, + "grad_norm": 1.122484521377639, + "learning_rate": 0.003, + "loss": 4.12, + "step": 16730 + }, + { + "epoch": 0.16731, + "grad_norm": 1.0243242933080432, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16731 + }, + { + "epoch": 0.16732, + "grad_norm": 1.0680102268558531, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 16732 + }, + { + "epoch": 0.16733, + "grad_norm": 1.0557716208896646, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 16733 + }, + { + "epoch": 0.16734, + "grad_norm": 1.0717652968490432, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 16734 + }, + { + "epoch": 0.16735, + "grad_norm": 1.0822275586531358, + "learning_rate": 0.003, + "loss": 4.1083, + "step": 16735 + }, + { + "epoch": 0.16736, + "grad_norm": 1.039304281817275, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 16736 + }, + { + "epoch": 0.16737, + "grad_norm": 1.1432653601237608, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 16737 + }, + { + "epoch": 0.16738, + "grad_norm": 0.9824314171564776, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 16738 + }, + { + "epoch": 0.16739, + "grad_norm": 0.9860950685151864, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16739 + }, + { + "epoch": 0.1674, + "grad_norm": 1.0728495562678955, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 16740 + }, + { + "epoch": 0.16741, + "grad_norm": 1.0102484941139138, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 16741 + }, + { + "epoch": 0.16742, + "grad_norm": 1.0639365017792461, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 16742 + }, + { + "epoch": 0.16743, + "grad_norm": 1.0673694705953947, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 16743 + }, + { + "epoch": 0.16744, + "grad_norm": 1.1906288768068023, + "learning_rate": 0.003, + "loss": 4.1095, + "step": 16744 + }, + { + "epoch": 0.16745, + "grad_norm": 1.0099529214659242, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 16745 + }, + { + "epoch": 0.16746, + "grad_norm": 0.7696819389456069, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 16746 + }, + { + "epoch": 0.16747, + "grad_norm": 0.7087115276953534, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 16747 + }, + { + "epoch": 0.16748, + "grad_norm": 0.6618824327696065, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 16748 + }, + { + "epoch": 0.16749, + "grad_norm": 0.7209320568784208, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 16749 + }, + { + "epoch": 0.1675, + "grad_norm": 0.64605340164109, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 16750 + }, + { + "epoch": 0.16751, + "grad_norm": 0.6514167008229829, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16751 + }, + { + "epoch": 0.16752, + "grad_norm": 0.6984598851722965, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 16752 + }, + { + "epoch": 0.16753, + "grad_norm": 0.806041428263782, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 16753 + }, + { + "epoch": 0.16754, + "grad_norm": 0.8372914861193819, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 16754 + }, + { + "epoch": 0.16755, + "grad_norm": 0.8948825987007641, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 16755 + }, + { + "epoch": 0.16756, + "grad_norm": 0.9066786063886627, + "learning_rate": 0.003, + "loss": 4.035, + "step": 16756 + }, + { + "epoch": 0.16757, + "grad_norm": 0.8809460789147684, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 16757 + }, + { + "epoch": 0.16758, + "grad_norm": 0.9161379809246357, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 16758 + }, + { + "epoch": 0.16759, + "grad_norm": 0.8440855019154663, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 16759 + }, + { + "epoch": 0.1676, + "grad_norm": 0.8539885715059418, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16760 + }, + { + "epoch": 0.16761, + "grad_norm": 0.9609780072492677, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 16761 + }, + { + "epoch": 0.16762, + "grad_norm": 1.1560485532817966, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 16762 + }, + { + "epoch": 0.16763, + "grad_norm": 0.8631234786759063, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 16763 + }, + { + "epoch": 0.16764, + "grad_norm": 0.8246721104258516, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 16764 + }, + { + "epoch": 0.16765, + "grad_norm": 0.7935901119471722, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16765 + }, + { + "epoch": 0.16766, + "grad_norm": 0.7896718099564398, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 16766 + }, + { + "epoch": 0.16767, + "grad_norm": 0.8989561638589851, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 16767 + }, + { + "epoch": 0.16768, + "grad_norm": 0.8084974032941903, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 16768 + }, + { + "epoch": 0.16769, + "grad_norm": 0.6636195747898832, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 16769 + }, + { + "epoch": 0.1677, + "grad_norm": 0.6090637753516286, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 16770 + }, + { + "epoch": 0.16771, + "grad_norm": 0.716113509832652, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 16771 + }, + { + "epoch": 0.16772, + "grad_norm": 1.0752000136954618, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 16772 + }, + { + "epoch": 0.16773, + "grad_norm": 1.2568956870851655, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 16773 + }, + { + "epoch": 0.16774, + "grad_norm": 0.690306332321971, + "learning_rate": 0.003, + "loss": 4.068, + "step": 16774 + }, + { + "epoch": 0.16775, + "grad_norm": 0.7003821794820632, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 16775 + }, + { + "epoch": 0.16776, + "grad_norm": 0.7885371785917205, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 16776 + }, + { + "epoch": 0.16777, + "grad_norm": 0.7484215582500324, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 16777 + }, + { + "epoch": 0.16778, + "grad_norm": 0.8139857628190452, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16778 + }, + { + "epoch": 0.16779, + "grad_norm": 0.7820292388934653, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 16779 + }, + { + "epoch": 0.1678, + "grad_norm": 0.8103586517041421, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 16780 + }, + { + "epoch": 0.16781, + "grad_norm": 0.9732764987293661, + "learning_rate": 0.003, + "loss": 4.1115, + "step": 16781 + }, + { + "epoch": 0.16782, + "grad_norm": 1.0508315937354935, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 16782 + }, + { + "epoch": 0.16783, + "grad_norm": 1.1351229332308257, + "learning_rate": 0.003, + "loss": 4.093, + "step": 16783 + }, + { + "epoch": 0.16784, + "grad_norm": 0.9168068840896803, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 16784 + }, + { + "epoch": 0.16785, + "grad_norm": 0.8953446696867371, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 16785 + }, + { + "epoch": 0.16786, + "grad_norm": 0.8365816798135379, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16786 + }, + { + "epoch": 0.16787, + "grad_norm": 0.8323806854912139, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 16787 + }, + { + "epoch": 0.16788, + "grad_norm": 0.9497142736006731, + "learning_rate": 0.003, + "loss": 4.095, + "step": 16788 + }, + { + "epoch": 0.16789, + "grad_norm": 1.096878713543592, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 16789 + }, + { + "epoch": 0.1679, + "grad_norm": 1.1031005450161495, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 16790 + }, + { + "epoch": 0.16791, + "grad_norm": 0.8738495470403181, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 16791 + }, + { + "epoch": 0.16792, + "grad_norm": 0.9152539305550742, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 16792 + }, + { + "epoch": 0.16793, + "grad_norm": 0.9043708831672599, + "learning_rate": 0.003, + "loss": 4.058, + "step": 16793 + }, + { + "epoch": 0.16794, + "grad_norm": 0.875149090923743, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 16794 + }, + { + "epoch": 0.16795, + "grad_norm": 0.8435276843851752, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 16795 + }, + { + "epoch": 0.16796, + "grad_norm": 0.8694933853930026, + "learning_rate": 0.003, + "loss": 4.035, + "step": 16796 + }, + { + "epoch": 0.16797, + "grad_norm": 0.8594162744869177, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 16797 + }, + { + "epoch": 0.16798, + "grad_norm": 0.8335938110380197, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 16798 + }, + { + "epoch": 0.16799, + "grad_norm": 0.8401286558201116, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 16799 + }, + { + "epoch": 0.168, + "grad_norm": 1.0148988626211617, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 16800 + }, + { + "epoch": 0.16801, + "grad_norm": 1.1784713719652615, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 16801 + }, + { + "epoch": 0.16802, + "grad_norm": 0.9581177199857444, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 16802 + }, + { + "epoch": 0.16803, + "grad_norm": 0.938081288603496, + "learning_rate": 0.003, + "loss": 4.106, + "step": 16803 + }, + { + "epoch": 0.16804, + "grad_norm": 0.9632259496080161, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 16804 + }, + { + "epoch": 0.16805, + "grad_norm": 0.934841255896249, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 16805 + }, + { + "epoch": 0.16806, + "grad_norm": 0.901274662548973, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 16806 + }, + { + "epoch": 0.16807, + "grad_norm": 0.9566533578116455, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 16807 + }, + { + "epoch": 0.16808, + "grad_norm": 0.9186767560775385, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 16808 + }, + { + "epoch": 0.16809, + "grad_norm": 0.9062104024577505, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 16809 + }, + { + "epoch": 0.1681, + "grad_norm": 0.9360000010205733, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 16810 + }, + { + "epoch": 0.16811, + "grad_norm": 1.130700436145991, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 16811 + }, + { + "epoch": 0.16812, + "grad_norm": 1.0286298599655, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 16812 + }, + { + "epoch": 0.16813, + "grad_norm": 1.0227472417847407, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 16813 + }, + { + "epoch": 0.16814, + "grad_norm": 0.9291237381618836, + "learning_rate": 0.003, + "loss": 4.05, + "step": 16814 + }, + { + "epoch": 0.16815, + "grad_norm": 0.7853585844044454, + "learning_rate": 0.003, + "loss": 4.111, + "step": 16815 + }, + { + "epoch": 0.16816, + "grad_norm": 0.7630891220068708, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 16816 + }, + { + "epoch": 0.16817, + "grad_norm": 0.8176219127342664, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 16817 + }, + { + "epoch": 0.16818, + "grad_norm": 0.8442687593616304, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16818 + }, + { + "epoch": 0.16819, + "grad_norm": 0.7733673292555855, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 16819 + }, + { + "epoch": 0.1682, + "grad_norm": 0.7667584541217659, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 16820 + }, + { + "epoch": 0.16821, + "grad_norm": 0.7720845035105838, + "learning_rate": 0.003, + "loss": 4.034, + "step": 16821 + }, + { + "epoch": 0.16822, + "grad_norm": 0.7949675295815203, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 16822 + }, + { + "epoch": 0.16823, + "grad_norm": 0.845912537748116, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 16823 + }, + { + "epoch": 0.16824, + "grad_norm": 0.9190522756506045, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 16824 + }, + { + "epoch": 0.16825, + "grad_norm": 1.0767841771935078, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 16825 + }, + { + "epoch": 0.16826, + "grad_norm": 0.960922952409664, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 16826 + }, + { + "epoch": 0.16827, + "grad_norm": 0.812503792352589, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 16827 + }, + { + "epoch": 0.16828, + "grad_norm": 0.7715450756967425, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 16828 + }, + { + "epoch": 0.16829, + "grad_norm": 0.7138493846893461, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 16829 + }, + { + "epoch": 0.1683, + "grad_norm": 0.7559171902088815, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 16830 + }, + { + "epoch": 0.16831, + "grad_norm": 0.8454544722767993, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 16831 + }, + { + "epoch": 0.16832, + "grad_norm": 0.947185485561565, + "learning_rate": 0.003, + "loss": 4.059, + "step": 16832 + }, + { + "epoch": 0.16833, + "grad_norm": 1.177057153893723, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 16833 + }, + { + "epoch": 0.16834, + "grad_norm": 0.8277736530030745, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 16834 + }, + { + "epoch": 0.16835, + "grad_norm": 0.8403656624243396, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 16835 + }, + { + "epoch": 0.16836, + "grad_norm": 0.9701520335255543, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 16836 + }, + { + "epoch": 0.16837, + "grad_norm": 0.9429902242346855, + "learning_rate": 0.003, + "loss": 4.063, + "step": 16837 + }, + { + "epoch": 0.16838, + "grad_norm": 0.8886462276062898, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 16838 + }, + { + "epoch": 0.16839, + "grad_norm": 0.8376289744620194, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 16839 + }, + { + "epoch": 0.1684, + "grad_norm": 0.7963345186826152, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 16840 + }, + { + "epoch": 0.16841, + "grad_norm": 0.8549701938022071, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 16841 + }, + { + "epoch": 0.16842, + "grad_norm": 0.9960237379242167, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 16842 + }, + { + "epoch": 0.16843, + "grad_norm": 1.193643044924253, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 16843 + }, + { + "epoch": 0.16844, + "grad_norm": 0.9164143389429955, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 16844 + }, + { + "epoch": 0.16845, + "grad_norm": 0.8538321551106992, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 16845 + }, + { + "epoch": 0.16846, + "grad_norm": 0.9043740316393933, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 16846 + }, + { + "epoch": 0.16847, + "grad_norm": 0.9090509100819298, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 16847 + }, + { + "epoch": 0.16848, + "grad_norm": 0.9047829484022039, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 16848 + }, + { + "epoch": 0.16849, + "grad_norm": 0.9713622214580283, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 16849 + }, + { + "epoch": 0.1685, + "grad_norm": 1.039021503963327, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 16850 + }, + { + "epoch": 0.16851, + "grad_norm": 1.0507703816270693, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 16851 + }, + { + "epoch": 0.16852, + "grad_norm": 0.8394518194000056, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 16852 + }, + { + "epoch": 0.16853, + "grad_norm": 1.0097475429211138, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 16853 + }, + { + "epoch": 0.16854, + "grad_norm": 1.1792438245766939, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 16854 + }, + { + "epoch": 0.16855, + "grad_norm": 0.8313906406345619, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 16855 + }, + { + "epoch": 0.16856, + "grad_norm": 0.7988631433459336, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 16856 + }, + { + "epoch": 0.16857, + "grad_norm": 0.9343265695967697, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 16857 + }, + { + "epoch": 0.16858, + "grad_norm": 1.0242695154794679, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 16858 + }, + { + "epoch": 0.16859, + "grad_norm": 0.9642330942979943, + "learning_rate": 0.003, + "loss": 4.064, + "step": 16859 + }, + { + "epoch": 0.1686, + "grad_norm": 0.8937438287868996, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 16860 + }, + { + "epoch": 0.16861, + "grad_norm": 0.752504664058674, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16861 + }, + { + "epoch": 0.16862, + "grad_norm": 0.6544359707823012, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 16862 + }, + { + "epoch": 0.16863, + "grad_norm": 0.5945977205078491, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 16863 + }, + { + "epoch": 0.16864, + "grad_norm": 0.6396058325438535, + "learning_rate": 0.003, + "loss": 4.059, + "step": 16864 + }, + { + "epoch": 0.16865, + "grad_norm": 0.7140251684892681, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 16865 + }, + { + "epoch": 0.16866, + "grad_norm": 0.9480995066162021, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 16866 + }, + { + "epoch": 0.16867, + "grad_norm": 1.128713407564246, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 16867 + }, + { + "epoch": 0.16868, + "grad_norm": 1.0667592928487957, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 16868 + }, + { + "epoch": 0.16869, + "grad_norm": 0.8599187547357423, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 16869 + }, + { + "epoch": 0.1687, + "grad_norm": 0.720566056469061, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 16870 + }, + { + "epoch": 0.16871, + "grad_norm": 0.7918365376640389, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16871 + }, + { + "epoch": 0.16872, + "grad_norm": 0.8167634892644354, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 16872 + }, + { + "epoch": 0.16873, + "grad_norm": 0.7492730495927574, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 16873 + }, + { + "epoch": 0.16874, + "grad_norm": 0.8433626659332906, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 16874 + }, + { + "epoch": 0.16875, + "grad_norm": 1.1800267673653257, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 16875 + }, + { + "epoch": 0.16876, + "grad_norm": 0.7818492223452557, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 16876 + }, + { + "epoch": 0.16877, + "grad_norm": 0.6623675644844721, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 16877 + }, + { + "epoch": 0.16878, + "grad_norm": 0.7037544274431712, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 16878 + }, + { + "epoch": 0.16879, + "grad_norm": 0.6733415151107367, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 16879 + }, + { + "epoch": 0.1688, + "grad_norm": 0.6994694575696391, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 16880 + }, + { + "epoch": 0.16881, + "grad_norm": 0.6649583705484471, + "learning_rate": 0.003, + "loss": 4.045, + "step": 16881 + }, + { + "epoch": 0.16882, + "grad_norm": 0.6816005272499164, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 16882 + }, + { + "epoch": 0.16883, + "grad_norm": 0.7513439213100401, + "learning_rate": 0.003, + "loss": 4.07, + "step": 16883 + }, + { + "epoch": 0.16884, + "grad_norm": 0.7662707567485666, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 16884 + }, + { + "epoch": 0.16885, + "grad_norm": 0.8737869709380984, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 16885 + }, + { + "epoch": 0.16886, + "grad_norm": 1.0491863553098975, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 16886 + }, + { + "epoch": 0.16887, + "grad_norm": 0.9325895461294644, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 16887 + }, + { + "epoch": 0.16888, + "grad_norm": 0.9386174844283485, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 16888 + }, + { + "epoch": 0.16889, + "grad_norm": 0.9615205005299152, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 16889 + }, + { + "epoch": 0.1689, + "grad_norm": 0.824528194402005, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 16890 + }, + { + "epoch": 0.16891, + "grad_norm": 0.8956823000248313, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 16891 + }, + { + "epoch": 0.16892, + "grad_norm": 1.0088333248260806, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 16892 + }, + { + "epoch": 0.16893, + "grad_norm": 1.0957559904151173, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 16893 + }, + { + "epoch": 0.16894, + "grad_norm": 0.9732333595934364, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 16894 + }, + { + "epoch": 0.16895, + "grad_norm": 1.0542448399915376, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 16895 + }, + { + "epoch": 0.16896, + "grad_norm": 1.0619033548809447, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 16896 + }, + { + "epoch": 0.16897, + "grad_norm": 1.0510395230679006, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 16897 + }, + { + "epoch": 0.16898, + "grad_norm": 1.0410623036910303, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 16898 + }, + { + "epoch": 0.16899, + "grad_norm": 0.9883500016051606, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 16899 + }, + { + "epoch": 0.169, + "grad_norm": 1.0462935391497992, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 16900 + }, + { + "epoch": 0.16901, + "grad_norm": 0.9338345911769358, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 16901 + }, + { + "epoch": 0.16902, + "grad_norm": 0.9126746141590983, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 16902 + }, + { + "epoch": 0.16903, + "grad_norm": 1.0236213603262032, + "learning_rate": 0.003, + "loss": 4.067, + "step": 16903 + }, + { + "epoch": 0.16904, + "grad_norm": 1.0459999430308622, + "learning_rate": 0.003, + "loss": 4.1192, + "step": 16904 + }, + { + "epoch": 0.16905, + "grad_norm": 0.8938821231306587, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 16905 + }, + { + "epoch": 0.16906, + "grad_norm": 0.8681891094987683, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16906 + }, + { + "epoch": 0.16907, + "grad_norm": 0.9247683188024998, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16907 + }, + { + "epoch": 0.16908, + "grad_norm": 0.8913802332168116, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 16908 + }, + { + "epoch": 0.16909, + "grad_norm": 0.9217980419563984, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 16909 + }, + { + "epoch": 0.1691, + "grad_norm": 0.995531226179752, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 16910 + }, + { + "epoch": 0.16911, + "grad_norm": 1.0025440270873596, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 16911 + }, + { + "epoch": 0.16912, + "grad_norm": 1.0363056443442558, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 16912 + }, + { + "epoch": 0.16913, + "grad_norm": 0.9892047324839762, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 16913 + }, + { + "epoch": 0.16914, + "grad_norm": 1.0864104874845777, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 16914 + }, + { + "epoch": 0.16915, + "grad_norm": 0.843171475146014, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16915 + }, + { + "epoch": 0.16916, + "grad_norm": 0.8561440496616702, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 16916 + }, + { + "epoch": 0.16917, + "grad_norm": 0.8163104147218143, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 16917 + }, + { + "epoch": 0.16918, + "grad_norm": 0.7246679817960276, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 16918 + }, + { + "epoch": 0.16919, + "grad_norm": 0.6402709068614789, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 16919 + }, + { + "epoch": 0.1692, + "grad_norm": 0.6744517276472858, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 16920 + }, + { + "epoch": 0.16921, + "grad_norm": 0.6548234665914778, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 16921 + }, + { + "epoch": 0.16922, + "grad_norm": 0.8375466753682494, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 16922 + }, + { + "epoch": 0.16923, + "grad_norm": 1.1733751383402757, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 16923 + }, + { + "epoch": 0.16924, + "grad_norm": 1.0683641233583197, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 16924 + }, + { + "epoch": 0.16925, + "grad_norm": 0.8290224063060054, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 16925 + }, + { + "epoch": 0.16926, + "grad_norm": 0.8198511691450935, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 16926 + }, + { + "epoch": 0.16927, + "grad_norm": 0.8725762743683417, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 16927 + }, + { + "epoch": 0.16928, + "grad_norm": 0.8838825462665559, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 16928 + }, + { + "epoch": 0.16929, + "grad_norm": 0.9107745212778506, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 16929 + }, + { + "epoch": 0.1693, + "grad_norm": 1.0482183640735203, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 16930 + }, + { + "epoch": 0.16931, + "grad_norm": 1.1633738422282487, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 16931 + }, + { + "epoch": 0.16932, + "grad_norm": 0.8834701888199146, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16932 + }, + { + "epoch": 0.16933, + "grad_norm": 0.7874448060794419, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 16933 + }, + { + "epoch": 0.16934, + "grad_norm": 0.752700718602391, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 16934 + }, + { + "epoch": 0.16935, + "grad_norm": 0.7261182902276447, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 16935 + }, + { + "epoch": 0.16936, + "grad_norm": 0.5815443494154847, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 16936 + }, + { + "epoch": 0.16937, + "grad_norm": 0.5413130876992222, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 16937 + }, + { + "epoch": 0.16938, + "grad_norm": 0.7148824713935833, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 16938 + }, + { + "epoch": 0.16939, + "grad_norm": 0.8113751917587134, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 16939 + }, + { + "epoch": 0.1694, + "grad_norm": 0.8331120584804382, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 16940 + }, + { + "epoch": 0.16941, + "grad_norm": 0.8783155396073156, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 16941 + }, + { + "epoch": 0.16942, + "grad_norm": 1.0229106091113127, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 16942 + }, + { + "epoch": 0.16943, + "grad_norm": 0.9913001669954191, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 16943 + }, + { + "epoch": 0.16944, + "grad_norm": 0.8444409482864496, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 16944 + }, + { + "epoch": 0.16945, + "grad_norm": 0.8183763055959205, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 16945 + }, + { + "epoch": 0.16946, + "grad_norm": 0.9617361905409932, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 16946 + }, + { + "epoch": 0.16947, + "grad_norm": 1.186039695712203, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 16947 + }, + { + "epoch": 0.16948, + "grad_norm": 0.9797662129440634, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 16948 + }, + { + "epoch": 0.16949, + "grad_norm": 0.8866228193879615, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 16949 + }, + { + "epoch": 0.1695, + "grad_norm": 0.9455632193764086, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 16950 + }, + { + "epoch": 0.16951, + "grad_norm": 0.9845815656400316, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 16951 + }, + { + "epoch": 0.16952, + "grad_norm": 0.9698630666524807, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 16952 + }, + { + "epoch": 0.16953, + "grad_norm": 0.9263444676583883, + "learning_rate": 0.003, + "loss": 4.07, + "step": 16953 + }, + { + "epoch": 0.16954, + "grad_norm": 0.9877001745289326, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 16954 + }, + { + "epoch": 0.16955, + "grad_norm": 1.0565279828511083, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 16955 + }, + { + "epoch": 0.16956, + "grad_norm": 0.8518040471049105, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 16956 + }, + { + "epoch": 0.16957, + "grad_norm": 0.7375777017857805, + "learning_rate": 0.003, + "loss": 4.077, + "step": 16957 + }, + { + "epoch": 0.16958, + "grad_norm": 0.6535865882538622, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 16958 + }, + { + "epoch": 0.16959, + "grad_norm": 0.6095598143028842, + "learning_rate": 0.003, + "loss": 4.04, + "step": 16959 + }, + { + "epoch": 0.1696, + "grad_norm": 0.6236536986198375, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 16960 + }, + { + "epoch": 0.16961, + "grad_norm": 0.6409798092580766, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 16961 + }, + { + "epoch": 0.16962, + "grad_norm": 0.8309216838697241, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 16962 + }, + { + "epoch": 0.16963, + "grad_norm": 0.9791274783309264, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 16963 + }, + { + "epoch": 0.16964, + "grad_norm": 0.9180726458211976, + "learning_rate": 0.003, + "loss": 4.09, + "step": 16964 + }, + { + "epoch": 0.16965, + "grad_norm": 0.80150772996483, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 16965 + }, + { + "epoch": 0.16966, + "grad_norm": 0.7810169671486233, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 16966 + }, + { + "epoch": 0.16967, + "grad_norm": 0.835625859098452, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 16967 + }, + { + "epoch": 0.16968, + "grad_norm": 0.7935090509567666, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 16968 + }, + { + "epoch": 0.16969, + "grad_norm": 0.6853971941848793, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 16969 + }, + { + "epoch": 0.1697, + "grad_norm": 0.686601229050851, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 16970 + }, + { + "epoch": 0.16971, + "grad_norm": 0.6647273955786298, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 16971 + }, + { + "epoch": 0.16972, + "grad_norm": 0.8557843940767302, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 16972 + }, + { + "epoch": 0.16973, + "grad_norm": 1.3058272509003233, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 16973 + }, + { + "epoch": 0.16974, + "grad_norm": 1.1413326711907645, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 16974 + }, + { + "epoch": 0.16975, + "grad_norm": 0.8381572260259262, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 16975 + }, + { + "epoch": 0.16976, + "grad_norm": 0.7782455868370698, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 16976 + }, + { + "epoch": 0.16977, + "grad_norm": 0.9132732569395081, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 16977 + }, + { + "epoch": 0.16978, + "grad_norm": 1.0169807904158978, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 16978 + }, + { + "epoch": 0.16979, + "grad_norm": 0.7758826977474127, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 16979 + }, + { + "epoch": 0.1698, + "grad_norm": 0.7800625323486636, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 16980 + }, + { + "epoch": 0.16981, + "grad_norm": 0.9805905272868272, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 16981 + }, + { + "epoch": 0.16982, + "grad_norm": 1.20335505975646, + "learning_rate": 0.003, + "loss": 4.0959, + "step": 16982 + }, + { + "epoch": 0.16983, + "grad_norm": 0.8596746358564572, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 16983 + }, + { + "epoch": 0.16984, + "grad_norm": 0.8592945046367677, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 16984 + }, + { + "epoch": 0.16985, + "grad_norm": 0.7542904586121594, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 16985 + }, + { + "epoch": 0.16986, + "grad_norm": 0.7917575943374668, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 16986 + }, + { + "epoch": 0.16987, + "grad_norm": 0.9634107763726459, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 16987 + }, + { + "epoch": 0.16988, + "grad_norm": 1.2190631152217408, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 16988 + }, + { + "epoch": 0.16989, + "grad_norm": 1.08970278598891, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 16989 + }, + { + "epoch": 0.1699, + "grad_norm": 0.9868812206834904, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 16990 + }, + { + "epoch": 0.16991, + "grad_norm": 0.9841322262971984, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 16991 + }, + { + "epoch": 0.16992, + "grad_norm": 1.0484360604769196, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 16992 + }, + { + "epoch": 0.16993, + "grad_norm": 0.9260306894912795, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 16993 + }, + { + "epoch": 0.16994, + "grad_norm": 0.9066405327558062, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 16994 + }, + { + "epoch": 0.16995, + "grad_norm": 1.0348207431462089, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 16995 + }, + { + "epoch": 0.16996, + "grad_norm": 0.9191441797665568, + "learning_rate": 0.003, + "loss": 4.054, + "step": 16996 + }, + { + "epoch": 0.16997, + "grad_norm": 0.730506594938041, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 16997 + }, + { + "epoch": 0.16998, + "grad_norm": 0.639203737255759, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 16998 + }, + { + "epoch": 0.16999, + "grad_norm": 0.6657379714702873, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 16999 + }, + { + "epoch": 0.17, + "grad_norm": 0.775351235315513, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 17000 + }, + { + "epoch": 0.17001, + "grad_norm": 0.7991960468995907, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 17001 + }, + { + "epoch": 0.17002, + "grad_norm": 0.87198689526849, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 17002 + }, + { + "epoch": 0.17003, + "grad_norm": 0.9285139097854309, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 17003 + }, + { + "epoch": 0.17004, + "grad_norm": 0.7932451824493771, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 17004 + }, + { + "epoch": 0.17005, + "grad_norm": 0.7035403585610972, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 17005 + }, + { + "epoch": 0.17006, + "grad_norm": 0.740581962118741, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 17006 + }, + { + "epoch": 0.17007, + "grad_norm": 0.8592684849435771, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 17007 + }, + { + "epoch": 0.17008, + "grad_norm": 1.108864136209052, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 17008 + }, + { + "epoch": 0.17009, + "grad_norm": 1.0887487399263103, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 17009 + }, + { + "epoch": 0.1701, + "grad_norm": 0.902741685828884, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 17010 + }, + { + "epoch": 0.17011, + "grad_norm": 0.9621009107416355, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 17011 + }, + { + "epoch": 0.17012, + "grad_norm": 0.9067105355899577, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 17012 + }, + { + "epoch": 0.17013, + "grad_norm": 0.8505921216731109, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17013 + }, + { + "epoch": 0.17014, + "grad_norm": 0.8078186108669917, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 17014 + }, + { + "epoch": 0.17015, + "grad_norm": 0.8912264605475723, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 17015 + }, + { + "epoch": 0.17016, + "grad_norm": 0.8849887110904964, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 17016 + }, + { + "epoch": 0.17017, + "grad_norm": 0.8490462547011455, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 17017 + }, + { + "epoch": 0.17018, + "grad_norm": 0.8282127625312425, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 17018 + }, + { + "epoch": 0.17019, + "grad_norm": 0.7833936367868143, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 17019 + }, + { + "epoch": 0.1702, + "grad_norm": 0.7466115154262697, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 17020 + }, + { + "epoch": 0.17021, + "grad_norm": 0.7434084377161972, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 17021 + }, + { + "epoch": 0.17022, + "grad_norm": 0.779901566712444, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 17022 + }, + { + "epoch": 0.17023, + "grad_norm": 0.8029196783649748, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 17023 + }, + { + "epoch": 0.17024, + "grad_norm": 0.8266168430994425, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 17024 + }, + { + "epoch": 0.17025, + "grad_norm": 0.8070319728002024, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 17025 + }, + { + "epoch": 0.17026, + "grad_norm": 0.7384919804956861, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 17026 + }, + { + "epoch": 0.17027, + "grad_norm": 0.9538535182299456, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17027 + }, + { + "epoch": 0.17028, + "grad_norm": 1.2190767863757057, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 17028 + }, + { + "epoch": 0.17029, + "grad_norm": 0.9038072468944277, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 17029 + }, + { + "epoch": 0.1703, + "grad_norm": 0.8002975089514168, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 17030 + }, + { + "epoch": 0.17031, + "grad_norm": 0.8229617505492536, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 17031 + }, + { + "epoch": 0.17032, + "grad_norm": 0.8853914211914459, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 17032 + }, + { + "epoch": 0.17033, + "grad_norm": 1.2535034786768737, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17033 + }, + { + "epoch": 0.17034, + "grad_norm": 1.1634495331454608, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 17034 + }, + { + "epoch": 0.17035, + "grad_norm": 0.9770624289561806, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 17035 + }, + { + "epoch": 0.17036, + "grad_norm": 0.9636810900500455, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 17036 + }, + { + "epoch": 0.17037, + "grad_norm": 0.9677334965571742, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 17037 + }, + { + "epoch": 0.17038, + "grad_norm": 1.0137456362567268, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 17038 + }, + { + "epoch": 0.17039, + "grad_norm": 0.9311855857985665, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 17039 + }, + { + "epoch": 0.1704, + "grad_norm": 0.8705171238203692, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 17040 + }, + { + "epoch": 0.17041, + "grad_norm": 0.9120460747983312, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 17041 + }, + { + "epoch": 0.17042, + "grad_norm": 0.9000611445305694, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 17042 + }, + { + "epoch": 0.17043, + "grad_norm": 0.9149092235683032, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17043 + }, + { + "epoch": 0.17044, + "grad_norm": 0.9581749747685351, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 17044 + }, + { + "epoch": 0.17045, + "grad_norm": 1.110605387279622, + "learning_rate": 0.003, + "loss": 4.088, + "step": 17045 + }, + { + "epoch": 0.17046, + "grad_norm": 1.0004888932585712, + "learning_rate": 0.003, + "loss": 4.0884, + "step": 17046 + }, + { + "epoch": 0.17047, + "grad_norm": 1.1423249502789607, + "learning_rate": 0.003, + "loss": 4.087, + "step": 17047 + }, + { + "epoch": 0.17048, + "grad_norm": 1.0049720562907372, + "learning_rate": 0.003, + "loss": 4.101, + "step": 17048 + }, + { + "epoch": 0.17049, + "grad_norm": 0.9518434773031919, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 17049 + }, + { + "epoch": 0.1705, + "grad_norm": 0.8336815905772503, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 17050 + }, + { + "epoch": 0.17051, + "grad_norm": 0.8464438414693694, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 17051 + }, + { + "epoch": 0.17052, + "grad_norm": 0.7407423122925679, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 17052 + }, + { + "epoch": 0.17053, + "grad_norm": 0.693597921646556, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 17053 + }, + { + "epoch": 0.17054, + "grad_norm": 0.7224347609205675, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 17054 + }, + { + "epoch": 0.17055, + "grad_norm": 0.727109434885961, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 17055 + }, + { + "epoch": 0.17056, + "grad_norm": 0.7085929468906867, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 17056 + }, + { + "epoch": 0.17057, + "grad_norm": 0.8981748123460447, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 17057 + }, + { + "epoch": 0.17058, + "grad_norm": 2.2373522381619755, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 17058 + }, + { + "epoch": 0.17059, + "grad_norm": 0.9093744697365344, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 17059 + }, + { + "epoch": 0.1706, + "grad_norm": 1.1277623734429008, + "learning_rate": 0.003, + "loss": 4.1043, + "step": 17060 + }, + { + "epoch": 0.17061, + "grad_norm": 1.5430590966089344, + "learning_rate": 0.003, + "loss": 4.1294, + "step": 17061 + }, + { + "epoch": 0.17062, + "grad_norm": 2.038189289617899, + "learning_rate": 0.003, + "loss": 4.1207, + "step": 17062 + }, + { + "epoch": 0.17063, + "grad_norm": 1.2126336199709753, + "learning_rate": 0.003, + "loss": 4.1344, + "step": 17063 + }, + { + "epoch": 0.17064, + "grad_norm": 1.5421381235140816, + "learning_rate": 0.003, + "loss": 4.1432, + "step": 17064 + }, + { + "epoch": 0.17065, + "grad_norm": 1.1488223955809493, + "learning_rate": 0.003, + "loss": 4.1663, + "step": 17065 + }, + { + "epoch": 0.17066, + "grad_norm": 1.207433546749588, + "learning_rate": 0.003, + "loss": 4.1518, + "step": 17066 + }, + { + "epoch": 0.17067, + "grad_norm": 1.0140043474005396, + "learning_rate": 0.003, + "loss": 4.1684, + "step": 17067 + }, + { + "epoch": 0.17068, + "grad_norm": 1.188130617619142, + "learning_rate": 0.003, + "loss": 4.1824, + "step": 17068 + }, + { + "epoch": 0.17069, + "grad_norm": 0.919306923692545, + "learning_rate": 0.003, + "loss": 4.1696, + "step": 17069 + }, + { + "epoch": 0.1707, + "grad_norm": 0.9866630603168417, + "learning_rate": 0.003, + "loss": 4.1475, + "step": 17070 + }, + { + "epoch": 0.17071, + "grad_norm": 1.329511049506572, + "learning_rate": 0.003, + "loss": 4.1381, + "step": 17071 + }, + { + "epoch": 0.17072, + "grad_norm": 0.9106345385856193, + "learning_rate": 0.003, + "loss": 4.1335, + "step": 17072 + }, + { + "epoch": 0.17073, + "grad_norm": 1.1764960321502664, + "learning_rate": 0.003, + "loss": 4.1519, + "step": 17073 + }, + { + "epoch": 0.17074, + "grad_norm": 1.2598383828849962, + "learning_rate": 0.003, + "loss": 4.1498, + "step": 17074 + }, + { + "epoch": 0.17075, + "grad_norm": 1.0509211538495036, + "learning_rate": 0.003, + "loss": 4.1384, + "step": 17075 + }, + { + "epoch": 0.17076, + "grad_norm": 0.9102646245035444, + "learning_rate": 0.003, + "loss": 4.1117, + "step": 17076 + }, + { + "epoch": 0.17077, + "grad_norm": 0.8591655765967405, + "learning_rate": 0.003, + "loss": 4.1403, + "step": 17077 + }, + { + "epoch": 0.17078, + "grad_norm": 0.9587677674346851, + "learning_rate": 0.003, + "loss": 4.1208, + "step": 17078 + }, + { + "epoch": 0.17079, + "grad_norm": 0.9367928599234552, + "learning_rate": 0.003, + "loss": 4.118, + "step": 17079 + }, + { + "epoch": 0.1708, + "grad_norm": 1.0437382963401483, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 17080 + }, + { + "epoch": 0.17081, + "grad_norm": 1.1473030302395222, + "learning_rate": 0.003, + "loss": 4.1126, + "step": 17081 + }, + { + "epoch": 0.17082, + "grad_norm": 0.9106070465700841, + "learning_rate": 0.003, + "loss": 4.1496, + "step": 17082 + }, + { + "epoch": 0.17083, + "grad_norm": 0.8471341271322517, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 17083 + }, + { + "epoch": 0.17084, + "grad_norm": 0.7975919978994044, + "learning_rate": 0.003, + "loss": 4.1122, + "step": 17084 + }, + { + "epoch": 0.17085, + "grad_norm": 0.7186499293677238, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 17085 + }, + { + "epoch": 0.17086, + "grad_norm": 0.6105221851269854, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 17086 + }, + { + "epoch": 0.17087, + "grad_norm": 0.5659929406154532, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 17087 + }, + { + "epoch": 0.17088, + "grad_norm": 0.5785630679244134, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 17088 + }, + { + "epoch": 0.17089, + "grad_norm": 0.5387770406885679, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 17089 + }, + { + "epoch": 0.1709, + "grad_norm": 0.5224281719319868, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 17090 + }, + { + "epoch": 0.17091, + "grad_norm": 0.5466749487337251, + "learning_rate": 0.003, + "loss": 4.1053, + "step": 17091 + }, + { + "epoch": 0.17092, + "grad_norm": 0.5773224442912067, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17092 + }, + { + "epoch": 0.17093, + "grad_norm": 0.6079574601308142, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17093 + }, + { + "epoch": 0.17094, + "grad_norm": 0.7256985184918995, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 17094 + }, + { + "epoch": 0.17095, + "grad_norm": 0.9023129629485823, + "learning_rate": 0.003, + "loss": 4.055, + "step": 17095 + }, + { + "epoch": 0.17096, + "grad_norm": 1.0036109631300578, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 17096 + }, + { + "epoch": 0.17097, + "grad_norm": 0.9691106360794101, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 17097 + }, + { + "epoch": 0.17098, + "grad_norm": 0.8144532410996149, + "learning_rate": 0.003, + "loss": 4.0969, + "step": 17098 + }, + { + "epoch": 0.17099, + "grad_norm": 0.8244833413472513, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 17099 + }, + { + "epoch": 0.171, + "grad_norm": 0.8515963531151998, + "learning_rate": 0.003, + "loss": 4.07, + "step": 17100 + }, + { + "epoch": 0.17101, + "grad_norm": 0.8307615043135952, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 17101 + }, + { + "epoch": 0.17102, + "grad_norm": 0.8218764246903447, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 17102 + }, + { + "epoch": 0.17103, + "grad_norm": 1.0301794365666588, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 17103 + }, + { + "epoch": 0.17104, + "grad_norm": 1.165111959697851, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 17104 + }, + { + "epoch": 0.17105, + "grad_norm": 0.7810556750976614, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17105 + }, + { + "epoch": 0.17106, + "grad_norm": 0.6791549396745541, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 17106 + }, + { + "epoch": 0.17107, + "grad_norm": 0.6623671796945579, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 17107 + }, + { + "epoch": 0.17108, + "grad_norm": 0.626084325194252, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17108 + }, + { + "epoch": 0.17109, + "grad_norm": 0.7191946119738822, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17109 + }, + { + "epoch": 0.1711, + "grad_norm": 0.7556421999131269, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 17110 + }, + { + "epoch": 0.17111, + "grad_norm": 0.892806033351293, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 17111 + }, + { + "epoch": 0.17112, + "grad_norm": 0.9300008384526975, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17112 + }, + { + "epoch": 0.17113, + "grad_norm": 0.7855435882492539, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17113 + }, + { + "epoch": 0.17114, + "grad_norm": 0.5585246965743592, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 17114 + }, + { + "epoch": 0.17115, + "grad_norm": 0.5980668243524988, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 17115 + }, + { + "epoch": 0.17116, + "grad_norm": 0.7086760225422278, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17116 + }, + { + "epoch": 0.17117, + "grad_norm": 0.729464383261198, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 17117 + }, + { + "epoch": 0.17118, + "grad_norm": 0.7507987290637065, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 17118 + }, + { + "epoch": 0.17119, + "grad_norm": 0.7513906769526895, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17119 + }, + { + "epoch": 0.1712, + "grad_norm": 0.867350690912068, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 17120 + }, + { + "epoch": 0.17121, + "grad_norm": 0.9372292709027423, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 17121 + }, + { + "epoch": 0.17122, + "grad_norm": 1.0368161169014245, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 17122 + }, + { + "epoch": 0.17123, + "grad_norm": 0.9395479308947648, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 17123 + }, + { + "epoch": 0.17124, + "grad_norm": 0.7663719934384913, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 17124 + }, + { + "epoch": 0.17125, + "grad_norm": 0.8749188981054825, + "learning_rate": 0.003, + "loss": 4.097, + "step": 17125 + }, + { + "epoch": 0.17126, + "grad_norm": 0.8560411118521094, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 17126 + }, + { + "epoch": 0.17127, + "grad_norm": 0.7120271980266843, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17127 + }, + { + "epoch": 0.17128, + "grad_norm": 0.6213948093478162, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 17128 + }, + { + "epoch": 0.17129, + "grad_norm": 0.66528896986041, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 17129 + }, + { + "epoch": 0.1713, + "grad_norm": 0.6503010118308575, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 17130 + }, + { + "epoch": 0.17131, + "grad_norm": 0.7130942176247402, + "learning_rate": 0.003, + "loss": 4.046, + "step": 17131 + }, + { + "epoch": 0.17132, + "grad_norm": 0.7998220976445287, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17132 + }, + { + "epoch": 0.17133, + "grad_norm": 1.1540708912237823, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 17133 + }, + { + "epoch": 0.17134, + "grad_norm": 1.1640268003139134, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 17134 + }, + { + "epoch": 0.17135, + "grad_norm": 0.7680178275501996, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 17135 + }, + { + "epoch": 0.17136, + "grad_norm": 0.7458487994792794, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17136 + }, + { + "epoch": 0.17137, + "grad_norm": 0.743231006541354, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17137 + }, + { + "epoch": 0.17138, + "grad_norm": 0.8241515608769896, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 17138 + }, + { + "epoch": 0.17139, + "grad_norm": 0.8841436926148614, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 17139 + }, + { + "epoch": 0.1714, + "grad_norm": 0.926111263878185, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 17140 + }, + { + "epoch": 0.17141, + "grad_norm": 1.003759581953196, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 17141 + }, + { + "epoch": 0.17142, + "grad_norm": 1.0681413291701944, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 17142 + }, + { + "epoch": 0.17143, + "grad_norm": 0.8920035370304565, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 17143 + }, + { + "epoch": 0.17144, + "grad_norm": 0.8063588549762267, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 17144 + }, + { + "epoch": 0.17145, + "grad_norm": 0.8201650322026025, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 17145 + }, + { + "epoch": 0.17146, + "grad_norm": 0.8139812085778229, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17146 + }, + { + "epoch": 0.17147, + "grad_norm": 0.7137289999997846, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 17147 + }, + { + "epoch": 0.17148, + "grad_norm": 0.6362790283551898, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 17148 + }, + { + "epoch": 0.17149, + "grad_norm": 0.7363548722007726, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 17149 + }, + { + "epoch": 0.1715, + "grad_norm": 1.0749319646285522, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 17150 + }, + { + "epoch": 0.17151, + "grad_norm": 1.1873761681442723, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17151 + }, + { + "epoch": 0.17152, + "grad_norm": 0.6517262037252678, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 17152 + }, + { + "epoch": 0.17153, + "grad_norm": 0.6456980554673047, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 17153 + }, + { + "epoch": 0.17154, + "grad_norm": 0.6401664283315516, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 17154 + }, + { + "epoch": 0.17155, + "grad_norm": 0.6479454585183663, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 17155 + }, + { + "epoch": 0.17156, + "grad_norm": 0.7063221325480135, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 17156 + }, + { + "epoch": 0.17157, + "grad_norm": 0.7449691162562174, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 17157 + }, + { + "epoch": 0.17158, + "grad_norm": 0.7688052210946774, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 17158 + }, + { + "epoch": 0.17159, + "grad_norm": 0.8663240449410309, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 17159 + }, + { + "epoch": 0.1716, + "grad_norm": 0.9350470754519611, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 17160 + }, + { + "epoch": 0.17161, + "grad_norm": 1.105918303174733, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 17161 + }, + { + "epoch": 0.17162, + "grad_norm": 0.9582373552828001, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 17162 + }, + { + "epoch": 0.17163, + "grad_norm": 0.8519330079536576, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 17163 + }, + { + "epoch": 0.17164, + "grad_norm": 0.7843883129538464, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 17164 + }, + { + "epoch": 0.17165, + "grad_norm": 0.8759360611959321, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 17165 + }, + { + "epoch": 0.17166, + "grad_norm": 0.9449295350433374, + "learning_rate": 0.003, + "loss": 4.073, + "step": 17166 + }, + { + "epoch": 0.17167, + "grad_norm": 0.9685718447274639, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 17167 + }, + { + "epoch": 0.17168, + "grad_norm": 0.8829189093930374, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 17168 + }, + { + "epoch": 0.17169, + "grad_norm": 1.0172291499134838, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 17169 + }, + { + "epoch": 0.1717, + "grad_norm": 1.031474876754636, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 17170 + }, + { + "epoch": 0.17171, + "grad_norm": 1.0627549761853843, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 17171 + }, + { + "epoch": 0.17172, + "grad_norm": 1.192244620493509, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 17172 + }, + { + "epoch": 0.17173, + "grad_norm": 0.9696467606689188, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 17173 + }, + { + "epoch": 0.17174, + "grad_norm": 1.0099346721484064, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 17174 + }, + { + "epoch": 0.17175, + "grad_norm": 0.9430570332335579, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17175 + }, + { + "epoch": 0.17176, + "grad_norm": 0.7416781895159501, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 17176 + }, + { + "epoch": 0.17177, + "grad_norm": 0.5362280596041956, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 17177 + }, + { + "epoch": 0.17178, + "grad_norm": 0.5213637027578487, + "learning_rate": 0.003, + "loss": 4.064, + "step": 17178 + }, + { + "epoch": 0.17179, + "grad_norm": 0.5910661842224878, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 17179 + }, + { + "epoch": 0.1718, + "grad_norm": 0.7203841849058878, + "learning_rate": 0.003, + "loss": 4.076, + "step": 17180 + }, + { + "epoch": 0.17181, + "grad_norm": 0.9174065527544728, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 17181 + }, + { + "epoch": 0.17182, + "grad_norm": 1.1515831223067883, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 17182 + }, + { + "epoch": 0.17183, + "grad_norm": 0.7875882064641271, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 17183 + }, + { + "epoch": 0.17184, + "grad_norm": 0.6113362737273356, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 17184 + }, + { + "epoch": 0.17185, + "grad_norm": 0.6769858413569317, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 17185 + }, + { + "epoch": 0.17186, + "grad_norm": 0.7857685015948951, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 17186 + }, + { + "epoch": 0.17187, + "grad_norm": 0.8887083327904423, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 17187 + }, + { + "epoch": 0.17188, + "grad_norm": 0.9355285903796494, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 17188 + }, + { + "epoch": 0.17189, + "grad_norm": 0.9699610435380733, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 17189 + }, + { + "epoch": 0.1719, + "grad_norm": 0.968892520735324, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 17190 + }, + { + "epoch": 0.17191, + "grad_norm": 0.9278366759031452, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 17191 + }, + { + "epoch": 0.17192, + "grad_norm": 1.0093456528697768, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 17192 + }, + { + "epoch": 0.17193, + "grad_norm": 0.962765750270637, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 17193 + }, + { + "epoch": 0.17194, + "grad_norm": 1.015938938634198, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 17194 + }, + { + "epoch": 0.17195, + "grad_norm": 1.0202078310085068, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 17195 + }, + { + "epoch": 0.17196, + "grad_norm": 0.7320971398547693, + "learning_rate": 0.003, + "loss": 4.037, + "step": 17196 + }, + { + "epoch": 0.17197, + "grad_norm": 0.6858718177060639, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 17197 + }, + { + "epoch": 0.17198, + "grad_norm": 0.8182467796436648, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 17198 + }, + { + "epoch": 0.17199, + "grad_norm": 0.8029616233649419, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 17199 + }, + { + "epoch": 0.172, + "grad_norm": 0.8529230097017862, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 17200 + }, + { + "epoch": 0.17201, + "grad_norm": 0.8384852167472535, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 17201 + }, + { + "epoch": 0.17202, + "grad_norm": 0.8417210218463609, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 17202 + }, + { + "epoch": 0.17203, + "grad_norm": 1.0840588938958944, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 17203 + }, + { + "epoch": 0.17204, + "grad_norm": 1.062999691063719, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 17204 + }, + { + "epoch": 0.17205, + "grad_norm": 0.8714578232781969, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17205 + }, + { + "epoch": 0.17206, + "grad_norm": 0.8553273735258046, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 17206 + }, + { + "epoch": 0.17207, + "grad_norm": 0.8002136610055574, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 17207 + }, + { + "epoch": 0.17208, + "grad_norm": 0.7176815025210549, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 17208 + }, + { + "epoch": 0.17209, + "grad_norm": 0.6207109948968895, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 17209 + }, + { + "epoch": 0.1721, + "grad_norm": 0.6247885292327184, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 17210 + }, + { + "epoch": 0.17211, + "grad_norm": 0.6722672956311609, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 17211 + }, + { + "epoch": 0.17212, + "grad_norm": 0.6949125408976792, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 17212 + }, + { + "epoch": 0.17213, + "grad_norm": 0.8528555681479099, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 17213 + }, + { + "epoch": 0.17214, + "grad_norm": 0.9799468799478441, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17214 + }, + { + "epoch": 0.17215, + "grad_norm": 1.2014577690733408, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 17215 + }, + { + "epoch": 0.17216, + "grad_norm": 1.0459659173443419, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 17216 + }, + { + "epoch": 0.17217, + "grad_norm": 1.0545798776296638, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 17217 + }, + { + "epoch": 0.17218, + "grad_norm": 1.009583047859804, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 17218 + }, + { + "epoch": 0.17219, + "grad_norm": 1.0471807416764718, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 17219 + }, + { + "epoch": 0.1722, + "grad_norm": 0.8585335653507692, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 17220 + }, + { + "epoch": 0.17221, + "grad_norm": 0.848650943483469, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 17221 + }, + { + "epoch": 0.17222, + "grad_norm": 0.9650349686441796, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 17222 + }, + { + "epoch": 0.17223, + "grad_norm": 0.985307049479663, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 17223 + }, + { + "epoch": 0.17224, + "grad_norm": 0.9812775977592947, + "learning_rate": 0.003, + "loss": 4.0976, + "step": 17224 + }, + { + "epoch": 0.17225, + "grad_norm": 0.9237673151249101, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 17225 + }, + { + "epoch": 0.17226, + "grad_norm": 0.8417361738016759, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 17226 + }, + { + "epoch": 0.17227, + "grad_norm": 0.8717303731859316, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17227 + }, + { + "epoch": 0.17228, + "grad_norm": 0.8886103465575612, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17228 + }, + { + "epoch": 0.17229, + "grad_norm": 0.8754470496635101, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 17229 + }, + { + "epoch": 0.1723, + "grad_norm": 0.8245545788155056, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 17230 + }, + { + "epoch": 0.17231, + "grad_norm": 0.7937219643903761, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 17231 + }, + { + "epoch": 0.17232, + "grad_norm": 0.7994651748103063, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 17232 + }, + { + "epoch": 0.17233, + "grad_norm": 0.9901505651795912, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 17233 + }, + { + "epoch": 0.17234, + "grad_norm": 1.1057812119967034, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 17234 + }, + { + "epoch": 0.17235, + "grad_norm": 1.0501265727346045, + "learning_rate": 0.003, + "loss": 4.044, + "step": 17235 + }, + { + "epoch": 0.17236, + "grad_norm": 0.9980172470686348, + "learning_rate": 0.003, + "loss": 4.103, + "step": 17236 + }, + { + "epoch": 0.17237, + "grad_norm": 1.055511422736484, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17237 + }, + { + "epoch": 0.17238, + "grad_norm": 1.0280707586992661, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 17238 + }, + { + "epoch": 0.17239, + "grad_norm": 0.9969897409512702, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 17239 + }, + { + "epoch": 0.1724, + "grad_norm": 0.8827205706856681, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 17240 + }, + { + "epoch": 0.17241, + "grad_norm": 0.8572914366402444, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 17241 + }, + { + "epoch": 0.17242, + "grad_norm": 0.9145285684578551, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 17242 + }, + { + "epoch": 0.17243, + "grad_norm": 0.9361469788165563, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 17243 + }, + { + "epoch": 0.17244, + "grad_norm": 1.0484950899124539, + "learning_rate": 0.003, + "loss": 4.1087, + "step": 17244 + }, + { + "epoch": 0.17245, + "grad_norm": 0.9460009857292923, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 17245 + }, + { + "epoch": 0.17246, + "grad_norm": 0.8691325082971929, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17246 + }, + { + "epoch": 0.17247, + "grad_norm": 0.833335179981703, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 17247 + }, + { + "epoch": 0.17248, + "grad_norm": 0.7813119985619414, + "learning_rate": 0.003, + "loss": 4.083, + "step": 17248 + }, + { + "epoch": 0.17249, + "grad_norm": 0.7580442016495044, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 17249 + }, + { + "epoch": 0.1725, + "grad_norm": 0.8757261798240211, + "learning_rate": 0.003, + "loss": 4.087, + "step": 17250 + }, + { + "epoch": 0.17251, + "grad_norm": 0.8324547581132363, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 17251 + }, + { + "epoch": 0.17252, + "grad_norm": 0.7270042248527319, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 17252 + }, + { + "epoch": 0.17253, + "grad_norm": 0.7199955731550661, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 17253 + }, + { + "epoch": 0.17254, + "grad_norm": 0.7646752198671325, + "learning_rate": 0.003, + "loss": 4.027, + "step": 17254 + }, + { + "epoch": 0.17255, + "grad_norm": 0.8604880952711906, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 17255 + }, + { + "epoch": 0.17256, + "grad_norm": 0.9151988992040045, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 17256 + }, + { + "epoch": 0.17257, + "grad_norm": 0.8548837132435793, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 17257 + }, + { + "epoch": 0.17258, + "grad_norm": 0.6261999558389855, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 17258 + }, + { + "epoch": 0.17259, + "grad_norm": 0.5410557986097894, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 17259 + }, + { + "epoch": 0.1726, + "grad_norm": 0.624676544911726, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 17260 + }, + { + "epoch": 0.17261, + "grad_norm": 0.7859287744538213, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 17261 + }, + { + "epoch": 0.17262, + "grad_norm": 0.8624456349182722, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 17262 + }, + { + "epoch": 0.17263, + "grad_norm": 0.8731087489875774, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 17263 + }, + { + "epoch": 0.17264, + "grad_norm": 0.8091508299507951, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 17264 + }, + { + "epoch": 0.17265, + "grad_norm": 0.7110748362336592, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 17265 + }, + { + "epoch": 0.17266, + "grad_norm": 0.7232852377630583, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 17266 + }, + { + "epoch": 0.17267, + "grad_norm": 0.8192577509209505, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17267 + }, + { + "epoch": 0.17268, + "grad_norm": 0.8469306064141809, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 17268 + }, + { + "epoch": 0.17269, + "grad_norm": 0.8655466936923366, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 17269 + }, + { + "epoch": 0.1727, + "grad_norm": 0.9277575011427177, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 17270 + }, + { + "epoch": 0.17271, + "grad_norm": 1.1779658679488627, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 17271 + }, + { + "epoch": 0.17272, + "grad_norm": 0.9689316268022872, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 17272 + }, + { + "epoch": 0.17273, + "grad_norm": 0.8243911467881477, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17273 + }, + { + "epoch": 0.17274, + "grad_norm": 0.7626622528624412, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 17274 + }, + { + "epoch": 0.17275, + "grad_norm": 0.7991502451912672, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 17275 + }, + { + "epoch": 0.17276, + "grad_norm": 0.7410218460367459, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 17276 + }, + { + "epoch": 0.17277, + "grad_norm": 0.7888996169821576, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 17277 + }, + { + "epoch": 0.17278, + "grad_norm": 0.8333273603853033, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17278 + }, + { + "epoch": 0.17279, + "grad_norm": 0.8617489204975749, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 17279 + }, + { + "epoch": 0.1728, + "grad_norm": 0.9461896957998942, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 17280 + }, + { + "epoch": 0.17281, + "grad_norm": 0.9875240275964974, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 17281 + }, + { + "epoch": 0.17282, + "grad_norm": 1.0286590722274698, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 17282 + }, + { + "epoch": 0.17283, + "grad_norm": 1.0599897332944377, + "learning_rate": 0.003, + "loss": 4.072, + "step": 17283 + }, + { + "epoch": 0.17284, + "grad_norm": 1.0120046528501239, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 17284 + }, + { + "epoch": 0.17285, + "grad_norm": 0.8778888577381524, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 17285 + }, + { + "epoch": 0.17286, + "grad_norm": 0.7973275547969494, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17286 + }, + { + "epoch": 0.17287, + "grad_norm": 0.8452580436387402, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 17287 + }, + { + "epoch": 0.17288, + "grad_norm": 0.8505597531306186, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 17288 + }, + { + "epoch": 0.17289, + "grad_norm": 0.8317333037306122, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17289 + }, + { + "epoch": 0.1729, + "grad_norm": 1.008993386006391, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 17290 + }, + { + "epoch": 0.17291, + "grad_norm": 1.114491086597863, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 17291 + }, + { + "epoch": 0.17292, + "grad_norm": 0.9808995520484136, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 17292 + }, + { + "epoch": 0.17293, + "grad_norm": 1.0501849633682052, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 17293 + }, + { + "epoch": 0.17294, + "grad_norm": 0.9247467345092111, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 17294 + }, + { + "epoch": 0.17295, + "grad_norm": 0.8879352488478393, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 17295 + }, + { + "epoch": 0.17296, + "grad_norm": 0.8544677992278479, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 17296 + }, + { + "epoch": 0.17297, + "grad_norm": 0.9801565895747134, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 17297 + }, + { + "epoch": 0.17298, + "grad_norm": 1.3174293108271267, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 17298 + }, + { + "epoch": 0.17299, + "grad_norm": 0.8574921353754716, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 17299 + }, + { + "epoch": 0.173, + "grad_norm": 0.9528195091626303, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 17300 + }, + { + "epoch": 0.17301, + "grad_norm": 0.8429109507322878, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 17301 + }, + { + "epoch": 0.17302, + "grad_norm": 0.7462237690461228, + "learning_rate": 0.003, + "loss": 4.067, + "step": 17302 + }, + { + "epoch": 0.17303, + "grad_norm": 0.795428100025416, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 17303 + }, + { + "epoch": 0.17304, + "grad_norm": 0.75221237706329, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 17304 + }, + { + "epoch": 0.17305, + "grad_norm": 0.8322306659265789, + "learning_rate": 0.003, + "loss": 4.053, + "step": 17305 + }, + { + "epoch": 0.17306, + "grad_norm": 0.8282803772155906, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 17306 + }, + { + "epoch": 0.17307, + "grad_norm": 0.8147818327391885, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 17307 + }, + { + "epoch": 0.17308, + "grad_norm": 0.6959409443735983, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 17308 + }, + { + "epoch": 0.17309, + "grad_norm": 0.8336328467593661, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 17309 + }, + { + "epoch": 0.1731, + "grad_norm": 1.216790763242895, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 17310 + }, + { + "epoch": 0.17311, + "grad_norm": 1.1392406129899704, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 17311 + }, + { + "epoch": 0.17312, + "grad_norm": 0.7787514008594593, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 17312 + }, + { + "epoch": 0.17313, + "grad_norm": 0.6446576291053752, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 17313 + }, + { + "epoch": 0.17314, + "grad_norm": 0.7132887345769777, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 17314 + }, + { + "epoch": 0.17315, + "grad_norm": 0.6951002272597862, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 17315 + }, + { + "epoch": 0.17316, + "grad_norm": 0.7157765569698302, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 17316 + }, + { + "epoch": 0.17317, + "grad_norm": 0.8041583097445666, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 17317 + }, + { + "epoch": 0.17318, + "grad_norm": 1.0373881978167832, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 17318 + }, + { + "epoch": 0.17319, + "grad_norm": 1.0578100222463147, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 17319 + }, + { + "epoch": 0.1732, + "grad_norm": 0.8543196037379642, + "learning_rate": 0.003, + "loss": 4.047, + "step": 17320 + }, + { + "epoch": 0.17321, + "grad_norm": 0.8089396912332841, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 17321 + }, + { + "epoch": 0.17322, + "grad_norm": 0.7506929583233183, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17322 + }, + { + "epoch": 0.17323, + "grad_norm": 0.7580967782892147, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 17323 + }, + { + "epoch": 0.17324, + "grad_norm": 0.7019955689096509, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 17324 + }, + { + "epoch": 0.17325, + "grad_norm": 0.8004041472192007, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 17325 + }, + { + "epoch": 0.17326, + "grad_norm": 0.906514365447904, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 17326 + }, + { + "epoch": 0.17327, + "grad_norm": 1.0761067587509687, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 17327 + }, + { + "epoch": 0.17328, + "grad_norm": 0.9990357087343317, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 17328 + }, + { + "epoch": 0.17329, + "grad_norm": 0.9859818289448217, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 17329 + }, + { + "epoch": 0.1733, + "grad_norm": 1.0741354235531915, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 17330 + }, + { + "epoch": 0.17331, + "grad_norm": 0.8676919064567706, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 17331 + }, + { + "epoch": 0.17332, + "grad_norm": 0.7908551989976313, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 17332 + }, + { + "epoch": 0.17333, + "grad_norm": 0.7377602197492152, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 17333 + }, + { + "epoch": 0.17334, + "grad_norm": 0.8150042288786687, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 17334 + }, + { + "epoch": 0.17335, + "grad_norm": 0.6685684565585098, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 17335 + }, + { + "epoch": 0.17336, + "grad_norm": 0.6184537779101107, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 17336 + }, + { + "epoch": 0.17337, + "grad_norm": 0.6336402118718607, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17337 + }, + { + "epoch": 0.17338, + "grad_norm": 0.7629785540495987, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 17338 + }, + { + "epoch": 0.17339, + "grad_norm": 0.7654657701343637, + "learning_rate": 0.003, + "loss": 4.075, + "step": 17339 + }, + { + "epoch": 0.1734, + "grad_norm": 0.7814302560778268, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 17340 + }, + { + "epoch": 0.17341, + "grad_norm": 1.0116673944445607, + "learning_rate": 0.003, + "loss": 4.04, + "step": 17341 + }, + { + "epoch": 0.17342, + "grad_norm": 1.3156796022672015, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 17342 + }, + { + "epoch": 0.17343, + "grad_norm": 0.6835139520659282, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 17343 + }, + { + "epoch": 0.17344, + "grad_norm": 0.653366109440773, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 17344 + }, + { + "epoch": 0.17345, + "grad_norm": 0.7199124076803893, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 17345 + }, + { + "epoch": 0.17346, + "grad_norm": 0.7325160021916057, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 17346 + }, + { + "epoch": 0.17347, + "grad_norm": 0.758612119806983, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 17347 + }, + { + "epoch": 0.17348, + "grad_norm": 0.7589921271569143, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 17348 + }, + { + "epoch": 0.17349, + "grad_norm": 0.7258675784146073, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 17349 + }, + { + "epoch": 0.1735, + "grad_norm": 0.7771286353223587, + "learning_rate": 0.003, + "loss": 4.044, + "step": 17350 + }, + { + "epoch": 0.17351, + "grad_norm": 0.7997971082043684, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 17351 + }, + { + "epoch": 0.17352, + "grad_norm": 0.9160256675432547, + "learning_rate": 0.003, + "loss": 4.056, + "step": 17352 + }, + { + "epoch": 0.17353, + "grad_norm": 0.9831985554775784, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 17353 + }, + { + "epoch": 0.17354, + "grad_norm": 0.920199886221615, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17354 + }, + { + "epoch": 0.17355, + "grad_norm": 0.9193153173406821, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 17355 + }, + { + "epoch": 0.17356, + "grad_norm": 0.9952068634206899, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 17356 + }, + { + "epoch": 0.17357, + "grad_norm": 1.0809133928846821, + "learning_rate": 0.003, + "loss": 4.1047, + "step": 17357 + }, + { + "epoch": 0.17358, + "grad_norm": 0.9751393261045358, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 17358 + }, + { + "epoch": 0.17359, + "grad_norm": 1.0974371466481527, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 17359 + }, + { + "epoch": 0.1736, + "grad_norm": 1.2407129187930825, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 17360 + }, + { + "epoch": 0.17361, + "grad_norm": 0.912345854382728, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 17361 + }, + { + "epoch": 0.17362, + "grad_norm": 1.0023088642121027, + "learning_rate": 0.003, + "loss": 4.073, + "step": 17362 + }, + { + "epoch": 0.17363, + "grad_norm": 1.1697749023106154, + "learning_rate": 0.003, + "loss": 4.1015, + "step": 17363 + }, + { + "epoch": 0.17364, + "grad_norm": 0.852100101056053, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 17364 + }, + { + "epoch": 0.17365, + "grad_norm": 0.8037060442021475, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 17365 + }, + { + "epoch": 0.17366, + "grad_norm": 0.9146820611097617, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 17366 + }, + { + "epoch": 0.17367, + "grad_norm": 0.9773714867247653, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 17367 + }, + { + "epoch": 0.17368, + "grad_norm": 0.9700324896984772, + "learning_rate": 0.003, + "loss": 4.043, + "step": 17368 + }, + { + "epoch": 0.17369, + "grad_norm": 0.9282414802865684, + "learning_rate": 0.003, + "loss": 4.071, + "step": 17369 + }, + { + "epoch": 0.1737, + "grad_norm": 0.9687659983616759, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 17370 + }, + { + "epoch": 0.17371, + "grad_norm": 0.9981063118178767, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 17371 + }, + { + "epoch": 0.17372, + "grad_norm": 1.1171285873302719, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 17372 + }, + { + "epoch": 0.17373, + "grad_norm": 1.1013248404196918, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 17373 + }, + { + "epoch": 0.17374, + "grad_norm": 1.046819789427008, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 17374 + }, + { + "epoch": 0.17375, + "grad_norm": 0.9488223410056068, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 17375 + }, + { + "epoch": 0.17376, + "grad_norm": 0.8939021914190054, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 17376 + }, + { + "epoch": 0.17377, + "grad_norm": 0.8071698421728923, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 17377 + }, + { + "epoch": 0.17378, + "grad_norm": 0.7831473096804777, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 17378 + }, + { + "epoch": 0.17379, + "grad_norm": 0.7593526077225092, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 17379 + }, + { + "epoch": 0.1738, + "grad_norm": 0.780580847798873, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 17380 + }, + { + "epoch": 0.17381, + "grad_norm": 0.8454248660344981, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 17381 + }, + { + "epoch": 0.17382, + "grad_norm": 0.9889286500472355, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 17382 + }, + { + "epoch": 0.17383, + "grad_norm": 0.9074023780782959, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 17383 + }, + { + "epoch": 0.17384, + "grad_norm": 0.8810076460238118, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17384 + }, + { + "epoch": 0.17385, + "grad_norm": 0.8953587302784513, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 17385 + }, + { + "epoch": 0.17386, + "grad_norm": 0.9250273848925091, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 17386 + }, + { + "epoch": 0.17387, + "grad_norm": 0.8891341176753061, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 17387 + }, + { + "epoch": 0.17388, + "grad_norm": 0.7799053717768336, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 17388 + }, + { + "epoch": 0.17389, + "grad_norm": 0.7298546868214523, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 17389 + }, + { + "epoch": 0.1739, + "grad_norm": 0.7766456205453687, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 17390 + }, + { + "epoch": 0.17391, + "grad_norm": 0.7500950132298999, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 17391 + }, + { + "epoch": 0.17392, + "grad_norm": 0.7613533634802031, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 17392 + }, + { + "epoch": 0.17393, + "grad_norm": 0.8939331275223772, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 17393 + }, + { + "epoch": 0.17394, + "grad_norm": 0.9631422829173715, + "learning_rate": 0.003, + "loss": 4.078, + "step": 17394 + }, + { + "epoch": 0.17395, + "grad_norm": 1.0997327445199163, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 17395 + }, + { + "epoch": 0.17396, + "grad_norm": 0.9850191451235975, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 17396 + }, + { + "epoch": 0.17397, + "grad_norm": 0.9921984540290422, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 17397 + }, + { + "epoch": 0.17398, + "grad_norm": 0.9958769014421154, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 17398 + }, + { + "epoch": 0.17399, + "grad_norm": 1.0535711570251973, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 17399 + }, + { + "epoch": 0.174, + "grad_norm": 0.8891907893217653, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 17400 + }, + { + "epoch": 0.17401, + "grad_norm": 0.915005863358091, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 17401 + }, + { + "epoch": 0.17402, + "grad_norm": 1.0196831636719055, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 17402 + }, + { + "epoch": 0.17403, + "grad_norm": 1.1226770599304043, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 17403 + }, + { + "epoch": 0.17404, + "grad_norm": 0.9806517081879206, + "learning_rate": 0.003, + "loss": 4.061, + "step": 17404 + }, + { + "epoch": 0.17405, + "grad_norm": 1.0671346149211505, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 17405 + }, + { + "epoch": 0.17406, + "grad_norm": 0.8848281560108651, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 17406 + }, + { + "epoch": 0.17407, + "grad_norm": 0.8453865737751833, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 17407 + }, + { + "epoch": 0.17408, + "grad_norm": 0.7968385644112044, + "learning_rate": 0.003, + "loss": 4.085, + "step": 17408 + }, + { + "epoch": 0.17409, + "grad_norm": 0.6942131001547551, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 17409 + }, + { + "epoch": 0.1741, + "grad_norm": 0.7163708449335096, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17410 + }, + { + "epoch": 0.17411, + "grad_norm": 0.6716894864268983, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 17411 + }, + { + "epoch": 0.17412, + "grad_norm": 0.775760487226005, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 17412 + }, + { + "epoch": 0.17413, + "grad_norm": 0.8662987353372363, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 17413 + }, + { + "epoch": 0.17414, + "grad_norm": 0.9356244272211784, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 17414 + }, + { + "epoch": 0.17415, + "grad_norm": 0.8988221550088145, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 17415 + }, + { + "epoch": 0.17416, + "grad_norm": 0.7942533406696778, + "learning_rate": 0.003, + "loss": 4.063, + "step": 17416 + }, + { + "epoch": 0.17417, + "grad_norm": 0.7416019879203941, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 17417 + }, + { + "epoch": 0.17418, + "grad_norm": 0.7069953785734495, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17418 + }, + { + "epoch": 0.17419, + "grad_norm": 0.7428780027332129, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 17419 + }, + { + "epoch": 0.1742, + "grad_norm": 0.8926578190396952, + "learning_rate": 0.003, + "loss": 4.033, + "step": 17420 + }, + { + "epoch": 0.17421, + "grad_norm": 1.0843610623028588, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 17421 + }, + { + "epoch": 0.17422, + "grad_norm": 1.038564518601075, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17422 + }, + { + "epoch": 0.17423, + "grad_norm": 0.8328262876514982, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 17423 + }, + { + "epoch": 0.17424, + "grad_norm": 0.6821089088106999, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 17424 + }, + { + "epoch": 0.17425, + "grad_norm": 0.6826071597439007, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17425 + }, + { + "epoch": 0.17426, + "grad_norm": 0.695004814553453, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 17426 + }, + { + "epoch": 0.17427, + "grad_norm": 0.6393247156055374, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 17427 + }, + { + "epoch": 0.17428, + "grad_norm": 0.637871628292266, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 17428 + }, + { + "epoch": 0.17429, + "grad_norm": 0.7777178047163732, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 17429 + }, + { + "epoch": 0.1743, + "grad_norm": 0.8661779560341816, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 17430 + }, + { + "epoch": 0.17431, + "grad_norm": 0.9412312669248412, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 17431 + }, + { + "epoch": 0.17432, + "grad_norm": 1.1525591841199643, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 17432 + }, + { + "epoch": 0.17433, + "grad_norm": 0.9383373213894208, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17433 + }, + { + "epoch": 0.17434, + "grad_norm": 0.9936019420397292, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 17434 + }, + { + "epoch": 0.17435, + "grad_norm": 0.9421004121289757, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 17435 + }, + { + "epoch": 0.17436, + "grad_norm": 0.8402850560318735, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 17436 + }, + { + "epoch": 0.17437, + "grad_norm": 0.8675813238878146, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 17437 + }, + { + "epoch": 0.17438, + "grad_norm": 0.7833673318869098, + "learning_rate": 0.003, + "loss": 4.071, + "step": 17438 + }, + { + "epoch": 0.17439, + "grad_norm": 0.6890685150324481, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17439 + }, + { + "epoch": 0.1744, + "grad_norm": 0.8281242550417617, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 17440 + }, + { + "epoch": 0.17441, + "grad_norm": 0.9063872311122002, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 17441 + }, + { + "epoch": 0.17442, + "grad_norm": 1.2388567936666337, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 17442 + }, + { + "epoch": 0.17443, + "grad_norm": 1.0132032163351703, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 17443 + }, + { + "epoch": 0.17444, + "grad_norm": 1.225345906624636, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 17444 + }, + { + "epoch": 0.17445, + "grad_norm": 0.8511032109496721, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 17445 + }, + { + "epoch": 0.17446, + "grad_norm": 0.7447706794121896, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 17446 + }, + { + "epoch": 0.17447, + "grad_norm": 0.7090157341706049, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 17447 + }, + { + "epoch": 0.17448, + "grad_norm": 0.7505360344565417, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 17448 + }, + { + "epoch": 0.17449, + "grad_norm": 0.848405949917275, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 17449 + }, + { + "epoch": 0.1745, + "grad_norm": 0.8858009254230506, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 17450 + }, + { + "epoch": 0.17451, + "grad_norm": 0.8937772087997056, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 17451 + }, + { + "epoch": 0.17452, + "grad_norm": 0.9326565300713741, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 17452 + }, + { + "epoch": 0.17453, + "grad_norm": 1.0995121363945681, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 17453 + }, + { + "epoch": 0.17454, + "grad_norm": 0.9659465972815166, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 17454 + }, + { + "epoch": 0.17455, + "grad_norm": 1.0733968597019738, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 17455 + }, + { + "epoch": 0.17456, + "grad_norm": 0.9250485837570622, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 17456 + }, + { + "epoch": 0.17457, + "grad_norm": 0.8429212214718267, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 17457 + }, + { + "epoch": 0.17458, + "grad_norm": 0.8508206086301354, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 17458 + }, + { + "epoch": 0.17459, + "grad_norm": 0.8916709654437155, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 17459 + }, + { + "epoch": 0.1746, + "grad_norm": 1.1477856385306506, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 17460 + }, + { + "epoch": 0.17461, + "grad_norm": 1.155867703290962, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 17461 + }, + { + "epoch": 0.17462, + "grad_norm": 1.0692742880481143, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 17462 + }, + { + "epoch": 0.17463, + "grad_norm": 0.990337648032987, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 17463 + }, + { + "epoch": 0.17464, + "grad_norm": 0.8268968118405826, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 17464 + }, + { + "epoch": 0.17465, + "grad_norm": 0.7154827931963107, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17465 + }, + { + "epoch": 0.17466, + "grad_norm": 0.5525510500888201, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17466 + }, + { + "epoch": 0.17467, + "grad_norm": 0.4735389124705273, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 17467 + }, + { + "epoch": 0.17468, + "grad_norm": 0.49886317222344084, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17468 + }, + { + "epoch": 0.17469, + "grad_norm": 0.5751635571493581, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 17469 + }, + { + "epoch": 0.1747, + "grad_norm": 0.7130241338777848, + "learning_rate": 0.003, + "loss": 4.077, + "step": 17470 + }, + { + "epoch": 0.17471, + "grad_norm": 0.9869723748518184, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 17471 + }, + { + "epoch": 0.17472, + "grad_norm": 1.2346072474558896, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 17472 + }, + { + "epoch": 0.17473, + "grad_norm": 0.771892335133957, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 17473 + }, + { + "epoch": 0.17474, + "grad_norm": 0.624937231162835, + "learning_rate": 0.003, + "loss": 4.051, + "step": 17474 + }, + { + "epoch": 0.17475, + "grad_norm": 0.628631224984318, + "learning_rate": 0.003, + "loss": 4.038, + "step": 17475 + }, + { + "epoch": 0.17476, + "grad_norm": 0.7573342877762853, + "learning_rate": 0.003, + "loss": 4.066, + "step": 17476 + }, + { + "epoch": 0.17477, + "grad_norm": 0.8432150934639794, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 17477 + }, + { + "epoch": 0.17478, + "grad_norm": 0.8697572738867382, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 17478 + }, + { + "epoch": 0.17479, + "grad_norm": 0.7884222181806019, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 17479 + }, + { + "epoch": 0.1748, + "grad_norm": 0.7342124982656747, + "learning_rate": 0.003, + "loss": 4.06, + "step": 17480 + }, + { + "epoch": 0.17481, + "grad_norm": 0.7582116739894746, + "learning_rate": 0.003, + "loss": 4.039, + "step": 17481 + }, + { + "epoch": 0.17482, + "grad_norm": 0.9322740002099004, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 17482 + }, + { + "epoch": 0.17483, + "grad_norm": 1.1975977579067585, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 17483 + }, + { + "epoch": 0.17484, + "grad_norm": 0.9279283498512066, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 17484 + }, + { + "epoch": 0.17485, + "grad_norm": 0.9773264109216454, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 17485 + }, + { + "epoch": 0.17486, + "grad_norm": 1.1255628443882768, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 17486 + }, + { + "epoch": 0.17487, + "grad_norm": 1.0057673704881918, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 17487 + }, + { + "epoch": 0.17488, + "grad_norm": 0.944213008071853, + "learning_rate": 0.003, + "loss": 4.057, + "step": 17488 + }, + { + "epoch": 0.17489, + "grad_norm": 0.9705320820215475, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 17489 + }, + { + "epoch": 0.1749, + "grad_norm": 1.0360271483695411, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 17490 + }, + { + "epoch": 0.17491, + "grad_norm": 1.1121761504399486, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 17491 + }, + { + "epoch": 0.17492, + "grad_norm": 0.9461535202916413, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 17492 + }, + { + "epoch": 0.17493, + "grad_norm": 0.9638805346180465, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 17493 + }, + { + "epoch": 0.17494, + "grad_norm": 0.9806457833729898, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 17494 + }, + { + "epoch": 0.17495, + "grad_norm": 1.0116307502999224, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 17495 + }, + { + "epoch": 0.17496, + "grad_norm": 1.0115411892178496, + "learning_rate": 0.003, + "loss": 4.053, + "step": 17496 + }, + { + "epoch": 0.17497, + "grad_norm": 1.1104635419976652, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 17497 + }, + { + "epoch": 0.17498, + "grad_norm": 0.9160038775468329, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 17498 + }, + { + "epoch": 0.17499, + "grad_norm": 0.8858753450072356, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 17499 + }, + { + "epoch": 0.175, + "grad_norm": 0.8965716533669236, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 17500 + }, + { + "epoch": 0.17501, + "grad_norm": 0.9137417740197247, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17501 + }, + { + "epoch": 0.17502, + "grad_norm": 0.8679411526719324, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 17502 + }, + { + "epoch": 0.17503, + "grad_norm": 0.7832916001414321, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 17503 + }, + { + "epoch": 0.17504, + "grad_norm": 0.8647366432455418, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 17504 + }, + { + "epoch": 0.17505, + "grad_norm": 0.924796909872522, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 17505 + }, + { + "epoch": 0.17506, + "grad_norm": 1.0500999561969173, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 17506 + }, + { + "epoch": 0.17507, + "grad_norm": 0.9968136172953102, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17507 + }, + { + "epoch": 0.17508, + "grad_norm": 1.0441344830453017, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 17508 + }, + { + "epoch": 0.17509, + "grad_norm": 0.9907929715900188, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 17509 + }, + { + "epoch": 0.1751, + "grad_norm": 1.0625363858834336, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 17510 + }, + { + "epoch": 0.17511, + "grad_norm": 0.9310743830942303, + "learning_rate": 0.003, + "loss": 4.061, + "step": 17511 + }, + { + "epoch": 0.17512, + "grad_norm": 0.9430515190103738, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 17512 + }, + { + "epoch": 0.17513, + "grad_norm": 0.946283325212689, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 17513 + }, + { + "epoch": 0.17514, + "grad_norm": 0.9539086214780715, + "learning_rate": 0.003, + "loss": 4.085, + "step": 17514 + }, + { + "epoch": 0.17515, + "grad_norm": 0.9192559681004006, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 17515 + }, + { + "epoch": 0.17516, + "grad_norm": 0.9613466243063407, + "learning_rate": 0.003, + "loss": 4.067, + "step": 17516 + }, + { + "epoch": 0.17517, + "grad_norm": 0.855057663188413, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 17517 + }, + { + "epoch": 0.17518, + "grad_norm": 0.7819737238790523, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 17518 + }, + { + "epoch": 0.17519, + "grad_norm": 0.7894929215269617, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 17519 + }, + { + "epoch": 0.1752, + "grad_norm": 0.7785174026098362, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 17520 + }, + { + "epoch": 0.17521, + "grad_norm": 0.6451900631314694, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 17521 + }, + { + "epoch": 0.17522, + "grad_norm": 0.7348322208828698, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 17522 + }, + { + "epoch": 0.17523, + "grad_norm": 0.787242442960829, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17523 + }, + { + "epoch": 0.17524, + "grad_norm": 0.8435581819140278, + "learning_rate": 0.003, + "loss": 4.063, + "step": 17524 + }, + { + "epoch": 0.17525, + "grad_norm": 1.0518817907549147, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17525 + }, + { + "epoch": 0.17526, + "grad_norm": 1.072043433733588, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 17526 + }, + { + "epoch": 0.17527, + "grad_norm": 1.0008141177785768, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 17527 + }, + { + "epoch": 0.17528, + "grad_norm": 0.8761648169689856, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 17528 + }, + { + "epoch": 0.17529, + "grad_norm": 0.808953566953239, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 17529 + }, + { + "epoch": 0.1753, + "grad_norm": 1.0491460634838619, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 17530 + }, + { + "epoch": 0.17531, + "grad_norm": 1.1254342282308454, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 17531 + }, + { + "epoch": 0.17532, + "grad_norm": 0.9106072703230489, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 17532 + }, + { + "epoch": 0.17533, + "grad_norm": 0.8459672349588316, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 17533 + }, + { + "epoch": 0.17534, + "grad_norm": 0.7859129329620995, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 17534 + }, + { + "epoch": 0.17535, + "grad_norm": 0.7164187780365021, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 17535 + }, + { + "epoch": 0.17536, + "grad_norm": 0.6758578745030279, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 17536 + }, + { + "epoch": 0.17537, + "grad_norm": 0.6301918071033128, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 17537 + }, + { + "epoch": 0.17538, + "grad_norm": 0.6946016540411071, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 17538 + }, + { + "epoch": 0.17539, + "grad_norm": 0.9137357060371742, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 17539 + }, + { + "epoch": 0.1754, + "grad_norm": 1.2438535308097833, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 17540 + }, + { + "epoch": 0.17541, + "grad_norm": 0.8563545188060449, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 17541 + }, + { + "epoch": 0.17542, + "grad_norm": 0.585566432087187, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 17542 + }, + { + "epoch": 0.17543, + "grad_norm": 0.6158963241099712, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 17543 + }, + { + "epoch": 0.17544, + "grad_norm": 0.7051359847712503, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 17544 + }, + { + "epoch": 0.17545, + "grad_norm": 0.7906001545104807, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 17545 + }, + { + "epoch": 0.17546, + "grad_norm": 0.8228927302192418, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 17546 + }, + { + "epoch": 0.17547, + "grad_norm": 0.7972687877468351, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 17547 + }, + { + "epoch": 0.17548, + "grad_norm": 0.7823780064342654, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 17548 + }, + { + "epoch": 0.17549, + "grad_norm": 0.7672987644721564, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 17549 + }, + { + "epoch": 0.1755, + "grad_norm": 0.8392083698199194, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 17550 + }, + { + "epoch": 0.17551, + "grad_norm": 0.7915752478764481, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 17551 + }, + { + "epoch": 0.17552, + "grad_norm": 0.7277287945535047, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 17552 + }, + { + "epoch": 0.17553, + "grad_norm": 0.6824068623893177, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 17553 + }, + { + "epoch": 0.17554, + "grad_norm": 0.6826232765114713, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 17554 + }, + { + "epoch": 0.17555, + "grad_norm": 0.7615756928790316, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 17555 + }, + { + "epoch": 0.17556, + "grad_norm": 0.8903101580904664, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17556 + }, + { + "epoch": 0.17557, + "grad_norm": 1.259822393119585, + "learning_rate": 0.003, + "loss": 4.055, + "step": 17557 + }, + { + "epoch": 0.17558, + "grad_norm": 0.8839182258854181, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 17558 + }, + { + "epoch": 0.17559, + "grad_norm": 0.923864898305606, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 17559 + }, + { + "epoch": 0.1756, + "grad_norm": 0.9402325745136028, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 17560 + }, + { + "epoch": 0.17561, + "grad_norm": 1.1413235289170909, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 17561 + }, + { + "epoch": 0.17562, + "grad_norm": 1.075106099535397, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 17562 + }, + { + "epoch": 0.17563, + "grad_norm": 0.9673705072801845, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 17563 + }, + { + "epoch": 0.17564, + "grad_norm": 1.0466206078148117, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 17564 + }, + { + "epoch": 0.17565, + "grad_norm": 1.0182783625602205, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 17565 + }, + { + "epoch": 0.17566, + "grad_norm": 1.0326103998641878, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17566 + }, + { + "epoch": 0.17567, + "grad_norm": 0.9968830108954899, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 17567 + }, + { + "epoch": 0.17568, + "grad_norm": 0.9007540450812659, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 17568 + }, + { + "epoch": 0.17569, + "grad_norm": 0.8979482803940627, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 17569 + }, + { + "epoch": 0.1757, + "grad_norm": 0.9629958494496546, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 17570 + }, + { + "epoch": 0.17571, + "grad_norm": 1.0521231509896205, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 17571 + }, + { + "epoch": 0.17572, + "grad_norm": 0.9850792220121751, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 17572 + }, + { + "epoch": 0.17573, + "grad_norm": 0.8663062852170624, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 17573 + }, + { + "epoch": 0.17574, + "grad_norm": 0.8124220092050762, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 17574 + }, + { + "epoch": 0.17575, + "grad_norm": 0.8607388898653316, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 17575 + }, + { + "epoch": 0.17576, + "grad_norm": 0.901962162034722, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 17576 + }, + { + "epoch": 0.17577, + "grad_norm": 1.2222671148394237, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 17577 + }, + { + "epoch": 0.17578, + "grad_norm": 1.019216447215782, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 17578 + }, + { + "epoch": 0.17579, + "grad_norm": 0.9295758215508766, + "learning_rate": 0.003, + "loss": 4.074, + "step": 17579 + }, + { + "epoch": 0.1758, + "grad_norm": 0.837650754271571, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 17580 + }, + { + "epoch": 0.17581, + "grad_norm": 0.9027296338426551, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 17581 + }, + { + "epoch": 0.17582, + "grad_norm": 1.0045155133886974, + "learning_rate": 0.003, + "loss": 4.1029, + "step": 17582 + }, + { + "epoch": 0.17583, + "grad_norm": 0.8359872382161975, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 17583 + }, + { + "epoch": 0.17584, + "grad_norm": 0.7548350159816791, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 17584 + }, + { + "epoch": 0.17585, + "grad_norm": 0.7254987194509632, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 17585 + }, + { + "epoch": 0.17586, + "grad_norm": 0.6495839319516847, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 17586 + }, + { + "epoch": 0.17587, + "grad_norm": 0.6828203570598029, + "learning_rate": 0.003, + "loss": 4.071, + "step": 17587 + }, + { + "epoch": 0.17588, + "grad_norm": 0.7517051766671944, + "learning_rate": 0.003, + "loss": 4.1027, + "step": 17588 + }, + { + "epoch": 0.17589, + "grad_norm": 0.7495410651746253, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 17589 + }, + { + "epoch": 0.1759, + "grad_norm": 0.6624188009957733, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 17590 + }, + { + "epoch": 0.17591, + "grad_norm": 0.7281784386661555, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 17591 + }, + { + "epoch": 0.17592, + "grad_norm": 0.8842637341012457, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 17592 + }, + { + "epoch": 0.17593, + "grad_norm": 1.139081154896682, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 17593 + }, + { + "epoch": 0.17594, + "grad_norm": 1.0886166939298445, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 17594 + }, + { + "epoch": 0.17595, + "grad_norm": 0.8427242543047653, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 17595 + }, + { + "epoch": 0.17596, + "grad_norm": 0.8662505373315522, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 17596 + }, + { + "epoch": 0.17597, + "grad_norm": 0.8607201150234504, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 17597 + }, + { + "epoch": 0.17598, + "grad_norm": 0.7840325764956231, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 17598 + }, + { + "epoch": 0.17599, + "grad_norm": 0.7577531649744832, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 17599 + }, + { + "epoch": 0.176, + "grad_norm": 0.6664602135017921, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 17600 + }, + { + "epoch": 0.17601, + "grad_norm": 0.6938148426517162, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 17601 + }, + { + "epoch": 0.17602, + "grad_norm": 0.6539775755754308, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 17602 + }, + { + "epoch": 0.17603, + "grad_norm": 0.6069523128590745, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 17603 + }, + { + "epoch": 0.17604, + "grad_norm": 0.6773916182111083, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 17604 + }, + { + "epoch": 0.17605, + "grad_norm": 1.0012014432715137, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 17605 + }, + { + "epoch": 0.17606, + "grad_norm": 1.4370955588934409, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 17606 + }, + { + "epoch": 0.17607, + "grad_norm": 0.6531011281218092, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 17607 + }, + { + "epoch": 0.17608, + "grad_norm": 0.7464058321070703, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 17608 + }, + { + "epoch": 0.17609, + "grad_norm": 0.7218063876980279, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 17609 + }, + { + "epoch": 0.1761, + "grad_norm": 0.8173621803491683, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 17610 + }, + { + "epoch": 0.17611, + "grad_norm": 0.9427938699169145, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 17611 + }, + { + "epoch": 0.17612, + "grad_norm": 1.0849678597089432, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 17612 + }, + { + "epoch": 0.17613, + "grad_norm": 0.9679820228531881, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 17613 + }, + { + "epoch": 0.17614, + "grad_norm": 0.8665253923119886, + "learning_rate": 0.003, + "loss": 4.034, + "step": 17614 + }, + { + "epoch": 0.17615, + "grad_norm": 0.9042479329654656, + "learning_rate": 0.003, + "loss": 4.037, + "step": 17615 + }, + { + "epoch": 0.17616, + "grad_norm": 0.9616079844836394, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 17616 + }, + { + "epoch": 0.17617, + "grad_norm": 1.0036374450591588, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 17617 + }, + { + "epoch": 0.17618, + "grad_norm": 1.2022344861569625, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 17618 + }, + { + "epoch": 0.17619, + "grad_norm": 0.947988080738896, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 17619 + }, + { + "epoch": 0.1762, + "grad_norm": 0.9869514541150077, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 17620 + }, + { + "epoch": 0.17621, + "grad_norm": 1.1205504433298326, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 17621 + }, + { + "epoch": 0.17622, + "grad_norm": 0.9509252371225612, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 17622 + }, + { + "epoch": 0.17623, + "grad_norm": 0.9135463708548369, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 17623 + }, + { + "epoch": 0.17624, + "grad_norm": 0.9731374729663851, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 17624 + }, + { + "epoch": 0.17625, + "grad_norm": 1.0870998953977447, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 17625 + }, + { + "epoch": 0.17626, + "grad_norm": 0.9541337789351465, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 17626 + }, + { + "epoch": 0.17627, + "grad_norm": 1.0374954049945748, + "learning_rate": 0.003, + "loss": 4.1013, + "step": 17627 + }, + { + "epoch": 0.17628, + "grad_norm": 0.9253188613706079, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17628 + }, + { + "epoch": 0.17629, + "grad_norm": 0.9231641047895617, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 17629 + }, + { + "epoch": 0.1763, + "grad_norm": 0.7818472035667626, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 17630 + }, + { + "epoch": 0.17631, + "grad_norm": 0.8139538974411653, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 17631 + }, + { + "epoch": 0.17632, + "grad_norm": 0.8106230937634593, + "learning_rate": 0.003, + "loss": 4.1212, + "step": 17632 + }, + { + "epoch": 0.17633, + "grad_norm": 0.6904086027683515, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 17633 + }, + { + "epoch": 0.17634, + "grad_norm": 0.6876779716839564, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 17634 + }, + { + "epoch": 0.17635, + "grad_norm": 0.7604731473197992, + "learning_rate": 0.003, + "loss": 4.05, + "step": 17635 + }, + { + "epoch": 0.17636, + "grad_norm": 0.9055509128125596, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 17636 + }, + { + "epoch": 0.17637, + "grad_norm": 0.827680104335568, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 17637 + }, + { + "epoch": 0.17638, + "grad_norm": 0.6740103740480011, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 17638 + }, + { + "epoch": 0.17639, + "grad_norm": 0.6649361019173743, + "learning_rate": 0.003, + "loss": 4.06, + "step": 17639 + }, + { + "epoch": 0.1764, + "grad_norm": 0.8923943342551018, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 17640 + }, + { + "epoch": 0.17641, + "grad_norm": 1.2564295114377186, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17641 + }, + { + "epoch": 0.17642, + "grad_norm": 1.116295566826262, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 17642 + }, + { + "epoch": 0.17643, + "grad_norm": 0.8359441106024887, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 17643 + }, + { + "epoch": 0.17644, + "grad_norm": 0.7950901805727237, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 17644 + }, + { + "epoch": 0.17645, + "grad_norm": 0.7428549364789354, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 17645 + }, + { + "epoch": 0.17646, + "grad_norm": 0.6895843602383841, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17646 + }, + { + "epoch": 0.17647, + "grad_norm": 0.7174011282035472, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 17647 + }, + { + "epoch": 0.17648, + "grad_norm": 0.7753711222624593, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 17648 + }, + { + "epoch": 0.17649, + "grad_norm": 0.8521666068533342, + "learning_rate": 0.003, + "loss": 4.041, + "step": 17649 + }, + { + "epoch": 0.1765, + "grad_norm": 1.0663392302108505, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 17650 + }, + { + "epoch": 0.17651, + "grad_norm": 1.0774798343451986, + "learning_rate": 0.003, + "loss": 4.063, + "step": 17651 + }, + { + "epoch": 0.17652, + "grad_norm": 1.0407904728304735, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 17652 + }, + { + "epoch": 0.17653, + "grad_norm": 0.8356324203494921, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 17653 + }, + { + "epoch": 0.17654, + "grad_norm": 0.6126536594835358, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 17654 + }, + { + "epoch": 0.17655, + "grad_norm": 0.6086643446139258, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 17655 + }, + { + "epoch": 0.17656, + "grad_norm": 0.6849985521799608, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 17656 + }, + { + "epoch": 0.17657, + "grad_norm": 0.7664003335440541, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 17657 + }, + { + "epoch": 0.17658, + "grad_norm": 0.9162724064067518, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 17658 + }, + { + "epoch": 0.17659, + "grad_norm": 1.1762867318446273, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17659 + }, + { + "epoch": 0.1766, + "grad_norm": 1.171536516985786, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 17660 + }, + { + "epoch": 0.17661, + "grad_norm": 0.8788337587723518, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 17661 + }, + { + "epoch": 0.17662, + "grad_norm": 0.8182153037944079, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 17662 + }, + { + "epoch": 0.17663, + "grad_norm": 0.7129332044919899, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 17663 + }, + { + "epoch": 0.17664, + "grad_norm": 0.7108767454704167, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 17664 + }, + { + "epoch": 0.17665, + "grad_norm": 0.7158462859575461, + "learning_rate": 0.003, + "loss": 4.067, + "step": 17665 + }, + { + "epoch": 0.17666, + "grad_norm": 0.8142065368066004, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 17666 + }, + { + "epoch": 0.17667, + "grad_norm": 1.1001353519020953, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 17667 + }, + { + "epoch": 0.17668, + "grad_norm": 1.125336827208438, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 17668 + }, + { + "epoch": 0.17669, + "grad_norm": 0.9262014892175805, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 17669 + }, + { + "epoch": 0.1767, + "grad_norm": 0.9351966265618215, + "learning_rate": 0.003, + "loss": 4.05, + "step": 17670 + }, + { + "epoch": 0.17671, + "grad_norm": 1.0079906903639217, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 17671 + }, + { + "epoch": 0.17672, + "grad_norm": 1.1162098820290083, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 17672 + }, + { + "epoch": 0.17673, + "grad_norm": 0.8669269208345238, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 17673 + }, + { + "epoch": 0.17674, + "grad_norm": 0.8963895136624581, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 17674 + }, + { + "epoch": 0.17675, + "grad_norm": 0.9956867098591692, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 17675 + }, + { + "epoch": 0.17676, + "grad_norm": 1.0116655226855777, + "learning_rate": 0.003, + "loss": 4.077, + "step": 17676 + }, + { + "epoch": 0.17677, + "grad_norm": 0.9397732140747828, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 17677 + }, + { + "epoch": 0.17678, + "grad_norm": 0.9501246526436264, + "learning_rate": 0.003, + "loss": 4.1023, + "step": 17678 + }, + { + "epoch": 0.17679, + "grad_norm": 0.907004381747051, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 17679 + }, + { + "epoch": 0.1768, + "grad_norm": 0.9668483402391215, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 17680 + }, + { + "epoch": 0.17681, + "grad_norm": 1.1724624604996552, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 17681 + }, + { + "epoch": 0.17682, + "grad_norm": 0.9348259883600136, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 17682 + }, + { + "epoch": 0.17683, + "grad_norm": 0.9512258847754369, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 17683 + }, + { + "epoch": 0.17684, + "grad_norm": 0.9823492002675904, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 17684 + }, + { + "epoch": 0.17685, + "grad_norm": 0.9649539217405146, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 17685 + }, + { + "epoch": 0.17686, + "grad_norm": 0.9478093402862565, + "learning_rate": 0.003, + "loss": 4.081, + "step": 17686 + }, + { + "epoch": 0.17687, + "grad_norm": 1.0166366580765052, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 17687 + }, + { + "epoch": 0.17688, + "grad_norm": 1.0379627024703006, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 17688 + }, + { + "epoch": 0.17689, + "grad_norm": 1.0749147037028024, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 17689 + }, + { + "epoch": 0.1769, + "grad_norm": 0.9386869748228578, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 17690 + }, + { + "epoch": 0.17691, + "grad_norm": 1.0263503233986766, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 17691 + }, + { + "epoch": 0.17692, + "grad_norm": 1.2009848361111113, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 17692 + }, + { + "epoch": 0.17693, + "grad_norm": 0.9468106159021873, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 17693 + }, + { + "epoch": 0.17694, + "grad_norm": 0.7928861683500125, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 17694 + }, + { + "epoch": 0.17695, + "grad_norm": 0.713073859342127, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 17695 + }, + { + "epoch": 0.17696, + "grad_norm": 0.5780474132367384, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 17696 + }, + { + "epoch": 0.17697, + "grad_norm": 0.4888099583148028, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 17697 + }, + { + "epoch": 0.17698, + "grad_norm": 0.5240675538308072, + "learning_rate": 0.003, + "loss": 4.062, + "step": 17698 + }, + { + "epoch": 0.17699, + "grad_norm": 0.5808100200357558, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 17699 + }, + { + "epoch": 0.177, + "grad_norm": 0.652105814167825, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 17700 + }, + { + "epoch": 0.17701, + "grad_norm": 0.6724715074058332, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 17701 + }, + { + "epoch": 0.17702, + "grad_norm": 0.6273551136784689, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 17702 + }, + { + "epoch": 0.17703, + "grad_norm": 0.7188470214720498, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 17703 + }, + { + "epoch": 0.17704, + "grad_norm": 0.9196777006848662, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 17704 + }, + { + "epoch": 0.17705, + "grad_norm": 1.233604016695256, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 17705 + }, + { + "epoch": 0.17706, + "grad_norm": 0.8099758682921231, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 17706 + }, + { + "epoch": 0.17707, + "grad_norm": 0.7922889457768697, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 17707 + }, + { + "epoch": 0.17708, + "grad_norm": 0.9294910034701283, + "learning_rate": 0.003, + "loss": 4.051, + "step": 17708 + }, + { + "epoch": 0.17709, + "grad_norm": 0.9634719340790691, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 17709 + }, + { + "epoch": 0.1771, + "grad_norm": 0.8542883355439383, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 17710 + }, + { + "epoch": 0.17711, + "grad_norm": 1.008147096757313, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 17711 + }, + { + "epoch": 0.17712, + "grad_norm": 0.9995776782234835, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 17712 + }, + { + "epoch": 0.17713, + "grad_norm": 1.035762142246252, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17713 + }, + { + "epoch": 0.17714, + "grad_norm": 1.0812204223566275, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 17714 + }, + { + "epoch": 0.17715, + "grad_norm": 1.0160003851823913, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 17715 + }, + { + "epoch": 0.17716, + "grad_norm": 0.9901224019194541, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 17716 + }, + { + "epoch": 0.17717, + "grad_norm": 1.0105523828866692, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 17717 + }, + { + "epoch": 0.17718, + "grad_norm": 0.8803703923794938, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 17718 + }, + { + "epoch": 0.17719, + "grad_norm": 0.8475179522916448, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 17719 + }, + { + "epoch": 0.1772, + "grad_norm": 0.8505723124785796, + "learning_rate": 0.003, + "loss": 4.052, + "step": 17720 + }, + { + "epoch": 0.17721, + "grad_norm": 0.8355218228494463, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 17721 + }, + { + "epoch": 0.17722, + "grad_norm": 0.7703889576636893, + "learning_rate": 0.003, + "loss": 4.036, + "step": 17722 + }, + { + "epoch": 0.17723, + "grad_norm": 0.8495813991491132, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 17723 + }, + { + "epoch": 0.17724, + "grad_norm": 0.9134821544671048, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 17724 + }, + { + "epoch": 0.17725, + "grad_norm": 0.8144739276381116, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 17725 + }, + { + "epoch": 0.17726, + "grad_norm": 0.6939416908790277, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 17726 + }, + { + "epoch": 0.17727, + "grad_norm": 0.6454072470607094, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 17727 + }, + { + "epoch": 0.17728, + "grad_norm": 0.7014165435036105, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 17728 + }, + { + "epoch": 0.17729, + "grad_norm": 0.7311048415725621, + "learning_rate": 0.003, + "loss": 4.06, + "step": 17729 + }, + { + "epoch": 0.1773, + "grad_norm": 0.8872622358163939, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 17730 + }, + { + "epoch": 0.17731, + "grad_norm": 1.2137089571607587, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 17731 + }, + { + "epoch": 0.17732, + "grad_norm": 0.9294622943031654, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 17732 + }, + { + "epoch": 0.17733, + "grad_norm": 0.7542301139801861, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 17733 + }, + { + "epoch": 0.17734, + "grad_norm": 0.6185305422801383, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 17734 + }, + { + "epoch": 0.17735, + "grad_norm": 0.6137864649173381, + "learning_rate": 0.003, + "loss": 4.052, + "step": 17735 + }, + { + "epoch": 0.17736, + "grad_norm": 0.6659455575828724, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 17736 + }, + { + "epoch": 0.17737, + "grad_norm": 0.8345702722822673, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 17737 + }, + { + "epoch": 0.17738, + "grad_norm": 0.9022353673728907, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 17738 + }, + { + "epoch": 0.17739, + "grad_norm": 0.8191784473938587, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 17739 + }, + { + "epoch": 0.1774, + "grad_norm": 0.8883865655499635, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 17740 + }, + { + "epoch": 0.17741, + "grad_norm": 0.9811315410758518, + "learning_rate": 0.003, + "loss": 4.07, + "step": 17741 + }, + { + "epoch": 0.17742, + "grad_norm": 1.2013511256150107, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 17742 + }, + { + "epoch": 0.17743, + "grad_norm": 1.092672255113907, + "learning_rate": 0.003, + "loss": 4.106, + "step": 17743 + }, + { + "epoch": 0.17744, + "grad_norm": 0.9219106015146562, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 17744 + }, + { + "epoch": 0.17745, + "grad_norm": 0.812974922409387, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 17745 + }, + { + "epoch": 0.17746, + "grad_norm": 0.8316941146679137, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 17746 + }, + { + "epoch": 0.17747, + "grad_norm": 0.9060983849781237, + "learning_rate": 0.003, + "loss": 4.065, + "step": 17747 + }, + { + "epoch": 0.17748, + "grad_norm": 0.889852248776499, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 17748 + }, + { + "epoch": 0.17749, + "grad_norm": 0.7936664451323253, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 17749 + }, + { + "epoch": 0.1775, + "grad_norm": 0.7516260574375117, + "learning_rate": 0.003, + "loss": 4.061, + "step": 17750 + }, + { + "epoch": 0.17751, + "grad_norm": 0.6923171146385486, + "learning_rate": 0.003, + "loss": 4.079, + "step": 17751 + }, + { + "epoch": 0.17752, + "grad_norm": 0.6154399217859506, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 17752 + }, + { + "epoch": 0.17753, + "grad_norm": 0.5548021200243216, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 17753 + }, + { + "epoch": 0.17754, + "grad_norm": 0.6382749059286056, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 17754 + }, + { + "epoch": 0.17755, + "grad_norm": 0.6703350674066354, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 17755 + }, + { + "epoch": 0.17756, + "grad_norm": 0.7765406248457878, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 17756 + }, + { + "epoch": 0.17757, + "grad_norm": 0.9874011405256912, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 17757 + }, + { + "epoch": 0.17758, + "grad_norm": 1.1864614021848556, + "learning_rate": 0.003, + "loss": 4.081, + "step": 17758 + }, + { + "epoch": 0.17759, + "grad_norm": 0.7966573306432746, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 17759 + }, + { + "epoch": 0.1776, + "grad_norm": 0.8550868859423417, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 17760 + }, + { + "epoch": 0.17761, + "grad_norm": 0.8653694868982882, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 17761 + }, + { + "epoch": 0.17762, + "grad_norm": 1.13111863908971, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 17762 + }, + { + "epoch": 0.17763, + "grad_norm": 1.369890814521613, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 17763 + }, + { + "epoch": 0.17764, + "grad_norm": 0.9022243922006958, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 17764 + }, + { + "epoch": 0.17765, + "grad_norm": 1.0692650764857046, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 17765 + }, + { + "epoch": 0.17766, + "grad_norm": 1.047074446704455, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 17766 + }, + { + "epoch": 0.17767, + "grad_norm": 0.9448472426070241, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 17767 + }, + { + "epoch": 0.17768, + "grad_norm": 0.965255522704598, + "learning_rate": 0.003, + "loss": 4.051, + "step": 17768 + }, + { + "epoch": 0.17769, + "grad_norm": 0.9941641729165432, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 17769 + }, + { + "epoch": 0.1777, + "grad_norm": 0.9188787222419813, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 17770 + }, + { + "epoch": 0.17771, + "grad_norm": 0.7719858937803888, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 17771 + }, + { + "epoch": 0.17772, + "grad_norm": 0.8321469503134653, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 17772 + }, + { + "epoch": 0.17773, + "grad_norm": 0.8980540599473181, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 17773 + }, + { + "epoch": 0.17774, + "grad_norm": 1.1251410166996434, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 17774 + }, + { + "epoch": 0.17775, + "grad_norm": 0.8668630414936879, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 17775 + }, + { + "epoch": 0.17776, + "grad_norm": 0.7228707330959857, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 17776 + }, + { + "epoch": 0.17777, + "grad_norm": 0.7450778998710901, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 17777 + }, + { + "epoch": 0.17778, + "grad_norm": 0.6943377392989274, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 17778 + }, + { + "epoch": 0.17779, + "grad_norm": 0.7187853790848213, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 17779 + }, + { + "epoch": 0.1778, + "grad_norm": 0.7349671084053078, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 17780 + }, + { + "epoch": 0.17781, + "grad_norm": 0.711743364709573, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 17781 + }, + { + "epoch": 0.17782, + "grad_norm": 0.8813112801703069, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 17782 + }, + { + "epoch": 0.17783, + "grad_norm": 1.0788721319362986, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 17783 + }, + { + "epoch": 0.17784, + "grad_norm": 0.9905193408532994, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 17784 + }, + { + "epoch": 0.17785, + "grad_norm": 1.0033328942452602, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17785 + }, + { + "epoch": 0.17786, + "grad_norm": 1.1187785421076482, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 17786 + }, + { + "epoch": 0.17787, + "grad_norm": 1.0117980388825438, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 17787 + }, + { + "epoch": 0.17788, + "grad_norm": 0.9705070010981849, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 17788 + }, + { + "epoch": 0.17789, + "grad_norm": 0.7926296671649389, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 17789 + }, + { + "epoch": 0.1779, + "grad_norm": 0.8256372867819184, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 17790 + }, + { + "epoch": 0.17791, + "grad_norm": 1.0366312697839044, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 17791 + }, + { + "epoch": 0.17792, + "grad_norm": 1.0290880745328101, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 17792 + }, + { + "epoch": 0.17793, + "grad_norm": 0.9556266395700164, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 17793 + }, + { + "epoch": 0.17794, + "grad_norm": 1.0961147248935896, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 17794 + }, + { + "epoch": 0.17795, + "grad_norm": 1.2133294082716444, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 17795 + }, + { + "epoch": 0.17796, + "grad_norm": 1.0670244159468336, + "learning_rate": 0.003, + "loss": 4.053, + "step": 17796 + }, + { + "epoch": 0.17797, + "grad_norm": 0.9168176835378857, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 17797 + }, + { + "epoch": 0.17798, + "grad_norm": 0.8872025151479986, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 17798 + }, + { + "epoch": 0.17799, + "grad_norm": 0.8922375868449132, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 17799 + }, + { + "epoch": 0.178, + "grad_norm": 0.9132471119186745, + "learning_rate": 0.003, + "loss": 4.075, + "step": 17800 + }, + { + "epoch": 0.17801, + "grad_norm": 1.012904222596664, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 17801 + }, + { + "epoch": 0.17802, + "grad_norm": 0.9200202318398326, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 17802 + }, + { + "epoch": 0.17803, + "grad_norm": 0.9801841750868228, + "learning_rate": 0.003, + "loss": 4.081, + "step": 17803 + }, + { + "epoch": 0.17804, + "grad_norm": 1.1263800660087637, + "learning_rate": 0.003, + "loss": 4.091, + "step": 17804 + }, + { + "epoch": 0.17805, + "grad_norm": 0.9465104112523818, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 17805 + }, + { + "epoch": 0.17806, + "grad_norm": 0.7347905293908259, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 17806 + }, + { + "epoch": 0.17807, + "grad_norm": 0.6378036178893862, + "learning_rate": 0.003, + "loss": 4.055, + "step": 17807 + }, + { + "epoch": 0.17808, + "grad_norm": 0.6231379764645049, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 17808 + }, + { + "epoch": 0.17809, + "grad_norm": 0.574030254558826, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 17809 + }, + { + "epoch": 0.1781, + "grad_norm": 0.6262467973941603, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 17810 + }, + { + "epoch": 0.17811, + "grad_norm": 0.6503924772853084, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17811 + }, + { + "epoch": 0.17812, + "grad_norm": 0.6428411925342502, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 17812 + }, + { + "epoch": 0.17813, + "grad_norm": 0.7227626352575178, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 17813 + }, + { + "epoch": 0.17814, + "grad_norm": 0.9585038957744706, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 17814 + }, + { + "epoch": 0.17815, + "grad_norm": 1.136555587105889, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 17815 + }, + { + "epoch": 0.17816, + "grad_norm": 0.9097401319619741, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 17816 + }, + { + "epoch": 0.17817, + "grad_norm": 0.9773536311525911, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 17817 + }, + { + "epoch": 0.17818, + "grad_norm": 1.0389111413400334, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 17818 + }, + { + "epoch": 0.17819, + "grad_norm": 1.1815291234110734, + "learning_rate": 0.003, + "loss": 4.087, + "step": 17819 + }, + { + "epoch": 0.1782, + "grad_norm": 0.91207429700373, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 17820 + }, + { + "epoch": 0.17821, + "grad_norm": 0.7860232391886783, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 17821 + }, + { + "epoch": 0.17822, + "grad_norm": 0.7035842037145283, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17822 + }, + { + "epoch": 0.17823, + "grad_norm": 0.6840969647439207, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 17823 + }, + { + "epoch": 0.17824, + "grad_norm": 0.6217284845722004, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 17824 + }, + { + "epoch": 0.17825, + "grad_norm": 0.6704324314407434, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 17825 + }, + { + "epoch": 0.17826, + "grad_norm": 0.8147753576290412, + "learning_rate": 0.003, + "loss": 4.062, + "step": 17826 + }, + { + "epoch": 0.17827, + "grad_norm": 1.0191376389186433, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17827 + }, + { + "epoch": 0.17828, + "grad_norm": 1.0841394308763148, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 17828 + }, + { + "epoch": 0.17829, + "grad_norm": 0.8344426691123225, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 17829 + }, + { + "epoch": 0.1783, + "grad_norm": 0.7544609278945689, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 17830 + }, + { + "epoch": 0.17831, + "grad_norm": 0.703177202987652, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 17831 + }, + { + "epoch": 0.17832, + "grad_norm": 0.8088154018493255, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 17832 + }, + { + "epoch": 0.17833, + "grad_norm": 0.846507696887801, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 17833 + }, + { + "epoch": 0.17834, + "grad_norm": 0.9479917459460131, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 17834 + }, + { + "epoch": 0.17835, + "grad_norm": 1.0166749442753464, + "learning_rate": 0.003, + "loss": 4.075, + "step": 17835 + }, + { + "epoch": 0.17836, + "grad_norm": 0.9724725426647024, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 17836 + }, + { + "epoch": 0.17837, + "grad_norm": 0.9442101310692572, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 17837 + }, + { + "epoch": 0.17838, + "grad_norm": 0.8850063830339444, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 17838 + }, + { + "epoch": 0.17839, + "grad_norm": 0.9788694520353807, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 17839 + }, + { + "epoch": 0.1784, + "grad_norm": 1.0457320811640962, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 17840 + }, + { + "epoch": 0.17841, + "grad_norm": 0.8789040112559586, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 17841 + }, + { + "epoch": 0.17842, + "grad_norm": 0.9464363365637045, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 17842 + }, + { + "epoch": 0.17843, + "grad_norm": 1.029546704797995, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 17843 + }, + { + "epoch": 0.17844, + "grad_norm": 1.1172100045966578, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 17844 + }, + { + "epoch": 0.17845, + "grad_norm": 1.0220370572765092, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 17845 + }, + { + "epoch": 0.17846, + "grad_norm": 0.9969638416217529, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 17846 + }, + { + "epoch": 0.17847, + "grad_norm": 0.988076425895647, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 17847 + }, + { + "epoch": 0.17848, + "grad_norm": 0.8811541259749696, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17848 + }, + { + "epoch": 0.17849, + "grad_norm": 0.8635706958060115, + "learning_rate": 0.003, + "loss": 4.096, + "step": 17849 + }, + { + "epoch": 0.1785, + "grad_norm": 0.892420935401702, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 17850 + }, + { + "epoch": 0.17851, + "grad_norm": 1.016620971248911, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 17851 + }, + { + "epoch": 0.17852, + "grad_norm": 1.146981148203576, + "learning_rate": 0.003, + "loss": 4.084, + "step": 17852 + }, + { + "epoch": 0.17853, + "grad_norm": 0.9661364880061537, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 17853 + }, + { + "epoch": 0.17854, + "grad_norm": 1.030066776504963, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 17854 + }, + { + "epoch": 0.17855, + "grad_norm": 1.0163877980167015, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 17855 + }, + { + "epoch": 0.17856, + "grad_norm": 1.0598006866318361, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 17856 + }, + { + "epoch": 0.17857, + "grad_norm": 1.11541742937424, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 17857 + }, + { + "epoch": 0.17858, + "grad_norm": 0.8217026406075862, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 17858 + }, + { + "epoch": 0.17859, + "grad_norm": 0.7526103482689931, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 17859 + }, + { + "epoch": 0.1786, + "grad_norm": 0.8674666582553064, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 17860 + }, + { + "epoch": 0.17861, + "grad_norm": 1.0235191697043038, + "learning_rate": 0.003, + "loss": 4.1045, + "step": 17861 + }, + { + "epoch": 0.17862, + "grad_norm": 1.113427436262814, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 17862 + }, + { + "epoch": 0.17863, + "grad_norm": 0.9294745221925278, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 17863 + }, + { + "epoch": 0.17864, + "grad_norm": 0.8695417549804368, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 17864 + }, + { + "epoch": 0.17865, + "grad_norm": 0.8222361364773434, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 17865 + }, + { + "epoch": 0.17866, + "grad_norm": 0.7582328748735376, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 17866 + }, + { + "epoch": 0.17867, + "grad_norm": 0.8418979652681704, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 17867 + }, + { + "epoch": 0.17868, + "grad_norm": 0.823726393265443, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 17868 + }, + { + "epoch": 0.17869, + "grad_norm": 0.8347670839103752, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 17869 + }, + { + "epoch": 0.1787, + "grad_norm": 0.8844912797792485, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 17870 + }, + { + "epoch": 0.17871, + "grad_norm": 1.0457242349054428, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 17871 + }, + { + "epoch": 0.17872, + "grad_norm": 0.9981604675465449, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 17872 + }, + { + "epoch": 0.17873, + "grad_norm": 0.9000260217232073, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 17873 + }, + { + "epoch": 0.17874, + "grad_norm": 0.8210044898990569, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 17874 + }, + { + "epoch": 0.17875, + "grad_norm": 0.7098912171295579, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 17875 + }, + { + "epoch": 0.17876, + "grad_norm": 0.6314395218838892, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 17876 + }, + { + "epoch": 0.17877, + "grad_norm": 0.7458507738392257, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 17877 + }, + { + "epoch": 0.17878, + "grad_norm": 0.845141126710546, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 17878 + }, + { + "epoch": 0.17879, + "grad_norm": 1.0997008575780136, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 17879 + }, + { + "epoch": 0.1788, + "grad_norm": 1.0677830801770296, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 17880 + }, + { + "epoch": 0.17881, + "grad_norm": 0.8967100099923909, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 17881 + }, + { + "epoch": 0.17882, + "grad_norm": 0.8116601445634104, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 17882 + }, + { + "epoch": 0.17883, + "grad_norm": 0.7008666203406305, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 17883 + }, + { + "epoch": 0.17884, + "grad_norm": 0.7504508937434677, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 17884 + }, + { + "epoch": 0.17885, + "grad_norm": 0.7932994676003893, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 17885 + }, + { + "epoch": 0.17886, + "grad_norm": 0.9830876472836074, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 17886 + }, + { + "epoch": 0.17887, + "grad_norm": 1.1207446961132943, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 17887 + }, + { + "epoch": 0.17888, + "grad_norm": 0.8747089539131329, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 17888 + }, + { + "epoch": 0.17889, + "grad_norm": 0.918664755130003, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 17889 + }, + { + "epoch": 0.1789, + "grad_norm": 0.9903103179316689, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 17890 + }, + { + "epoch": 0.17891, + "grad_norm": 0.9649525416433561, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 17891 + }, + { + "epoch": 0.17892, + "grad_norm": 0.9506342680304857, + "learning_rate": 0.003, + "loss": 4.1057, + "step": 17892 + }, + { + "epoch": 0.17893, + "grad_norm": 0.9434584248212771, + "learning_rate": 0.003, + "loss": 4.053, + "step": 17893 + }, + { + "epoch": 0.17894, + "grad_norm": 1.1081695893566654, + "learning_rate": 0.003, + "loss": 4.072, + "step": 17894 + }, + { + "epoch": 0.17895, + "grad_norm": 0.8594611751215389, + "learning_rate": 0.003, + "loss": 4.1016, + "step": 17895 + }, + { + "epoch": 0.17896, + "grad_norm": 0.870788515369203, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 17896 + }, + { + "epoch": 0.17897, + "grad_norm": 0.893168486716251, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 17897 + }, + { + "epoch": 0.17898, + "grad_norm": 0.8393792542325452, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 17898 + }, + { + "epoch": 0.17899, + "grad_norm": 0.8016239868093878, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 17899 + }, + { + "epoch": 0.179, + "grad_norm": 0.7366597111961479, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 17900 + }, + { + "epoch": 0.17901, + "grad_norm": 0.6606758482109569, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 17901 + }, + { + "epoch": 0.17902, + "grad_norm": 0.7029842079254771, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 17902 + }, + { + "epoch": 0.17903, + "grad_norm": 0.9665779376585047, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 17903 + }, + { + "epoch": 0.17904, + "grad_norm": 1.434524948923059, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 17904 + }, + { + "epoch": 0.17905, + "grad_norm": 0.6099934258825926, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 17905 + }, + { + "epoch": 0.17906, + "grad_norm": 0.7921307035709443, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 17906 + }, + { + "epoch": 0.17907, + "grad_norm": 0.8637347471785389, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 17907 + }, + { + "epoch": 0.17908, + "grad_norm": 0.7657210128835413, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 17908 + }, + { + "epoch": 0.17909, + "grad_norm": 0.7102705729311167, + "learning_rate": 0.003, + "loss": 4.083, + "step": 17909 + }, + { + "epoch": 0.1791, + "grad_norm": 0.7722022559628559, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 17910 + }, + { + "epoch": 0.17911, + "grad_norm": 0.7027911251903038, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 17911 + }, + { + "epoch": 0.17912, + "grad_norm": 0.7761335176024163, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 17912 + }, + { + "epoch": 0.17913, + "grad_norm": 0.9494240497237516, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 17913 + }, + { + "epoch": 0.17914, + "grad_norm": 1.1110165029892805, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 17914 + }, + { + "epoch": 0.17915, + "grad_norm": 0.9374310699867275, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 17915 + }, + { + "epoch": 0.17916, + "grad_norm": 0.8897764269231634, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17916 + }, + { + "epoch": 0.17917, + "grad_norm": 0.7992151892069709, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 17917 + }, + { + "epoch": 0.17918, + "grad_norm": 0.8797485344863202, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 17918 + }, + { + "epoch": 0.17919, + "grad_norm": 0.8672758974774529, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 17919 + }, + { + "epoch": 0.1792, + "grad_norm": 0.8062273152118529, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 17920 + }, + { + "epoch": 0.17921, + "grad_norm": 0.7614024663606663, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 17921 + }, + { + "epoch": 0.17922, + "grad_norm": 0.8741360617038456, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 17922 + }, + { + "epoch": 0.17923, + "grad_norm": 1.0141438743501523, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 17923 + }, + { + "epoch": 0.17924, + "grad_norm": 0.8478569345318532, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 17924 + }, + { + "epoch": 0.17925, + "grad_norm": 0.7520471566488796, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 17925 + }, + { + "epoch": 0.17926, + "grad_norm": 0.6978604676792622, + "learning_rate": 0.003, + "loss": 4.037, + "step": 17926 + }, + { + "epoch": 0.17927, + "grad_norm": 0.8497823407207831, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 17927 + }, + { + "epoch": 0.17928, + "grad_norm": 1.1737409490970645, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17928 + }, + { + "epoch": 0.17929, + "grad_norm": 0.9216083350139652, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 17929 + }, + { + "epoch": 0.1793, + "grad_norm": 0.9540698123430895, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 17930 + }, + { + "epoch": 0.17931, + "grad_norm": 1.180372591646462, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 17931 + }, + { + "epoch": 0.17932, + "grad_norm": 0.8802199886666183, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 17932 + }, + { + "epoch": 0.17933, + "grad_norm": 0.8734116782838467, + "learning_rate": 0.003, + "loss": 4.049, + "step": 17933 + }, + { + "epoch": 0.17934, + "grad_norm": 0.7855352141197343, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 17934 + }, + { + "epoch": 0.17935, + "grad_norm": 0.8975687543207646, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 17935 + }, + { + "epoch": 0.17936, + "grad_norm": 1.1101120089195442, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 17936 + }, + { + "epoch": 0.17937, + "grad_norm": 1.0529926461257948, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 17937 + }, + { + "epoch": 0.17938, + "grad_norm": 0.9307149829814615, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 17938 + }, + { + "epoch": 0.17939, + "grad_norm": 0.9991407278947849, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 17939 + }, + { + "epoch": 0.1794, + "grad_norm": 1.0413405119071344, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 17940 + }, + { + "epoch": 0.17941, + "grad_norm": 1.1522582155284176, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 17941 + }, + { + "epoch": 0.17942, + "grad_norm": 1.0635877004853507, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 17942 + }, + { + "epoch": 0.17943, + "grad_norm": 0.8479492847157323, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 17943 + }, + { + "epoch": 0.17944, + "grad_norm": 0.8325614379958157, + "learning_rate": 0.003, + "loss": 4.081, + "step": 17944 + }, + { + "epoch": 0.17945, + "grad_norm": 0.8936956994628485, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 17945 + }, + { + "epoch": 0.17946, + "grad_norm": 0.8655358709774501, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 17946 + }, + { + "epoch": 0.17947, + "grad_norm": 0.8450066761147643, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 17947 + }, + { + "epoch": 0.17948, + "grad_norm": 0.8266749042578516, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 17948 + }, + { + "epoch": 0.17949, + "grad_norm": 0.8730306356348632, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 17949 + }, + { + "epoch": 0.1795, + "grad_norm": 1.1400733758214807, + "learning_rate": 0.003, + "loss": 4.0945, + "step": 17950 + }, + { + "epoch": 0.17951, + "grad_norm": 1.1017829514672806, + "learning_rate": 0.003, + "loss": 4.063, + "step": 17951 + }, + { + "epoch": 0.17952, + "grad_norm": 0.9347560947573004, + "learning_rate": 0.003, + "loss": 4.04, + "step": 17952 + }, + { + "epoch": 0.17953, + "grad_norm": 0.9440296701440463, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 17953 + }, + { + "epoch": 0.17954, + "grad_norm": 0.8903371073408782, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 17954 + }, + { + "epoch": 0.17955, + "grad_norm": 0.8666115680666635, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 17955 + }, + { + "epoch": 0.17956, + "grad_norm": 0.8914096643592457, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 17956 + }, + { + "epoch": 0.17957, + "grad_norm": 1.0763191811784587, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 17957 + }, + { + "epoch": 0.17958, + "grad_norm": 0.8774000849886722, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 17958 + }, + { + "epoch": 0.17959, + "grad_norm": 0.8562373317838089, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 17959 + }, + { + "epoch": 0.1796, + "grad_norm": 0.8505421766506722, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 17960 + }, + { + "epoch": 0.17961, + "grad_norm": 0.7987535692134098, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 17961 + }, + { + "epoch": 0.17962, + "grad_norm": 0.8460412070139576, + "learning_rate": 0.003, + "loss": 4.043, + "step": 17962 + }, + { + "epoch": 0.17963, + "grad_norm": 0.9366779769009153, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 17963 + }, + { + "epoch": 0.17964, + "grad_norm": 1.1279418516043072, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 17964 + }, + { + "epoch": 0.17965, + "grad_norm": 0.8691996478669544, + "learning_rate": 0.003, + "loss": 4.048, + "step": 17965 + }, + { + "epoch": 0.17966, + "grad_norm": 0.7984813154248974, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 17966 + }, + { + "epoch": 0.17967, + "grad_norm": 0.784405067697677, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 17967 + }, + { + "epoch": 0.17968, + "grad_norm": 0.7624571907998321, + "learning_rate": 0.003, + "loss": 4.056, + "step": 17968 + }, + { + "epoch": 0.17969, + "grad_norm": 0.7934868979355254, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 17969 + }, + { + "epoch": 0.1797, + "grad_norm": 0.8584516317708869, + "learning_rate": 0.003, + "loss": 4.058, + "step": 17970 + }, + { + "epoch": 0.17971, + "grad_norm": 0.8440505039962745, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 17971 + }, + { + "epoch": 0.17972, + "grad_norm": 0.7911630625598691, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 17972 + }, + { + "epoch": 0.17973, + "grad_norm": 0.7811077834896014, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 17973 + }, + { + "epoch": 0.17974, + "grad_norm": 0.8446794202047522, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 17974 + }, + { + "epoch": 0.17975, + "grad_norm": 0.8254521903148109, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 17975 + }, + { + "epoch": 0.17976, + "grad_norm": 0.728235183395874, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 17976 + }, + { + "epoch": 0.17977, + "grad_norm": 0.7225241440192826, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 17977 + }, + { + "epoch": 0.17978, + "grad_norm": 0.7427535082841806, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17978 + }, + { + "epoch": 0.17979, + "grad_norm": 0.811799520487269, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 17979 + }, + { + "epoch": 0.1798, + "grad_norm": 0.922958204480545, + "learning_rate": 0.003, + "loss": 4.045, + "step": 17980 + }, + { + "epoch": 0.17981, + "grad_norm": 0.9743395951623662, + "learning_rate": 0.003, + "loss": 3.9885, + "step": 17981 + }, + { + "epoch": 0.17982, + "grad_norm": 1.373180609655959, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 17982 + }, + { + "epoch": 0.17983, + "grad_norm": 0.8168275487681272, + "learning_rate": 0.003, + "loss": 4.065, + "step": 17983 + }, + { + "epoch": 0.17984, + "grad_norm": 0.7080180162419921, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 17984 + }, + { + "epoch": 0.17985, + "grad_norm": 0.7940605541977659, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 17985 + }, + { + "epoch": 0.17986, + "grad_norm": 0.7254951582788487, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 17986 + }, + { + "epoch": 0.17987, + "grad_norm": 0.7448635415988921, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 17987 + }, + { + "epoch": 0.17988, + "grad_norm": 0.8047736846631103, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 17988 + }, + { + "epoch": 0.17989, + "grad_norm": 0.9574550527423038, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 17989 + }, + { + "epoch": 0.1799, + "grad_norm": 1.020786271331795, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 17990 + }, + { + "epoch": 0.17991, + "grad_norm": 0.9887784017834026, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 17991 + }, + { + "epoch": 0.17992, + "grad_norm": 0.9676698946785408, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 17992 + }, + { + "epoch": 0.17993, + "grad_norm": 0.8948717629191097, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 17993 + }, + { + "epoch": 0.17994, + "grad_norm": 0.7534672583024733, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 17994 + }, + { + "epoch": 0.17995, + "grad_norm": 0.788104936502091, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 17995 + }, + { + "epoch": 0.17996, + "grad_norm": 0.8128901529972176, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 17996 + }, + { + "epoch": 0.17997, + "grad_norm": 0.7882848770742896, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 17997 + }, + { + "epoch": 0.17998, + "grad_norm": 0.850474555041141, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 17998 + }, + { + "epoch": 0.17999, + "grad_norm": 0.9586233532033333, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 17999 + }, + { + "epoch": 0.18, + "grad_norm": 1.1400668480008858, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 18000 + }, + { + "epoch": 0.18001, + "grad_norm": 1.054919436998714, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 18001 + }, + { + "epoch": 0.18002, + "grad_norm": 1.0247121761574658, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 18002 + }, + { + "epoch": 0.18003, + "grad_norm": 1.054778871943613, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 18003 + }, + { + "epoch": 0.18004, + "grad_norm": 1.0080173050674024, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 18004 + }, + { + "epoch": 0.18005, + "grad_norm": 0.9872412135627638, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 18005 + }, + { + "epoch": 0.18006, + "grad_norm": 0.8924778948565202, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 18006 + }, + { + "epoch": 0.18007, + "grad_norm": 0.913301667115082, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 18007 + }, + { + "epoch": 0.18008, + "grad_norm": 1.0397859611221514, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 18008 + }, + { + "epoch": 0.18009, + "grad_norm": 1.1887474112560947, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 18009 + }, + { + "epoch": 0.1801, + "grad_norm": 0.8600276505610895, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 18010 + }, + { + "epoch": 0.18011, + "grad_norm": 0.8158139335514721, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 18011 + }, + { + "epoch": 0.18012, + "grad_norm": 0.8305582817254908, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 18012 + }, + { + "epoch": 0.18013, + "grad_norm": 0.8390759420132285, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 18013 + }, + { + "epoch": 0.18014, + "grad_norm": 0.8586288442501921, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 18014 + }, + { + "epoch": 0.18015, + "grad_norm": 0.8530469028643262, + "learning_rate": 0.003, + "loss": 4.1096, + "step": 18015 + }, + { + "epoch": 0.18016, + "grad_norm": 0.9308114254337448, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 18016 + }, + { + "epoch": 0.18017, + "grad_norm": 0.88784790681411, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 18017 + }, + { + "epoch": 0.18018, + "grad_norm": 0.8238910140539516, + "learning_rate": 0.003, + "loss": 4.059, + "step": 18018 + }, + { + "epoch": 0.18019, + "grad_norm": 0.9016513070880283, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 18019 + }, + { + "epoch": 0.1802, + "grad_norm": 1.0920983725629796, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 18020 + }, + { + "epoch": 0.18021, + "grad_norm": 0.8839509402377498, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 18021 + }, + { + "epoch": 0.18022, + "grad_norm": 0.8766498380550272, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 18022 + }, + { + "epoch": 0.18023, + "grad_norm": 0.8968238559356412, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 18023 + }, + { + "epoch": 0.18024, + "grad_norm": 0.9562949134087456, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 18024 + }, + { + "epoch": 0.18025, + "grad_norm": 1.0678622849040766, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 18025 + }, + { + "epoch": 0.18026, + "grad_norm": 0.9914654337613729, + "learning_rate": 0.003, + "loss": 4.0913, + "step": 18026 + }, + { + "epoch": 0.18027, + "grad_norm": 1.0105618268675878, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18027 + }, + { + "epoch": 0.18028, + "grad_norm": 0.8558677540865376, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 18028 + }, + { + "epoch": 0.18029, + "grad_norm": 0.7216040695209736, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 18029 + }, + { + "epoch": 0.1803, + "grad_norm": 0.7175008770053735, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 18030 + }, + { + "epoch": 0.18031, + "grad_norm": 0.823598678229924, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 18031 + }, + { + "epoch": 0.18032, + "grad_norm": 1.0033425238366533, + "learning_rate": 0.003, + "loss": 4.1353, + "step": 18032 + }, + { + "epoch": 0.18033, + "grad_norm": 1.1497851437371438, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 18033 + }, + { + "epoch": 0.18034, + "grad_norm": 1.0287070729647791, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18034 + }, + { + "epoch": 0.18035, + "grad_norm": 1.145467793290977, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18035 + }, + { + "epoch": 0.18036, + "grad_norm": 0.9283426059463389, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 18036 + }, + { + "epoch": 0.18037, + "grad_norm": 0.7992388200430705, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 18037 + }, + { + "epoch": 0.18038, + "grad_norm": 0.8191202045319761, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 18038 + }, + { + "epoch": 0.18039, + "grad_norm": 0.886227146354526, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 18039 + }, + { + "epoch": 0.1804, + "grad_norm": 0.8643583852276007, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18040 + }, + { + "epoch": 0.18041, + "grad_norm": 0.8838099635267671, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18041 + }, + { + "epoch": 0.18042, + "grad_norm": 0.9005069709981848, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 18042 + }, + { + "epoch": 0.18043, + "grad_norm": 0.9236549671345337, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 18043 + }, + { + "epoch": 0.18044, + "grad_norm": 0.9911425335560223, + "learning_rate": 0.003, + "loss": 4.1108, + "step": 18044 + }, + { + "epoch": 0.18045, + "grad_norm": 1.021211340428821, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 18045 + }, + { + "epoch": 0.18046, + "grad_norm": 1.0008010184678846, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18046 + }, + { + "epoch": 0.18047, + "grad_norm": 0.9706143140605511, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 18047 + }, + { + "epoch": 0.18048, + "grad_norm": 0.8050953450731502, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 18048 + }, + { + "epoch": 0.18049, + "grad_norm": 0.7168957116698095, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 18049 + }, + { + "epoch": 0.1805, + "grad_norm": 0.763730355417067, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 18050 + }, + { + "epoch": 0.18051, + "grad_norm": 0.8392387215658648, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 18051 + }, + { + "epoch": 0.18052, + "grad_norm": 0.8756320277128591, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 18052 + }, + { + "epoch": 0.18053, + "grad_norm": 0.9882093781293488, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18053 + }, + { + "epoch": 0.18054, + "grad_norm": 1.147024489035133, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 18054 + }, + { + "epoch": 0.18055, + "grad_norm": 0.9365598602714547, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 18055 + }, + { + "epoch": 0.18056, + "grad_norm": 1.0658319735069848, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 18056 + }, + { + "epoch": 0.18057, + "grad_norm": 1.2052768115217336, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 18057 + }, + { + "epoch": 0.18058, + "grad_norm": 0.8583027638126337, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18058 + }, + { + "epoch": 0.18059, + "grad_norm": 0.7610489523254532, + "learning_rate": 0.003, + "loss": 4.05, + "step": 18059 + }, + { + "epoch": 0.1806, + "grad_norm": 0.6985702685875976, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 18060 + }, + { + "epoch": 0.18061, + "grad_norm": 0.5905272923550221, + "learning_rate": 0.003, + "loss": 4.065, + "step": 18061 + }, + { + "epoch": 0.18062, + "grad_norm": 0.6693835892537288, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 18062 + }, + { + "epoch": 0.18063, + "grad_norm": 0.642860341607339, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 18063 + }, + { + "epoch": 0.18064, + "grad_norm": 0.644897143297268, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 18064 + }, + { + "epoch": 0.18065, + "grad_norm": 0.6998940543634489, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 18065 + }, + { + "epoch": 0.18066, + "grad_norm": 0.7876390615700182, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 18066 + }, + { + "epoch": 0.18067, + "grad_norm": 0.8507104078395324, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18067 + }, + { + "epoch": 0.18068, + "grad_norm": 0.7868838319668721, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 18068 + }, + { + "epoch": 0.18069, + "grad_norm": 0.9103956263232936, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 18069 + }, + { + "epoch": 0.1807, + "grad_norm": 1.216721313552926, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 18070 + }, + { + "epoch": 0.18071, + "grad_norm": 0.8443612196545752, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18071 + }, + { + "epoch": 0.18072, + "grad_norm": 0.7473558386864037, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 18072 + }, + { + "epoch": 0.18073, + "grad_norm": 0.7736004856991765, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 18073 + }, + { + "epoch": 0.18074, + "grad_norm": 0.7245788317208545, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 18074 + }, + { + "epoch": 0.18075, + "grad_norm": 0.77661643464328, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 18075 + }, + { + "epoch": 0.18076, + "grad_norm": 1.0284922953510736, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 18076 + }, + { + "epoch": 0.18077, + "grad_norm": 1.14881967732802, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 18077 + }, + { + "epoch": 0.18078, + "grad_norm": 0.9006986801785335, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 18078 + }, + { + "epoch": 0.18079, + "grad_norm": 0.9641920751676571, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18079 + }, + { + "epoch": 0.1808, + "grad_norm": 1.1083547323529184, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 18080 + }, + { + "epoch": 0.18081, + "grad_norm": 0.8394517315269832, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 18081 + }, + { + "epoch": 0.18082, + "grad_norm": 0.7896556419721917, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 18082 + }, + { + "epoch": 0.18083, + "grad_norm": 0.7437492272834548, + "learning_rate": 0.003, + "loss": 4.051, + "step": 18083 + }, + { + "epoch": 0.18084, + "grad_norm": 0.7350499511250812, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 18084 + }, + { + "epoch": 0.18085, + "grad_norm": 0.7586160703975404, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 18085 + }, + { + "epoch": 0.18086, + "grad_norm": 0.8605106520318582, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 18086 + }, + { + "epoch": 0.18087, + "grad_norm": 1.0523219277194693, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 18087 + }, + { + "epoch": 0.18088, + "grad_norm": 0.9540598632872929, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 18088 + }, + { + "epoch": 0.18089, + "grad_norm": 0.954635444988524, + "learning_rate": 0.003, + "loss": 4.037, + "step": 18089 + }, + { + "epoch": 0.1809, + "grad_norm": 1.0242917218797458, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 18090 + }, + { + "epoch": 0.18091, + "grad_norm": 1.038614160368774, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 18091 + }, + { + "epoch": 0.18092, + "grad_norm": 0.857783444613349, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 18092 + }, + { + "epoch": 0.18093, + "grad_norm": 0.7140591390077858, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 18093 + }, + { + "epoch": 0.18094, + "grad_norm": 0.7482695580063535, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 18094 + }, + { + "epoch": 0.18095, + "grad_norm": 0.7385178452807232, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 18095 + }, + { + "epoch": 0.18096, + "grad_norm": 0.8392621298958493, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 18096 + }, + { + "epoch": 0.18097, + "grad_norm": 0.8916954202110989, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 18097 + }, + { + "epoch": 0.18098, + "grad_norm": 1.0075924604629982, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 18098 + }, + { + "epoch": 0.18099, + "grad_norm": 0.9911745971497355, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 18099 + }, + { + "epoch": 0.181, + "grad_norm": 0.9567441446592772, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 18100 + }, + { + "epoch": 0.18101, + "grad_norm": 0.9617819746104196, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18101 + }, + { + "epoch": 0.18102, + "grad_norm": 1.010570390083101, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 18102 + }, + { + "epoch": 0.18103, + "grad_norm": 0.9373306937056151, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 18103 + }, + { + "epoch": 0.18104, + "grad_norm": 0.9270389789703667, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 18104 + }, + { + "epoch": 0.18105, + "grad_norm": 1.08425137493349, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 18105 + }, + { + "epoch": 0.18106, + "grad_norm": 1.195912316352079, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 18106 + }, + { + "epoch": 0.18107, + "grad_norm": 1.004636464485103, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18107 + }, + { + "epoch": 0.18108, + "grad_norm": 0.9940175610961242, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 18108 + }, + { + "epoch": 0.18109, + "grad_norm": 0.9927972978176638, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 18109 + }, + { + "epoch": 0.1811, + "grad_norm": 1.1097166512494476, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 18110 + }, + { + "epoch": 0.18111, + "grad_norm": 0.8813473769855856, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 18111 + }, + { + "epoch": 0.18112, + "grad_norm": 0.8964082186183482, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 18112 + }, + { + "epoch": 0.18113, + "grad_norm": 0.950768700593564, + "learning_rate": 0.003, + "loss": 4.106, + "step": 18113 + }, + { + "epoch": 0.18114, + "grad_norm": 0.9586280817607733, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 18114 + }, + { + "epoch": 0.18115, + "grad_norm": 0.9933290778334918, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 18115 + }, + { + "epoch": 0.18116, + "grad_norm": 1.0564887509087377, + "learning_rate": 0.003, + "loss": 4.1107, + "step": 18116 + }, + { + "epoch": 0.18117, + "grad_norm": 1.0569225404214955, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 18117 + }, + { + "epoch": 0.18118, + "grad_norm": 1.0252559312834126, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18118 + }, + { + "epoch": 0.18119, + "grad_norm": 0.9721326520180619, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18119 + }, + { + "epoch": 0.1812, + "grad_norm": 0.8857108000225521, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 18120 + }, + { + "epoch": 0.18121, + "grad_norm": 0.8609578637849452, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 18121 + }, + { + "epoch": 0.18122, + "grad_norm": 0.9725015666055711, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 18122 + }, + { + "epoch": 0.18123, + "grad_norm": 0.9505304188010881, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 18123 + }, + { + "epoch": 0.18124, + "grad_norm": 0.8048063134000035, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 18124 + }, + { + "epoch": 0.18125, + "grad_norm": 0.7864391093644963, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 18125 + }, + { + "epoch": 0.18126, + "grad_norm": 0.8107160198696282, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 18126 + }, + { + "epoch": 0.18127, + "grad_norm": 0.805285737641391, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 18127 + }, + { + "epoch": 0.18128, + "grad_norm": 0.8503046943944598, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 18128 + }, + { + "epoch": 0.18129, + "grad_norm": 0.9339308416554004, + "learning_rate": 0.003, + "loss": 4.046, + "step": 18129 + }, + { + "epoch": 0.1813, + "grad_norm": 1.0167343887423093, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18130 + }, + { + "epoch": 0.18131, + "grad_norm": 1.083885297834378, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 18131 + }, + { + "epoch": 0.18132, + "grad_norm": 0.7863781208744626, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18132 + }, + { + "epoch": 0.18133, + "grad_norm": 0.6305209627716251, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 18133 + }, + { + "epoch": 0.18134, + "grad_norm": 0.6383779621138967, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 18134 + }, + { + "epoch": 0.18135, + "grad_norm": 0.6245947534556049, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 18135 + }, + { + "epoch": 0.18136, + "grad_norm": 0.5333880341479986, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 18136 + }, + { + "epoch": 0.18137, + "grad_norm": 0.5480307472237741, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 18137 + }, + { + "epoch": 0.18138, + "grad_norm": 0.5766572580782144, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 18138 + }, + { + "epoch": 0.18139, + "grad_norm": 0.6721878089281043, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18139 + }, + { + "epoch": 0.1814, + "grad_norm": 0.8138944507771368, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 18140 + }, + { + "epoch": 0.18141, + "grad_norm": 0.9633570397696866, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 18141 + }, + { + "epoch": 0.18142, + "grad_norm": 1.0664658924456716, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 18142 + }, + { + "epoch": 0.18143, + "grad_norm": 1.1136573459458181, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 18143 + }, + { + "epoch": 0.18144, + "grad_norm": 0.8628506049356274, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18144 + }, + { + "epoch": 0.18145, + "grad_norm": 0.7642134080266269, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 18145 + }, + { + "epoch": 0.18146, + "grad_norm": 0.8484411142804493, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 18146 + }, + { + "epoch": 0.18147, + "grad_norm": 0.8215187819624383, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18147 + }, + { + "epoch": 0.18148, + "grad_norm": 0.7898606748433694, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 18148 + }, + { + "epoch": 0.18149, + "grad_norm": 0.9865450042214753, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 18149 + }, + { + "epoch": 0.1815, + "grad_norm": 1.2419376894105545, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 18150 + }, + { + "epoch": 0.18151, + "grad_norm": 0.9270812697724077, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 18151 + }, + { + "epoch": 0.18152, + "grad_norm": 0.9479380298289326, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 18152 + }, + { + "epoch": 0.18153, + "grad_norm": 1.0005480471613615, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 18153 + }, + { + "epoch": 0.18154, + "grad_norm": 1.016867069998641, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 18154 + }, + { + "epoch": 0.18155, + "grad_norm": 0.8489247722993403, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 18155 + }, + { + "epoch": 0.18156, + "grad_norm": 0.8186962274083154, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18156 + }, + { + "epoch": 0.18157, + "grad_norm": 0.7540143087358937, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 18157 + }, + { + "epoch": 0.18158, + "grad_norm": 0.7419636605484827, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 18158 + }, + { + "epoch": 0.18159, + "grad_norm": 0.802284595751655, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 18159 + }, + { + "epoch": 0.1816, + "grad_norm": 0.9385428012280234, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 18160 + }, + { + "epoch": 0.18161, + "grad_norm": 0.9717302322199527, + "learning_rate": 0.003, + "loss": 4.07, + "step": 18161 + }, + { + "epoch": 0.18162, + "grad_norm": 0.8754633514842903, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 18162 + }, + { + "epoch": 0.18163, + "grad_norm": 0.8396510209304644, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 18163 + }, + { + "epoch": 0.18164, + "grad_norm": 0.955555668213817, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18164 + }, + { + "epoch": 0.18165, + "grad_norm": 1.137697207615346, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 18165 + }, + { + "epoch": 0.18166, + "grad_norm": 1.0881620225161932, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 18166 + }, + { + "epoch": 0.18167, + "grad_norm": 0.9485067426620597, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 18167 + }, + { + "epoch": 0.18168, + "grad_norm": 0.9243580245057562, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 18168 + }, + { + "epoch": 0.18169, + "grad_norm": 1.0203908542125293, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 18169 + }, + { + "epoch": 0.1817, + "grad_norm": 1.09077537303057, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 18170 + }, + { + "epoch": 0.18171, + "grad_norm": 0.9325823985592244, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 18171 + }, + { + "epoch": 0.18172, + "grad_norm": 0.915691801723223, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 18172 + }, + { + "epoch": 0.18173, + "grad_norm": 1.0059008108083742, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 18173 + }, + { + "epoch": 0.18174, + "grad_norm": 1.0551749379975364, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 18174 + }, + { + "epoch": 0.18175, + "grad_norm": 0.8984459911393465, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 18175 + }, + { + "epoch": 0.18176, + "grad_norm": 0.7819216793924464, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 18176 + }, + { + "epoch": 0.18177, + "grad_norm": 0.7210490214467581, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 18177 + }, + { + "epoch": 0.18178, + "grad_norm": 0.7012450839347708, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 18178 + }, + { + "epoch": 0.18179, + "grad_norm": 0.7692705758043559, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 18179 + }, + { + "epoch": 0.1818, + "grad_norm": 0.7210100101828922, + "learning_rate": 0.003, + "loss": 4.06, + "step": 18180 + }, + { + "epoch": 0.18181, + "grad_norm": 0.7500848601154694, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 18181 + }, + { + "epoch": 0.18182, + "grad_norm": 0.7321087967261691, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 18182 + }, + { + "epoch": 0.18183, + "grad_norm": 0.8023509999946052, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 18183 + }, + { + "epoch": 0.18184, + "grad_norm": 1.0483452254588141, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 18184 + }, + { + "epoch": 0.18185, + "grad_norm": 1.1355790294194081, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 18185 + }, + { + "epoch": 0.18186, + "grad_norm": 0.6902671569504694, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18186 + }, + { + "epoch": 0.18187, + "grad_norm": 0.6531811095563804, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 18187 + }, + { + "epoch": 0.18188, + "grad_norm": 0.7234975208818184, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 18188 + }, + { + "epoch": 0.18189, + "grad_norm": 0.7301938424591938, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 18189 + }, + { + "epoch": 0.1819, + "grad_norm": 0.7773847873457516, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 18190 + }, + { + "epoch": 0.18191, + "grad_norm": 0.6645527794024358, + "learning_rate": 0.003, + "loss": 4.049, + "step": 18191 + }, + { + "epoch": 0.18192, + "grad_norm": 0.5967643259583706, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 18192 + }, + { + "epoch": 0.18193, + "grad_norm": 0.6782147612719742, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 18193 + }, + { + "epoch": 0.18194, + "grad_norm": 0.7955734597934974, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 18194 + }, + { + "epoch": 0.18195, + "grad_norm": 0.9581035879244924, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 18195 + }, + { + "epoch": 0.18196, + "grad_norm": 0.9299283102512945, + "learning_rate": 0.003, + "loss": 4.048, + "step": 18196 + }, + { + "epoch": 0.18197, + "grad_norm": 0.9942387648297777, + "learning_rate": 0.003, + "loss": 4.058, + "step": 18197 + }, + { + "epoch": 0.18198, + "grad_norm": 1.354846241094494, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 18198 + }, + { + "epoch": 0.18199, + "grad_norm": 0.8961845114651893, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 18199 + }, + { + "epoch": 0.182, + "grad_norm": 0.9161895535538193, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 18200 + }, + { + "epoch": 0.18201, + "grad_norm": 0.9347028076466997, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 18201 + }, + { + "epoch": 0.18202, + "grad_norm": 1.1339752903572067, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 18202 + }, + { + "epoch": 0.18203, + "grad_norm": 0.9935774237436475, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 18203 + }, + { + "epoch": 0.18204, + "grad_norm": 1.119459686939325, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 18204 + }, + { + "epoch": 0.18205, + "grad_norm": 0.9604229229376183, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 18205 + }, + { + "epoch": 0.18206, + "grad_norm": 1.0787143652624855, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 18206 + }, + { + "epoch": 0.18207, + "grad_norm": 0.9989436306033681, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 18207 + }, + { + "epoch": 0.18208, + "grad_norm": 0.9745983499062763, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 18208 + }, + { + "epoch": 0.18209, + "grad_norm": 0.8481239034032982, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 18209 + }, + { + "epoch": 0.1821, + "grad_norm": 0.7248993858698688, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 18210 + }, + { + "epoch": 0.18211, + "grad_norm": 0.6908269403481287, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18211 + }, + { + "epoch": 0.18212, + "grad_norm": 0.6959265942355016, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 18212 + }, + { + "epoch": 0.18213, + "grad_norm": 0.6945147820887164, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 18213 + }, + { + "epoch": 0.18214, + "grad_norm": 0.6266237949266471, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 18214 + }, + { + "epoch": 0.18215, + "grad_norm": 0.6170896985720568, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 18215 + }, + { + "epoch": 0.18216, + "grad_norm": 0.6368359625575057, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 18216 + }, + { + "epoch": 0.18217, + "grad_norm": 0.6702022147134921, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 18217 + }, + { + "epoch": 0.18218, + "grad_norm": 0.8823613952334576, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 18218 + }, + { + "epoch": 0.18219, + "grad_norm": 1.4070313072252556, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 18219 + }, + { + "epoch": 0.1822, + "grad_norm": 0.7552854162567318, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 18220 + }, + { + "epoch": 0.18221, + "grad_norm": 0.6885545263331007, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 18221 + }, + { + "epoch": 0.18222, + "grad_norm": 0.6904079633572755, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 18222 + }, + { + "epoch": 0.18223, + "grad_norm": 0.7200929084130898, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 18223 + }, + { + "epoch": 0.18224, + "grad_norm": 0.7455347241748624, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 18224 + }, + { + "epoch": 0.18225, + "grad_norm": 0.8569416229175616, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 18225 + }, + { + "epoch": 0.18226, + "grad_norm": 0.9831236584861761, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 18226 + }, + { + "epoch": 0.18227, + "grad_norm": 1.2538086527772438, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 18227 + }, + { + "epoch": 0.18228, + "grad_norm": 0.8882797801107741, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18228 + }, + { + "epoch": 0.18229, + "grad_norm": 0.8519735158492483, + "learning_rate": 0.003, + "loss": 4.042, + "step": 18229 + }, + { + "epoch": 0.1823, + "grad_norm": 0.8445283227191879, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18230 + }, + { + "epoch": 0.18231, + "grad_norm": 0.9797837087003988, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 18231 + }, + { + "epoch": 0.18232, + "grad_norm": 1.1221256809356586, + "learning_rate": 0.003, + "loss": 4.069, + "step": 18232 + }, + { + "epoch": 0.18233, + "grad_norm": 0.8903268120362483, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 18233 + }, + { + "epoch": 0.18234, + "grad_norm": 1.0158494552810846, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 18234 + }, + { + "epoch": 0.18235, + "grad_norm": 1.1066567935006122, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 18235 + }, + { + "epoch": 0.18236, + "grad_norm": 0.6874891280868245, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 18236 + }, + { + "epoch": 0.18237, + "grad_norm": 0.6705247786577884, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18237 + }, + { + "epoch": 0.18238, + "grad_norm": 0.7732181362741339, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18238 + }, + { + "epoch": 0.18239, + "grad_norm": 0.7754611545539052, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 18239 + }, + { + "epoch": 0.1824, + "grad_norm": 0.9344975763258625, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 18240 + }, + { + "epoch": 0.18241, + "grad_norm": 1.2832078448609356, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 18241 + }, + { + "epoch": 0.18242, + "grad_norm": 0.9556987296668182, + "learning_rate": 0.003, + "loss": 4.023, + "step": 18242 + }, + { + "epoch": 0.18243, + "grad_norm": 0.9043533022376008, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 18243 + }, + { + "epoch": 0.18244, + "grad_norm": 0.9146811059154532, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18244 + }, + { + "epoch": 0.18245, + "grad_norm": 1.1334886768689365, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 18245 + }, + { + "epoch": 0.18246, + "grad_norm": 0.9872644352464646, + "learning_rate": 0.003, + "loss": 4.053, + "step": 18246 + }, + { + "epoch": 0.18247, + "grad_norm": 0.9746216042009118, + "learning_rate": 0.003, + "loss": 4.065, + "step": 18247 + }, + { + "epoch": 0.18248, + "grad_norm": 0.873982634022835, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 18248 + }, + { + "epoch": 0.18249, + "grad_norm": 0.7966570786614937, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 18249 + }, + { + "epoch": 0.1825, + "grad_norm": 0.8541605241880963, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 18250 + }, + { + "epoch": 0.18251, + "grad_norm": 0.8219705222647181, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18251 + }, + { + "epoch": 0.18252, + "grad_norm": 0.8552633058511083, + "learning_rate": 0.003, + "loss": 4.0992, + "step": 18252 + }, + { + "epoch": 0.18253, + "grad_norm": 0.8627514450069399, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18253 + }, + { + "epoch": 0.18254, + "grad_norm": 0.8435662291486822, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 18254 + }, + { + "epoch": 0.18255, + "grad_norm": 0.8874599992740497, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 18255 + }, + { + "epoch": 0.18256, + "grad_norm": 1.124151888645897, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 18256 + }, + { + "epoch": 0.18257, + "grad_norm": 0.9881054341075414, + "learning_rate": 0.003, + "loss": 4.069, + "step": 18257 + }, + { + "epoch": 0.18258, + "grad_norm": 0.9111781193843494, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 18258 + }, + { + "epoch": 0.18259, + "grad_norm": 0.9461466892318765, + "learning_rate": 0.003, + "loss": 4.065, + "step": 18259 + }, + { + "epoch": 0.1826, + "grad_norm": 0.9910200896152757, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 18260 + }, + { + "epoch": 0.18261, + "grad_norm": 1.0945011675228884, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 18261 + }, + { + "epoch": 0.18262, + "grad_norm": 0.9424357588602961, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 18262 + }, + { + "epoch": 0.18263, + "grad_norm": 0.8973101958045837, + "learning_rate": 0.003, + "loss": 4.069, + "step": 18263 + }, + { + "epoch": 0.18264, + "grad_norm": 0.8859311667384303, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 18264 + }, + { + "epoch": 0.18265, + "grad_norm": 0.8538222882398226, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18265 + }, + { + "epoch": 0.18266, + "grad_norm": 0.7469743685221801, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 18266 + }, + { + "epoch": 0.18267, + "grad_norm": 0.7634378210328012, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 18267 + }, + { + "epoch": 0.18268, + "grad_norm": 0.8821616565620666, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18268 + }, + { + "epoch": 0.18269, + "grad_norm": 0.9031687267940208, + "learning_rate": 0.003, + "loss": 4.074, + "step": 18269 + }, + { + "epoch": 0.1827, + "grad_norm": 0.8377298491836317, + "learning_rate": 0.003, + "loss": 4.054, + "step": 18270 + }, + { + "epoch": 0.18271, + "grad_norm": 0.9628602826310017, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 18271 + }, + { + "epoch": 0.18272, + "grad_norm": 0.848731983485719, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 18272 + }, + { + "epoch": 0.18273, + "grad_norm": 0.78200864179445, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18273 + }, + { + "epoch": 0.18274, + "grad_norm": 0.8714591997469179, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 18274 + }, + { + "epoch": 0.18275, + "grad_norm": 0.9714147257975454, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 18275 + }, + { + "epoch": 0.18276, + "grad_norm": 1.1434623688571923, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 18276 + }, + { + "epoch": 0.18277, + "grad_norm": 1.0768484423778402, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 18277 + }, + { + "epoch": 0.18278, + "grad_norm": 1.023875330268496, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 18278 + }, + { + "epoch": 0.18279, + "grad_norm": 1.0239789661246292, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 18279 + }, + { + "epoch": 0.1828, + "grad_norm": 0.9856963577577317, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 18280 + }, + { + "epoch": 0.18281, + "grad_norm": 0.8789880468350629, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 18281 + }, + { + "epoch": 0.18282, + "grad_norm": 0.8511356093129471, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 18282 + }, + { + "epoch": 0.18283, + "grad_norm": 0.920479915384502, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 18283 + }, + { + "epoch": 0.18284, + "grad_norm": 0.8594703754993123, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 18284 + }, + { + "epoch": 0.18285, + "grad_norm": 0.7253504130445864, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 18285 + }, + { + "epoch": 0.18286, + "grad_norm": 0.8583690051592623, + "learning_rate": 0.003, + "loss": 4.062, + "step": 18286 + }, + { + "epoch": 0.18287, + "grad_norm": 0.7911440901548231, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 18287 + }, + { + "epoch": 0.18288, + "grad_norm": 0.776923762306454, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 18288 + }, + { + "epoch": 0.18289, + "grad_norm": 0.7842037109411842, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 18289 + }, + { + "epoch": 0.1829, + "grad_norm": 0.7133961788360071, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 18290 + }, + { + "epoch": 0.18291, + "grad_norm": 0.8516798753726228, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 18291 + }, + { + "epoch": 0.18292, + "grad_norm": 0.996869566368745, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 18292 + }, + { + "epoch": 0.18293, + "grad_norm": 1.2170779467026525, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 18293 + }, + { + "epoch": 0.18294, + "grad_norm": 0.9956472849566619, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18294 + }, + { + "epoch": 0.18295, + "grad_norm": 1.0407446679918768, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18295 + }, + { + "epoch": 0.18296, + "grad_norm": 1.0606650933847244, + "learning_rate": 0.003, + "loss": 4.0882, + "step": 18296 + }, + { + "epoch": 0.18297, + "grad_norm": 0.8768720508475318, + "learning_rate": 0.003, + "loss": 4.081, + "step": 18297 + }, + { + "epoch": 0.18298, + "grad_norm": 0.866777662801284, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 18298 + }, + { + "epoch": 0.18299, + "grad_norm": 0.9263658840199724, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18299 + }, + { + "epoch": 0.183, + "grad_norm": 1.0691709281154491, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 18300 + }, + { + "epoch": 0.18301, + "grad_norm": 1.2485516906991745, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 18301 + }, + { + "epoch": 0.18302, + "grad_norm": 0.9049009757806349, + "learning_rate": 0.003, + "loss": 4.0925, + "step": 18302 + }, + { + "epoch": 0.18303, + "grad_norm": 0.803290980741296, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 18303 + }, + { + "epoch": 0.18304, + "grad_norm": 0.8004407740269954, + "learning_rate": 0.003, + "loss": 4.0931, + "step": 18304 + }, + { + "epoch": 0.18305, + "grad_norm": 0.8689812640157072, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 18305 + }, + { + "epoch": 0.18306, + "grad_norm": 0.8692417606954554, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 18306 + }, + { + "epoch": 0.18307, + "grad_norm": 0.9789031146923883, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 18307 + }, + { + "epoch": 0.18308, + "grad_norm": 1.1844961302853967, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18308 + }, + { + "epoch": 0.18309, + "grad_norm": 0.9856975048478858, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 18309 + }, + { + "epoch": 0.1831, + "grad_norm": 0.9390258538029391, + "learning_rate": 0.003, + "loss": 4.071, + "step": 18310 + }, + { + "epoch": 0.18311, + "grad_norm": 0.8731181926448102, + "learning_rate": 0.003, + "loss": 4.053, + "step": 18311 + }, + { + "epoch": 0.18312, + "grad_norm": 0.8153171222244627, + "learning_rate": 0.003, + "loss": 4.08, + "step": 18312 + }, + { + "epoch": 0.18313, + "grad_norm": 0.7563976129452407, + "learning_rate": 0.003, + "loss": 3.988, + "step": 18313 + }, + { + "epoch": 0.18314, + "grad_norm": 0.7758783584421425, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18314 + }, + { + "epoch": 0.18315, + "grad_norm": 0.6732592622665557, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 18315 + }, + { + "epoch": 0.18316, + "grad_norm": 0.7433584654616154, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 18316 + }, + { + "epoch": 0.18317, + "grad_norm": 0.8370717529734161, + "learning_rate": 0.003, + "loss": 4.07, + "step": 18317 + }, + { + "epoch": 0.18318, + "grad_norm": 0.9934388495146806, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 18318 + }, + { + "epoch": 0.18319, + "grad_norm": 1.238996126516156, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 18319 + }, + { + "epoch": 0.1832, + "grad_norm": 0.6814958351543677, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 18320 + }, + { + "epoch": 0.18321, + "grad_norm": 0.6738831987359615, + "learning_rate": 0.003, + "loss": 4.116, + "step": 18321 + }, + { + "epoch": 0.18322, + "grad_norm": 0.7257307701617576, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 18322 + }, + { + "epoch": 0.18323, + "grad_norm": 0.7858957715058614, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 18323 + }, + { + "epoch": 0.18324, + "grad_norm": 0.90781181482303, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 18324 + }, + { + "epoch": 0.18325, + "grad_norm": 0.9381424590321324, + "learning_rate": 0.003, + "loss": 4.053, + "step": 18325 + }, + { + "epoch": 0.18326, + "grad_norm": 1.0111861670278783, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 18326 + }, + { + "epoch": 0.18327, + "grad_norm": 1.0206278346312894, + "learning_rate": 0.003, + "loss": 4.057, + "step": 18327 + }, + { + "epoch": 0.18328, + "grad_norm": 0.8937175824766302, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 18328 + }, + { + "epoch": 0.18329, + "grad_norm": 0.7956703126603709, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 18329 + }, + { + "epoch": 0.1833, + "grad_norm": 0.6609151533793772, + "learning_rate": 0.003, + "loss": 4.089, + "step": 18330 + }, + { + "epoch": 0.18331, + "grad_norm": 0.8023424956614408, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 18331 + }, + { + "epoch": 0.18332, + "grad_norm": 0.9444500003668816, + "learning_rate": 0.003, + "loss": 4.045, + "step": 18332 + }, + { + "epoch": 0.18333, + "grad_norm": 1.1232820292954713, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 18333 + }, + { + "epoch": 0.18334, + "grad_norm": 0.925771832016891, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 18334 + }, + { + "epoch": 0.18335, + "grad_norm": 0.90350674124472, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 18335 + }, + { + "epoch": 0.18336, + "grad_norm": 0.8962820985223595, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 18336 + }, + { + "epoch": 0.18337, + "grad_norm": 0.8742216244400192, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 18337 + }, + { + "epoch": 0.18338, + "grad_norm": 0.9489693468394845, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 18338 + }, + { + "epoch": 0.18339, + "grad_norm": 1.1047154507497938, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 18339 + }, + { + "epoch": 0.1834, + "grad_norm": 0.9738272548357211, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 18340 + }, + { + "epoch": 0.18341, + "grad_norm": 0.9289877084843211, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 18341 + }, + { + "epoch": 0.18342, + "grad_norm": 1.0034615603372803, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 18342 + }, + { + "epoch": 0.18343, + "grad_norm": 1.0869913939659064, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 18343 + }, + { + "epoch": 0.18344, + "grad_norm": 1.0326968476863894, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 18344 + }, + { + "epoch": 0.18345, + "grad_norm": 0.9365309959175494, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 18345 + }, + { + "epoch": 0.18346, + "grad_norm": 0.8441916266736671, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18346 + }, + { + "epoch": 0.18347, + "grad_norm": 0.8448971241023325, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 18347 + }, + { + "epoch": 0.18348, + "grad_norm": 0.844413538370678, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 18348 + }, + { + "epoch": 0.18349, + "grad_norm": 1.180133984849724, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 18349 + }, + { + "epoch": 0.1835, + "grad_norm": 0.9807553149104413, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 18350 + }, + { + "epoch": 0.18351, + "grad_norm": 0.9538475469940382, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 18351 + }, + { + "epoch": 0.18352, + "grad_norm": 0.920438568997827, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 18352 + }, + { + "epoch": 0.18353, + "grad_norm": 0.9507661433634037, + "learning_rate": 0.003, + "loss": 4.071, + "step": 18353 + }, + { + "epoch": 0.18354, + "grad_norm": 0.9419113362763344, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 18354 + }, + { + "epoch": 0.18355, + "grad_norm": 0.9079770424794629, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 18355 + }, + { + "epoch": 0.18356, + "grad_norm": 0.7753937404092256, + "learning_rate": 0.003, + "loss": 4.041, + "step": 18356 + }, + { + "epoch": 0.18357, + "grad_norm": 0.8413296240917386, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 18357 + }, + { + "epoch": 0.18358, + "grad_norm": 0.9334011147106239, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18358 + }, + { + "epoch": 0.18359, + "grad_norm": 1.0598753744361773, + "learning_rate": 0.003, + "loss": 4.1011, + "step": 18359 + }, + { + "epoch": 0.1836, + "grad_norm": 0.8700862978204383, + "learning_rate": 0.003, + "loss": 4.062, + "step": 18360 + }, + { + "epoch": 0.18361, + "grad_norm": 0.7775217484779283, + "learning_rate": 0.003, + "loss": 4.033, + "step": 18361 + }, + { + "epoch": 0.18362, + "grad_norm": 0.8475541836531145, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 18362 + }, + { + "epoch": 0.18363, + "grad_norm": 1.002621330099506, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 18363 + }, + { + "epoch": 0.18364, + "grad_norm": 1.0121652612178136, + "learning_rate": 0.003, + "loss": 4.1025, + "step": 18364 + }, + { + "epoch": 0.18365, + "grad_norm": 0.896934730668961, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 18365 + }, + { + "epoch": 0.18366, + "grad_norm": 0.700411786910494, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 18366 + }, + { + "epoch": 0.18367, + "grad_norm": 0.7200053737541005, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 18367 + }, + { + "epoch": 0.18368, + "grad_norm": 0.7379024769412605, + "learning_rate": 0.003, + "loss": 4.037, + "step": 18368 + }, + { + "epoch": 0.18369, + "grad_norm": 0.7369079470654286, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 18369 + }, + { + "epoch": 0.1837, + "grad_norm": 0.675034788221465, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 18370 + }, + { + "epoch": 0.18371, + "grad_norm": 0.6761825233019262, + "learning_rate": 0.003, + "loss": 4.028, + "step": 18371 + }, + { + "epoch": 0.18372, + "grad_norm": 0.73695990545588, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 18372 + }, + { + "epoch": 0.18373, + "grad_norm": 0.8779304698659591, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 18373 + }, + { + "epoch": 0.18374, + "grad_norm": 0.9574820279518487, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 18374 + }, + { + "epoch": 0.18375, + "grad_norm": 1.0082288952455414, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 18375 + }, + { + "epoch": 0.18376, + "grad_norm": 1.0789011139422715, + "learning_rate": 0.003, + "loss": 4.079, + "step": 18376 + }, + { + "epoch": 0.18377, + "grad_norm": 0.8889600977719888, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 18377 + }, + { + "epoch": 0.18378, + "grad_norm": 0.7369730004655919, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 18378 + }, + { + "epoch": 0.18379, + "grad_norm": 0.7006050753014715, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 18379 + }, + { + "epoch": 0.1838, + "grad_norm": 0.7135330918358853, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 18380 + }, + { + "epoch": 0.18381, + "grad_norm": 0.7277103801109799, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 18381 + }, + { + "epoch": 0.18382, + "grad_norm": 0.7512859891377555, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 18382 + }, + { + "epoch": 0.18383, + "grad_norm": 0.7100037046870975, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 18383 + }, + { + "epoch": 0.18384, + "grad_norm": 0.7845754390711044, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 18384 + }, + { + "epoch": 0.18385, + "grad_norm": 0.8088905780895999, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18385 + }, + { + "epoch": 0.18386, + "grad_norm": 0.9519916077831818, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 18386 + }, + { + "epoch": 0.18387, + "grad_norm": 1.0850979765636848, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 18387 + }, + { + "epoch": 0.18388, + "grad_norm": 1.018253926346212, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 18388 + }, + { + "epoch": 0.18389, + "grad_norm": 0.9685745457080133, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18389 + }, + { + "epoch": 0.1839, + "grad_norm": 0.9997793727571137, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 18390 + }, + { + "epoch": 0.18391, + "grad_norm": 1.1096756427692092, + "learning_rate": 0.003, + "loss": 4.05, + "step": 18391 + }, + { + "epoch": 0.18392, + "grad_norm": 1.0053052369757307, + "learning_rate": 0.003, + "loss": 4.035, + "step": 18392 + }, + { + "epoch": 0.18393, + "grad_norm": 1.0907581583031685, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 18393 + }, + { + "epoch": 0.18394, + "grad_norm": 0.9485181197151356, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 18394 + }, + { + "epoch": 0.18395, + "grad_norm": 1.0125317780175331, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 18395 + }, + { + "epoch": 0.18396, + "grad_norm": 1.0589857597585408, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 18396 + }, + { + "epoch": 0.18397, + "grad_norm": 0.9428369261269266, + "learning_rate": 0.003, + "loss": 4.089, + "step": 18397 + }, + { + "epoch": 0.18398, + "grad_norm": 0.9058252684157386, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 18398 + }, + { + "epoch": 0.18399, + "grad_norm": 0.9004445411384819, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18399 + }, + { + "epoch": 0.184, + "grad_norm": 0.8888019074005667, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18400 + }, + { + "epoch": 0.18401, + "grad_norm": 0.9103513397753971, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 18401 + }, + { + "epoch": 0.18402, + "grad_norm": 1.0078726060095158, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 18402 + }, + { + "epoch": 0.18403, + "grad_norm": 1.2079609013253374, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 18403 + }, + { + "epoch": 0.18404, + "grad_norm": 1.0349328044208834, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 18404 + }, + { + "epoch": 0.18405, + "grad_norm": 1.0252836146503492, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 18405 + }, + { + "epoch": 0.18406, + "grad_norm": 1.0518917905917498, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 18406 + }, + { + "epoch": 0.18407, + "grad_norm": 0.9447576952333357, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 18407 + }, + { + "epoch": 0.18408, + "grad_norm": 0.950840379862803, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 18408 + }, + { + "epoch": 0.18409, + "grad_norm": 0.9081149147449998, + "learning_rate": 0.003, + "loss": 4.084, + "step": 18409 + }, + { + "epoch": 0.1841, + "grad_norm": 0.946756086650973, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 18410 + }, + { + "epoch": 0.18411, + "grad_norm": 1.0261992049981374, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 18411 + }, + { + "epoch": 0.18412, + "grad_norm": 1.0374419767202754, + "learning_rate": 0.003, + "loss": 4.1046, + "step": 18412 + }, + { + "epoch": 0.18413, + "grad_norm": 0.938850828370222, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 18413 + }, + { + "epoch": 0.18414, + "grad_norm": 0.9428837509698571, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 18414 + }, + { + "epoch": 0.18415, + "grad_norm": 1.016081854555453, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 18415 + }, + { + "epoch": 0.18416, + "grad_norm": 0.932861505724795, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 18416 + }, + { + "epoch": 0.18417, + "grad_norm": 0.7944634969291409, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 18417 + }, + { + "epoch": 0.18418, + "grad_norm": 0.7436733513208756, + "learning_rate": 0.003, + "loss": 4.042, + "step": 18418 + }, + { + "epoch": 0.18419, + "grad_norm": 0.7619970817084877, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 18419 + }, + { + "epoch": 0.1842, + "grad_norm": 0.7726319078385481, + "learning_rate": 0.003, + "loss": 4.085, + "step": 18420 + }, + { + "epoch": 0.18421, + "grad_norm": 0.8303853659853697, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 18421 + }, + { + "epoch": 0.18422, + "grad_norm": 0.9984712080761585, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 18422 + }, + { + "epoch": 0.18423, + "grad_norm": 1.2033989132315623, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 18423 + }, + { + "epoch": 0.18424, + "grad_norm": 0.9742790495383061, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 18424 + }, + { + "epoch": 0.18425, + "grad_norm": 0.927499385772444, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 18425 + }, + { + "epoch": 0.18426, + "grad_norm": 0.8717863685698273, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 18426 + }, + { + "epoch": 0.18427, + "grad_norm": 0.889216761621203, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 18427 + }, + { + "epoch": 0.18428, + "grad_norm": 0.9016093072479948, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 18428 + }, + { + "epoch": 0.18429, + "grad_norm": 0.8894854111643814, + "learning_rate": 0.003, + "loss": 4.077, + "step": 18429 + }, + { + "epoch": 0.1843, + "grad_norm": 0.7999202321666762, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 18430 + }, + { + "epoch": 0.18431, + "grad_norm": 0.8938771279461828, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18431 + }, + { + "epoch": 0.18432, + "grad_norm": 0.8099028692557037, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18432 + }, + { + "epoch": 0.18433, + "grad_norm": 0.9289813584587582, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 18433 + }, + { + "epoch": 0.18434, + "grad_norm": 1.0143284757428352, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18434 + }, + { + "epoch": 0.18435, + "grad_norm": 1.121348756495198, + "learning_rate": 0.003, + "loss": 4.071, + "step": 18435 + }, + { + "epoch": 0.18436, + "grad_norm": 0.7415758261715508, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18436 + }, + { + "epoch": 0.18437, + "grad_norm": 0.5183029476904694, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 18437 + }, + { + "epoch": 0.18438, + "grad_norm": 0.6287001273917971, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 18438 + }, + { + "epoch": 0.18439, + "grad_norm": 0.6527913564503991, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 18439 + }, + { + "epoch": 0.1844, + "grad_norm": 0.6537165185082999, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 18440 + }, + { + "epoch": 0.18441, + "grad_norm": 0.6109688303670382, + "learning_rate": 0.003, + "loss": 4.043, + "step": 18441 + }, + { + "epoch": 0.18442, + "grad_norm": 0.5807051392264547, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 18442 + }, + { + "epoch": 0.18443, + "grad_norm": 0.51909647450184, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18443 + }, + { + "epoch": 0.18444, + "grad_norm": 0.6161715819722443, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18444 + }, + { + "epoch": 0.18445, + "grad_norm": 0.686010218463515, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 18445 + }, + { + "epoch": 0.18446, + "grad_norm": 0.8357332711782991, + "learning_rate": 0.003, + "loss": 4.055, + "step": 18446 + }, + { + "epoch": 0.18447, + "grad_norm": 1.061025022993599, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 18447 + }, + { + "epoch": 0.18448, + "grad_norm": 1.0284687748521408, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 18448 + }, + { + "epoch": 0.18449, + "grad_norm": 0.9394082854203095, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18449 + }, + { + "epoch": 0.1845, + "grad_norm": 0.8781725199666855, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 18450 + }, + { + "epoch": 0.18451, + "grad_norm": 0.8807996401925396, + "learning_rate": 0.003, + "loss": 4.064, + "step": 18451 + }, + { + "epoch": 0.18452, + "grad_norm": 0.9606994874096902, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 18452 + }, + { + "epoch": 0.18453, + "grad_norm": 0.9969082774520639, + "learning_rate": 0.003, + "loss": 4.103, + "step": 18453 + }, + { + "epoch": 0.18454, + "grad_norm": 1.1239948587452864, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18454 + }, + { + "epoch": 0.18455, + "grad_norm": 0.9447925944530146, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 18455 + }, + { + "epoch": 0.18456, + "grad_norm": 0.8569234008384664, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 18456 + }, + { + "epoch": 0.18457, + "grad_norm": 0.8476602670449437, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 18457 + }, + { + "epoch": 0.18458, + "grad_norm": 0.9647356539077508, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 18458 + }, + { + "epoch": 0.18459, + "grad_norm": 1.1341728410551524, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 18459 + }, + { + "epoch": 0.1846, + "grad_norm": 0.9436184052763803, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 18460 + }, + { + "epoch": 0.18461, + "grad_norm": 1.0886259079988436, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 18461 + }, + { + "epoch": 0.18462, + "grad_norm": 1.1137114040374576, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 18462 + }, + { + "epoch": 0.18463, + "grad_norm": 0.888863614623211, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 18463 + }, + { + "epoch": 0.18464, + "grad_norm": 0.7955306254438335, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18464 + }, + { + "epoch": 0.18465, + "grad_norm": 0.7948147222157742, + "learning_rate": 0.003, + "loss": 4.078, + "step": 18465 + }, + { + "epoch": 0.18466, + "grad_norm": 1.0093851830990055, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 18466 + }, + { + "epoch": 0.18467, + "grad_norm": 1.2687326355556912, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 18467 + }, + { + "epoch": 0.18468, + "grad_norm": 0.7723133455360601, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 18468 + }, + { + "epoch": 0.18469, + "grad_norm": 0.6975098229030329, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 18469 + }, + { + "epoch": 0.1847, + "grad_norm": 0.7467157201584129, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 18470 + }, + { + "epoch": 0.18471, + "grad_norm": 0.707265521656276, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 18471 + }, + { + "epoch": 0.18472, + "grad_norm": 0.7223176688893698, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 18472 + }, + { + "epoch": 0.18473, + "grad_norm": 0.7798563034331775, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 18473 + }, + { + "epoch": 0.18474, + "grad_norm": 0.8053796522646952, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 18474 + }, + { + "epoch": 0.18475, + "grad_norm": 0.8282204826481101, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18475 + }, + { + "epoch": 0.18476, + "grad_norm": 0.8738331447423594, + "learning_rate": 0.003, + "loss": 4.066, + "step": 18476 + }, + { + "epoch": 0.18477, + "grad_norm": 0.8801572279090764, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 18477 + }, + { + "epoch": 0.18478, + "grad_norm": 1.0007657078182481, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 18478 + }, + { + "epoch": 0.18479, + "grad_norm": 1.1297510234974877, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 18479 + }, + { + "epoch": 0.1848, + "grad_norm": 0.93616625647756, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 18480 + }, + { + "epoch": 0.18481, + "grad_norm": 1.1069482850270802, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 18481 + }, + { + "epoch": 0.18482, + "grad_norm": 1.0328090219519728, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 18482 + }, + { + "epoch": 0.18483, + "grad_norm": 0.8304889646346205, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 18483 + }, + { + "epoch": 0.18484, + "grad_norm": 0.8306578210325865, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 18484 + }, + { + "epoch": 0.18485, + "grad_norm": 0.9084971459826292, + "learning_rate": 0.003, + "loss": 4.067, + "step": 18485 + }, + { + "epoch": 0.18486, + "grad_norm": 0.8508793939670165, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 18486 + }, + { + "epoch": 0.18487, + "grad_norm": 0.8456528685492818, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 18487 + }, + { + "epoch": 0.18488, + "grad_norm": 0.9793854587943929, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 18488 + }, + { + "epoch": 0.18489, + "grad_norm": 1.107666868486215, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 18489 + }, + { + "epoch": 0.1849, + "grad_norm": 0.9225095750991604, + "learning_rate": 0.003, + "loss": 4.088, + "step": 18490 + }, + { + "epoch": 0.18491, + "grad_norm": 0.8448100396851996, + "learning_rate": 0.003, + "loss": 4.066, + "step": 18491 + }, + { + "epoch": 0.18492, + "grad_norm": 0.8756577943950237, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 18492 + }, + { + "epoch": 0.18493, + "grad_norm": 0.921153383274438, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 18493 + }, + { + "epoch": 0.18494, + "grad_norm": 0.840848008704909, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 18494 + }, + { + "epoch": 0.18495, + "grad_norm": 0.7144620754955204, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18495 + }, + { + "epoch": 0.18496, + "grad_norm": 0.7249052376451209, + "learning_rate": 0.003, + "loss": 4.018, + "step": 18496 + }, + { + "epoch": 0.18497, + "grad_norm": 0.7847380117299806, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 18497 + }, + { + "epoch": 0.18498, + "grad_norm": 0.9196205811049072, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 18498 + }, + { + "epoch": 0.18499, + "grad_norm": 0.9978465713301852, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 18499 + }, + { + "epoch": 0.185, + "grad_norm": 1.2254032374543695, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 18500 + }, + { + "epoch": 0.18501, + "grad_norm": 0.8662246686936604, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 18501 + }, + { + "epoch": 0.18502, + "grad_norm": 0.7970732484039922, + "learning_rate": 0.003, + "loss": 4.052, + "step": 18502 + }, + { + "epoch": 0.18503, + "grad_norm": 0.8844422038201806, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 18503 + }, + { + "epoch": 0.18504, + "grad_norm": 1.1654490981629433, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 18504 + }, + { + "epoch": 0.18505, + "grad_norm": 0.9050345806266741, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 18505 + }, + { + "epoch": 0.18506, + "grad_norm": 0.779394214768107, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 18506 + }, + { + "epoch": 0.18507, + "grad_norm": 0.862050962883863, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 18507 + }, + { + "epoch": 0.18508, + "grad_norm": 0.8815184232639309, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18508 + }, + { + "epoch": 0.18509, + "grad_norm": 0.8056097550425132, + "learning_rate": 0.003, + "loss": 4.035, + "step": 18509 + }, + { + "epoch": 0.1851, + "grad_norm": 0.7791877804311869, + "learning_rate": 0.003, + "loss": 4.086, + "step": 18510 + }, + { + "epoch": 0.18511, + "grad_norm": 0.746159518453401, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 18511 + }, + { + "epoch": 0.18512, + "grad_norm": 0.7027773370904974, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18512 + }, + { + "epoch": 0.18513, + "grad_norm": 0.7721353011637136, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18513 + }, + { + "epoch": 0.18514, + "grad_norm": 0.8215571322591416, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 18514 + }, + { + "epoch": 0.18515, + "grad_norm": 0.9314042119284027, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 18515 + }, + { + "epoch": 0.18516, + "grad_norm": 1.0269123781707103, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 18516 + }, + { + "epoch": 0.18517, + "grad_norm": 1.0932754875226336, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 18517 + }, + { + "epoch": 0.18518, + "grad_norm": 0.9061094588281557, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 18518 + }, + { + "epoch": 0.18519, + "grad_norm": 0.9994184618025153, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 18519 + }, + { + "epoch": 0.1852, + "grad_norm": 0.9762699073612215, + "learning_rate": 0.003, + "loss": 4.05, + "step": 18520 + }, + { + "epoch": 0.18521, + "grad_norm": 0.9826633730146579, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18521 + }, + { + "epoch": 0.18522, + "grad_norm": 0.9493219997439435, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 18522 + }, + { + "epoch": 0.18523, + "grad_norm": 0.8338856238760505, + "learning_rate": 0.003, + "loss": 4.044, + "step": 18523 + }, + { + "epoch": 0.18524, + "grad_norm": 0.6604945655333001, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 18524 + }, + { + "epoch": 0.18525, + "grad_norm": 0.602588548112755, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 18525 + }, + { + "epoch": 0.18526, + "grad_norm": 0.6890798022179152, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 18526 + }, + { + "epoch": 0.18527, + "grad_norm": 0.7583416468363614, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 18527 + }, + { + "epoch": 0.18528, + "grad_norm": 0.8064278412947958, + "learning_rate": 0.003, + "loss": 4.077, + "step": 18528 + }, + { + "epoch": 0.18529, + "grad_norm": 0.9415906924771318, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 18529 + }, + { + "epoch": 0.1853, + "grad_norm": 1.0054048758862657, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 18530 + }, + { + "epoch": 0.18531, + "grad_norm": 1.0521052734046925, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 18531 + }, + { + "epoch": 0.18532, + "grad_norm": 0.9634328372197097, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 18532 + }, + { + "epoch": 0.18533, + "grad_norm": 0.9808569150687576, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 18533 + }, + { + "epoch": 0.18534, + "grad_norm": 1.048165713144891, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 18534 + }, + { + "epoch": 0.18535, + "grad_norm": 1.0573097465128605, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 18535 + }, + { + "epoch": 0.18536, + "grad_norm": 1.063150165834106, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 18536 + }, + { + "epoch": 0.18537, + "grad_norm": 0.9292842474533076, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18537 + }, + { + "epoch": 0.18538, + "grad_norm": 0.8128674410057475, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18538 + }, + { + "epoch": 0.18539, + "grad_norm": 0.8341469754200896, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 18539 + }, + { + "epoch": 0.1854, + "grad_norm": 0.8838137436985168, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 18540 + }, + { + "epoch": 0.18541, + "grad_norm": 0.8878788892629679, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18541 + }, + { + "epoch": 0.18542, + "grad_norm": 0.922069685170163, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 18542 + }, + { + "epoch": 0.18543, + "grad_norm": 0.865949105504782, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 18543 + }, + { + "epoch": 0.18544, + "grad_norm": 0.8314339821578145, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 18544 + }, + { + "epoch": 0.18545, + "grad_norm": 0.920204070855273, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 18545 + }, + { + "epoch": 0.18546, + "grad_norm": 1.0092756442902626, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 18546 + }, + { + "epoch": 0.18547, + "grad_norm": 1.1780330697843064, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 18547 + }, + { + "epoch": 0.18548, + "grad_norm": 0.9525007098326996, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 18548 + }, + { + "epoch": 0.18549, + "grad_norm": 0.8363546002359297, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 18549 + }, + { + "epoch": 0.1855, + "grad_norm": 0.8195504772440295, + "learning_rate": 0.003, + "loss": 4.07, + "step": 18550 + }, + { + "epoch": 0.18551, + "grad_norm": 0.7577289023619256, + "learning_rate": 0.003, + "loss": 4.059, + "step": 18551 + }, + { + "epoch": 0.18552, + "grad_norm": 0.8650761776571809, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 18552 + }, + { + "epoch": 0.18553, + "grad_norm": 0.8030292341735045, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 18553 + }, + { + "epoch": 0.18554, + "grad_norm": 0.8347437093497928, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18554 + }, + { + "epoch": 0.18555, + "grad_norm": 0.978299065362135, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 18555 + }, + { + "epoch": 0.18556, + "grad_norm": 1.1173164361270036, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 18556 + }, + { + "epoch": 0.18557, + "grad_norm": 0.8360979444883614, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 18557 + }, + { + "epoch": 0.18558, + "grad_norm": 0.7582031849995656, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 18558 + }, + { + "epoch": 0.18559, + "grad_norm": 0.7068028381600463, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 18559 + }, + { + "epoch": 0.1856, + "grad_norm": 0.7180100367141031, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 18560 + }, + { + "epoch": 0.18561, + "grad_norm": 0.6840309184352409, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 18561 + }, + { + "epoch": 0.18562, + "grad_norm": 0.7694258088987754, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18562 + }, + { + "epoch": 0.18563, + "grad_norm": 0.958104883986215, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 18563 + }, + { + "epoch": 0.18564, + "grad_norm": 1.4157366045585693, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 18564 + }, + { + "epoch": 0.18565, + "grad_norm": 0.5747070966442605, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 18565 + }, + { + "epoch": 0.18566, + "grad_norm": 0.6963956094707563, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 18566 + }, + { + "epoch": 0.18567, + "grad_norm": 0.7569404487630428, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 18567 + }, + { + "epoch": 0.18568, + "grad_norm": 0.7142714312803677, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 18568 + }, + { + "epoch": 0.18569, + "grad_norm": 0.7932754940972369, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18569 + }, + { + "epoch": 0.1857, + "grad_norm": 0.948177045362133, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 18570 + }, + { + "epoch": 0.18571, + "grad_norm": 1.1651598379649453, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 18571 + }, + { + "epoch": 0.18572, + "grad_norm": 0.8348491979308896, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 18572 + }, + { + "epoch": 0.18573, + "grad_norm": 0.7978817295357757, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 18573 + }, + { + "epoch": 0.18574, + "grad_norm": 0.861319557730631, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 18574 + }, + { + "epoch": 0.18575, + "grad_norm": 0.8777951145807659, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 18575 + }, + { + "epoch": 0.18576, + "grad_norm": 0.7557209765978126, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 18576 + }, + { + "epoch": 0.18577, + "grad_norm": 0.7176279207797613, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 18577 + }, + { + "epoch": 0.18578, + "grad_norm": 0.8450161085980437, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 18578 + }, + { + "epoch": 0.18579, + "grad_norm": 1.0433029608767581, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 18579 + }, + { + "epoch": 0.1858, + "grad_norm": 1.0937265536703222, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 18580 + }, + { + "epoch": 0.18581, + "grad_norm": 1.0318135985630523, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 18581 + }, + { + "epoch": 0.18582, + "grad_norm": 1.1871668144742313, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18582 + }, + { + "epoch": 0.18583, + "grad_norm": 0.9503706656273395, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 18583 + }, + { + "epoch": 0.18584, + "grad_norm": 0.8657144530670727, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 18584 + }, + { + "epoch": 0.18585, + "grad_norm": 0.804801902831209, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 18585 + }, + { + "epoch": 0.18586, + "grad_norm": 0.8820837687810786, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 18586 + }, + { + "epoch": 0.18587, + "grad_norm": 1.0015522090335942, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 18587 + }, + { + "epoch": 0.18588, + "grad_norm": 1.0145893090698617, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 18588 + }, + { + "epoch": 0.18589, + "grad_norm": 0.7714093426824449, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 18589 + }, + { + "epoch": 0.1859, + "grad_norm": 0.7239848046602954, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 18590 + }, + { + "epoch": 0.18591, + "grad_norm": 0.6753840091173711, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 18591 + }, + { + "epoch": 0.18592, + "grad_norm": 0.7673868892363467, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 18592 + }, + { + "epoch": 0.18593, + "grad_norm": 0.9701562480530804, + "learning_rate": 0.003, + "loss": 4.087, + "step": 18593 + }, + { + "epoch": 0.18594, + "grad_norm": 1.3944210066498526, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 18594 + }, + { + "epoch": 0.18595, + "grad_norm": 0.8126422871848954, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 18595 + }, + { + "epoch": 0.18596, + "grad_norm": 0.8016438186141875, + "learning_rate": 0.003, + "loss": 4.0971, + "step": 18596 + }, + { + "epoch": 0.18597, + "grad_norm": 0.8534178146910221, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 18597 + }, + { + "epoch": 0.18598, + "grad_norm": 0.8008697451174429, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 18598 + }, + { + "epoch": 0.18599, + "grad_norm": 0.748727464666017, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 18599 + }, + { + "epoch": 0.186, + "grad_norm": 0.7970963092703048, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 18600 + }, + { + "epoch": 0.18601, + "grad_norm": 0.8937831658798338, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 18601 + }, + { + "epoch": 0.18602, + "grad_norm": 1.0473329983382251, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 18602 + }, + { + "epoch": 0.18603, + "grad_norm": 1.064094771613099, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 18603 + }, + { + "epoch": 0.18604, + "grad_norm": 1.1863735272555227, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 18604 + }, + { + "epoch": 0.18605, + "grad_norm": 0.818349618627902, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18605 + }, + { + "epoch": 0.18606, + "grad_norm": 0.9333503419193867, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 18606 + }, + { + "epoch": 0.18607, + "grad_norm": 1.2111414666867866, + "learning_rate": 0.003, + "loss": 4.0965, + "step": 18607 + }, + { + "epoch": 0.18608, + "grad_norm": 0.8497029535733186, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18608 + }, + { + "epoch": 0.18609, + "grad_norm": 0.7137641796449244, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 18609 + }, + { + "epoch": 0.1861, + "grad_norm": 0.7932912505625324, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 18610 + }, + { + "epoch": 0.18611, + "grad_norm": 0.8232957366999208, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 18611 + }, + { + "epoch": 0.18612, + "grad_norm": 0.8075830264997964, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18612 + }, + { + "epoch": 0.18613, + "grad_norm": 0.8771825778106802, + "learning_rate": 0.003, + "loss": 4.098, + "step": 18613 + }, + { + "epoch": 0.18614, + "grad_norm": 0.9470182671477181, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 18614 + }, + { + "epoch": 0.18615, + "grad_norm": 0.9258175891779046, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 18615 + }, + { + "epoch": 0.18616, + "grad_norm": 0.9036624713666295, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18616 + }, + { + "epoch": 0.18617, + "grad_norm": 0.998954548989223, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18617 + }, + { + "epoch": 0.18618, + "grad_norm": 0.9928413055052442, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 18618 + }, + { + "epoch": 0.18619, + "grad_norm": 0.9373758273689293, + "learning_rate": 0.003, + "loss": 4.061, + "step": 18619 + }, + { + "epoch": 0.1862, + "grad_norm": 0.9505850268826704, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 18620 + }, + { + "epoch": 0.18621, + "grad_norm": 0.9184104039486911, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18621 + }, + { + "epoch": 0.18622, + "grad_norm": 0.7834062272402487, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 18622 + }, + { + "epoch": 0.18623, + "grad_norm": 0.8457309927041944, + "learning_rate": 0.003, + "loss": 4.057, + "step": 18623 + }, + { + "epoch": 0.18624, + "grad_norm": 0.773240324462519, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 18624 + }, + { + "epoch": 0.18625, + "grad_norm": 0.7758577710034719, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 18625 + }, + { + "epoch": 0.18626, + "grad_norm": 0.7474112197130057, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 18626 + }, + { + "epoch": 0.18627, + "grad_norm": 0.8009277134952273, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 18627 + }, + { + "epoch": 0.18628, + "grad_norm": 0.864150769355301, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 18628 + }, + { + "epoch": 0.18629, + "grad_norm": 0.9900300707558808, + "learning_rate": 0.003, + "loss": 4.054, + "step": 18629 + }, + { + "epoch": 0.1863, + "grad_norm": 1.1528805792552672, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 18630 + }, + { + "epoch": 0.18631, + "grad_norm": 0.9621551929838353, + "learning_rate": 0.003, + "loss": 4.091, + "step": 18631 + }, + { + "epoch": 0.18632, + "grad_norm": 1.0592645670992038, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 18632 + }, + { + "epoch": 0.18633, + "grad_norm": 1.0638430249249775, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 18633 + }, + { + "epoch": 0.18634, + "grad_norm": 0.8956163122694202, + "learning_rate": 0.003, + "loss": 4.069, + "step": 18634 + }, + { + "epoch": 0.18635, + "grad_norm": 0.9208634065672368, + "learning_rate": 0.003, + "loss": 4.0927, + "step": 18635 + }, + { + "epoch": 0.18636, + "grad_norm": 0.8374672234311633, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 18636 + }, + { + "epoch": 0.18637, + "grad_norm": 1.0151809888599386, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 18637 + }, + { + "epoch": 0.18638, + "grad_norm": 1.2321944528000137, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 18638 + }, + { + "epoch": 0.18639, + "grad_norm": 0.6209810581818979, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 18639 + }, + { + "epoch": 0.1864, + "grad_norm": 0.6714859577579821, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 18640 + }, + { + "epoch": 0.18641, + "grad_norm": 0.7585800682226972, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 18641 + }, + { + "epoch": 0.18642, + "grad_norm": 0.7736416728996286, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 18642 + }, + { + "epoch": 0.18643, + "grad_norm": 0.6845587611141088, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 18643 + }, + { + "epoch": 0.18644, + "grad_norm": 0.6449457042942692, + "learning_rate": 0.003, + "loss": 4.04, + "step": 18644 + }, + { + "epoch": 0.18645, + "grad_norm": 0.6781658589648845, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 18645 + }, + { + "epoch": 0.18646, + "grad_norm": 0.7335346540476505, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 18646 + }, + { + "epoch": 0.18647, + "grad_norm": 0.9043414802065455, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 18647 + }, + { + "epoch": 0.18648, + "grad_norm": 1.115024472927529, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 18648 + }, + { + "epoch": 0.18649, + "grad_norm": 0.9442446230800574, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 18649 + }, + { + "epoch": 0.1865, + "grad_norm": 0.8640464791821411, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 18650 + }, + { + "epoch": 0.18651, + "grad_norm": 0.852620260969083, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 18651 + }, + { + "epoch": 0.18652, + "grad_norm": 0.8938138180833967, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 18652 + }, + { + "epoch": 0.18653, + "grad_norm": 0.9129665444894208, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 18653 + }, + { + "epoch": 0.18654, + "grad_norm": 0.9985517751187377, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 18654 + }, + { + "epoch": 0.18655, + "grad_norm": 1.114880208481357, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18655 + }, + { + "epoch": 0.18656, + "grad_norm": 0.8585340599610687, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 18656 + }, + { + "epoch": 0.18657, + "grad_norm": 0.7283994937851266, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 18657 + }, + { + "epoch": 0.18658, + "grad_norm": 0.7626271075147254, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 18658 + }, + { + "epoch": 0.18659, + "grad_norm": 0.9814812835118157, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 18659 + }, + { + "epoch": 0.1866, + "grad_norm": 1.0757033001178842, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 18660 + }, + { + "epoch": 0.18661, + "grad_norm": 0.7884415209663592, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 18661 + }, + { + "epoch": 0.18662, + "grad_norm": 0.7979802653243565, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 18662 + }, + { + "epoch": 0.18663, + "grad_norm": 0.8907768455362602, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 18663 + }, + { + "epoch": 0.18664, + "grad_norm": 1.3466317763565474, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 18664 + }, + { + "epoch": 0.18665, + "grad_norm": 1.0636091644752153, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18665 + }, + { + "epoch": 0.18666, + "grad_norm": 0.8427447620290612, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 18666 + }, + { + "epoch": 0.18667, + "grad_norm": 0.8992002508141136, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 18667 + }, + { + "epoch": 0.18668, + "grad_norm": 1.1605032609000148, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 18668 + }, + { + "epoch": 0.18669, + "grad_norm": 1.0041522367290034, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 18669 + }, + { + "epoch": 0.1867, + "grad_norm": 1.0570474655128794, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 18670 + }, + { + "epoch": 0.18671, + "grad_norm": 0.8562110161440691, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 18671 + }, + { + "epoch": 0.18672, + "grad_norm": 0.7916245608667062, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 18672 + }, + { + "epoch": 0.18673, + "grad_norm": 0.8174257597305178, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 18673 + }, + { + "epoch": 0.18674, + "grad_norm": 0.9890349710269822, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 18674 + }, + { + "epoch": 0.18675, + "grad_norm": 1.1489879870539976, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18675 + }, + { + "epoch": 0.18676, + "grad_norm": 1.0579998447216608, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 18676 + }, + { + "epoch": 0.18677, + "grad_norm": 0.9422890963603554, + "learning_rate": 0.003, + "loss": 4.067, + "step": 18677 + }, + { + "epoch": 0.18678, + "grad_norm": 0.9266295073890203, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 18678 + }, + { + "epoch": 0.18679, + "grad_norm": 1.0229221223859826, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 18679 + }, + { + "epoch": 0.1868, + "grad_norm": 0.9790192128522515, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 18680 + }, + { + "epoch": 0.18681, + "grad_norm": 0.8227062142935673, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 18681 + }, + { + "epoch": 0.18682, + "grad_norm": 0.7613958240398305, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 18682 + }, + { + "epoch": 0.18683, + "grad_norm": 0.7971309870586542, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 18683 + }, + { + "epoch": 0.18684, + "grad_norm": 0.8694020388828363, + "learning_rate": 0.003, + "loss": 4.08, + "step": 18684 + }, + { + "epoch": 0.18685, + "grad_norm": 0.7547661441038792, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 18685 + }, + { + "epoch": 0.18686, + "grad_norm": 0.7586909918580507, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 18686 + }, + { + "epoch": 0.18687, + "grad_norm": 0.9244202442626519, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 18687 + }, + { + "epoch": 0.18688, + "grad_norm": 0.9714856248368594, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18688 + }, + { + "epoch": 0.18689, + "grad_norm": 1.0033020555264711, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 18689 + }, + { + "epoch": 0.1869, + "grad_norm": 1.2299609613464744, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 18690 + }, + { + "epoch": 0.18691, + "grad_norm": 0.8317275090324072, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 18691 + }, + { + "epoch": 0.18692, + "grad_norm": 0.6599085621781208, + "learning_rate": 0.003, + "loss": 4.039, + "step": 18692 + }, + { + "epoch": 0.18693, + "grad_norm": 0.6042969686271215, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 18693 + }, + { + "epoch": 0.18694, + "grad_norm": 0.6085383113880665, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 18694 + }, + { + "epoch": 0.18695, + "grad_norm": 0.5585852976317263, + "learning_rate": 0.003, + "loss": 4.037, + "step": 18695 + }, + { + "epoch": 0.18696, + "grad_norm": 0.5403496937503798, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 18696 + }, + { + "epoch": 0.18697, + "grad_norm": 0.5333917672859838, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 18697 + }, + { + "epoch": 0.18698, + "grad_norm": 0.636439261340254, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 18698 + }, + { + "epoch": 0.18699, + "grad_norm": 0.8436122663873049, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 18699 + }, + { + "epoch": 0.187, + "grad_norm": 1.1818583809108116, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 18700 + }, + { + "epoch": 0.18701, + "grad_norm": 1.0260629147667248, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 18701 + }, + { + "epoch": 0.18702, + "grad_norm": 0.8223954752661594, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 18702 + }, + { + "epoch": 0.18703, + "grad_norm": 0.8137398214259036, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 18703 + }, + { + "epoch": 0.18704, + "grad_norm": 0.8143537813191083, + "learning_rate": 0.003, + "loss": 4.045, + "step": 18704 + }, + { + "epoch": 0.18705, + "grad_norm": 0.8892196928174321, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 18705 + }, + { + "epoch": 0.18706, + "grad_norm": 1.028766516821005, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 18706 + }, + { + "epoch": 0.18707, + "grad_norm": 0.9938237658131278, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 18707 + }, + { + "epoch": 0.18708, + "grad_norm": 1.1059708735089646, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 18708 + }, + { + "epoch": 0.18709, + "grad_norm": 1.0039294957107938, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18709 + }, + { + "epoch": 0.1871, + "grad_norm": 0.9944472547123511, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 18710 + }, + { + "epoch": 0.18711, + "grad_norm": 0.9493386160201528, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18711 + }, + { + "epoch": 0.18712, + "grad_norm": 0.8878470824962599, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 18712 + }, + { + "epoch": 0.18713, + "grad_norm": 0.8886403319100207, + "learning_rate": 0.003, + "loss": 4.075, + "step": 18713 + }, + { + "epoch": 0.18714, + "grad_norm": 1.0385483170221521, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 18714 + }, + { + "epoch": 0.18715, + "grad_norm": 1.079455300542097, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 18715 + }, + { + "epoch": 0.18716, + "grad_norm": 0.9226942336676118, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 18716 + }, + { + "epoch": 0.18717, + "grad_norm": 0.9031168481896467, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 18717 + }, + { + "epoch": 0.18718, + "grad_norm": 0.9145644943509552, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 18718 + }, + { + "epoch": 0.18719, + "grad_norm": 0.8901906264837669, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 18719 + }, + { + "epoch": 0.1872, + "grad_norm": 0.8823783083940833, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 18720 + }, + { + "epoch": 0.18721, + "grad_norm": 0.8741006416996544, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 18721 + }, + { + "epoch": 0.18722, + "grad_norm": 0.8771843269620443, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 18722 + }, + { + "epoch": 0.18723, + "grad_norm": 1.0859961396872917, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 18723 + }, + { + "epoch": 0.18724, + "grad_norm": 1.234783587381509, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 18724 + }, + { + "epoch": 0.18725, + "grad_norm": 0.8656584054193945, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 18725 + }, + { + "epoch": 0.18726, + "grad_norm": 1.0176295817640155, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 18726 + }, + { + "epoch": 0.18727, + "grad_norm": 1.0952543702855067, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 18727 + }, + { + "epoch": 0.18728, + "grad_norm": 0.9818200441142511, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 18728 + }, + { + "epoch": 0.18729, + "grad_norm": 0.8647280212517727, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 18729 + }, + { + "epoch": 0.1873, + "grad_norm": 0.7604727821073138, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 18730 + }, + { + "epoch": 0.18731, + "grad_norm": 0.8927783198152612, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 18731 + }, + { + "epoch": 0.18732, + "grad_norm": 1.1928710761756436, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 18732 + }, + { + "epoch": 0.18733, + "grad_norm": 1.0339184880306165, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 18733 + }, + { + "epoch": 0.18734, + "grad_norm": 0.9954957072543477, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 18734 + }, + { + "epoch": 0.18735, + "grad_norm": 1.0415461169977747, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 18735 + }, + { + "epoch": 0.18736, + "grad_norm": 1.0809808982084235, + "learning_rate": 0.003, + "loss": 4.085, + "step": 18736 + }, + { + "epoch": 0.18737, + "grad_norm": 0.9550940296303166, + "learning_rate": 0.003, + "loss": 4.095, + "step": 18737 + }, + { + "epoch": 0.18738, + "grad_norm": 0.8853494548839914, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18738 + }, + { + "epoch": 0.18739, + "grad_norm": 0.8868575040078728, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 18739 + }, + { + "epoch": 0.1874, + "grad_norm": 0.8687924155211184, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 18740 + }, + { + "epoch": 0.18741, + "grad_norm": 1.0061975967728851, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 18741 + }, + { + "epoch": 0.18742, + "grad_norm": 1.0010206793135974, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 18742 + }, + { + "epoch": 0.18743, + "grad_norm": 0.9878387860859184, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 18743 + }, + { + "epoch": 0.18744, + "grad_norm": 0.9827023048517439, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 18744 + }, + { + "epoch": 0.18745, + "grad_norm": 0.932708005871291, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 18745 + }, + { + "epoch": 0.18746, + "grad_norm": 0.8068614961020573, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 18746 + }, + { + "epoch": 0.18747, + "grad_norm": 0.7385578015805192, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 18747 + }, + { + "epoch": 0.18748, + "grad_norm": 0.5711167269593707, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 18748 + }, + { + "epoch": 0.18749, + "grad_norm": 0.5294477523722143, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18749 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5140437975886389, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 18750 + }, + { + "epoch": 0.18751, + "grad_norm": 0.530468961475268, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 18751 + }, + { + "epoch": 0.18752, + "grad_norm": 0.5645708108749334, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18752 + }, + { + "epoch": 0.18753, + "grad_norm": 0.675429752564956, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 18753 + }, + { + "epoch": 0.18754, + "grad_norm": 0.6639880753127896, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 18754 + }, + { + "epoch": 0.18755, + "grad_norm": 0.5432681051998008, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 18755 + }, + { + "epoch": 0.18756, + "grad_norm": 0.5940936271651334, + "learning_rate": 0.003, + "loss": 4.06, + "step": 18756 + }, + { + "epoch": 0.18757, + "grad_norm": 0.7237507616753347, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 18757 + }, + { + "epoch": 0.18758, + "grad_norm": 0.7555555056002169, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 18758 + }, + { + "epoch": 0.18759, + "grad_norm": 0.891224620725597, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 18759 + }, + { + "epoch": 0.1876, + "grad_norm": 1.3203251452776854, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 18760 + }, + { + "epoch": 0.18761, + "grad_norm": 1.0028429772376184, + "learning_rate": 0.003, + "loss": 4.053, + "step": 18761 + }, + { + "epoch": 0.18762, + "grad_norm": 1.0536848032883817, + "learning_rate": 0.003, + "loss": 4.083, + "step": 18762 + }, + { + "epoch": 0.18763, + "grad_norm": 0.8930248954359878, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 18763 + }, + { + "epoch": 0.18764, + "grad_norm": 0.9711429735798429, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 18764 + }, + { + "epoch": 0.18765, + "grad_norm": 0.9549252602224277, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 18765 + }, + { + "epoch": 0.18766, + "grad_norm": 0.8825744979988466, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18766 + }, + { + "epoch": 0.18767, + "grad_norm": 0.866613538140772, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 18767 + }, + { + "epoch": 0.18768, + "grad_norm": 0.9927590475619487, + "learning_rate": 0.003, + "loss": 4.061, + "step": 18768 + }, + { + "epoch": 0.18769, + "grad_norm": 1.0088609322533033, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18769 + }, + { + "epoch": 0.1877, + "grad_norm": 1.106423781881455, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 18770 + }, + { + "epoch": 0.18771, + "grad_norm": 1.034431665338578, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 18771 + }, + { + "epoch": 0.18772, + "grad_norm": 1.0627262986881796, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 18772 + }, + { + "epoch": 0.18773, + "grad_norm": 0.887195522054885, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18773 + }, + { + "epoch": 0.18774, + "grad_norm": 0.858918141371021, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 18774 + }, + { + "epoch": 0.18775, + "grad_norm": 0.8239609561464694, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 18775 + }, + { + "epoch": 0.18776, + "grad_norm": 0.765249461107146, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 18776 + }, + { + "epoch": 0.18777, + "grad_norm": 0.7284087557981171, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 18777 + }, + { + "epoch": 0.18778, + "grad_norm": 0.7478154263010988, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 18778 + }, + { + "epoch": 0.18779, + "grad_norm": 0.8587254952613894, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 18779 + }, + { + "epoch": 0.1878, + "grad_norm": 1.060230551436364, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 18780 + }, + { + "epoch": 0.18781, + "grad_norm": 0.9978506452509432, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 18781 + }, + { + "epoch": 0.18782, + "grad_norm": 0.9835383501396353, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 18782 + }, + { + "epoch": 0.18783, + "grad_norm": 1.1044040436391112, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 18783 + }, + { + "epoch": 0.18784, + "grad_norm": 0.9758066096693506, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18784 + }, + { + "epoch": 0.18785, + "grad_norm": 0.9497832993559777, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 18785 + }, + { + "epoch": 0.18786, + "grad_norm": 0.9153335118379946, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 18786 + }, + { + "epoch": 0.18787, + "grad_norm": 0.9234378938531879, + "learning_rate": 0.003, + "loss": 4.059, + "step": 18787 + }, + { + "epoch": 0.18788, + "grad_norm": 0.9706677008830245, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 18788 + }, + { + "epoch": 0.18789, + "grad_norm": 1.0192850757926157, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 18789 + }, + { + "epoch": 0.1879, + "grad_norm": 0.9550680647462348, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 18790 + }, + { + "epoch": 0.18791, + "grad_norm": 0.9308141125746637, + "learning_rate": 0.003, + "loss": 4.041, + "step": 18791 + }, + { + "epoch": 0.18792, + "grad_norm": 0.9191175824382493, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18792 + }, + { + "epoch": 0.18793, + "grad_norm": 0.9020403126675917, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 18793 + }, + { + "epoch": 0.18794, + "grad_norm": 0.7026790245672965, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 18794 + }, + { + "epoch": 0.18795, + "grad_norm": 0.7463574707950998, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18795 + }, + { + "epoch": 0.18796, + "grad_norm": 0.775705222498586, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 18796 + }, + { + "epoch": 0.18797, + "grad_norm": 0.860465009667747, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 18797 + }, + { + "epoch": 0.18798, + "grad_norm": 0.8729114215281945, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 18798 + }, + { + "epoch": 0.18799, + "grad_norm": 0.8493526673144045, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 18799 + }, + { + "epoch": 0.188, + "grad_norm": 0.8414683201108558, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 18800 + }, + { + "epoch": 0.18801, + "grad_norm": 0.7109917521508585, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 18801 + }, + { + "epoch": 0.18802, + "grad_norm": 0.7464302518820626, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18802 + }, + { + "epoch": 0.18803, + "grad_norm": 0.9287888400318897, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 18803 + }, + { + "epoch": 0.18804, + "grad_norm": 0.9558074591311403, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 18804 + }, + { + "epoch": 0.18805, + "grad_norm": 1.131489662866921, + "learning_rate": 0.003, + "loss": 4.0938, + "step": 18805 + }, + { + "epoch": 0.18806, + "grad_norm": 1.0499753946196504, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 18806 + }, + { + "epoch": 0.18807, + "grad_norm": 0.9022586354003591, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 18807 + }, + { + "epoch": 0.18808, + "grad_norm": 0.9819719640203605, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 18808 + }, + { + "epoch": 0.18809, + "grad_norm": 0.8317694849570995, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 18809 + }, + { + "epoch": 0.1881, + "grad_norm": 0.8069785912305021, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 18810 + }, + { + "epoch": 0.18811, + "grad_norm": 0.9416952608965097, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 18811 + }, + { + "epoch": 0.18812, + "grad_norm": 1.0691887778931077, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 18812 + }, + { + "epoch": 0.18813, + "grad_norm": 0.9226958543712647, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 18813 + }, + { + "epoch": 0.18814, + "grad_norm": 0.9931817208848512, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 18814 + }, + { + "epoch": 0.18815, + "grad_norm": 1.2125783717525978, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 18815 + }, + { + "epoch": 0.18816, + "grad_norm": 0.8120664996097132, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 18816 + }, + { + "epoch": 0.18817, + "grad_norm": 0.6353000270082406, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 18817 + }, + { + "epoch": 0.18818, + "grad_norm": 0.6002060133252773, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 18818 + }, + { + "epoch": 0.18819, + "grad_norm": 0.6266331901580718, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 18819 + }, + { + "epoch": 0.1882, + "grad_norm": 0.5770467050684869, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 18820 + }, + { + "epoch": 0.18821, + "grad_norm": 0.5613509305690385, + "learning_rate": 0.003, + "loss": 4.072, + "step": 18821 + }, + { + "epoch": 0.18822, + "grad_norm": 0.5150651484254392, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 18822 + }, + { + "epoch": 0.18823, + "grad_norm": 0.5568903900998636, + "learning_rate": 0.003, + "loss": 4.047, + "step": 18823 + }, + { + "epoch": 0.18824, + "grad_norm": 0.7060667885894555, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 18824 + }, + { + "epoch": 0.18825, + "grad_norm": 0.9063872184032593, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 18825 + }, + { + "epoch": 0.18826, + "grad_norm": 1.2178790678775449, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 18826 + }, + { + "epoch": 0.18827, + "grad_norm": 0.7594558247508345, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 18827 + }, + { + "epoch": 0.18828, + "grad_norm": 0.6593887365400035, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 18828 + }, + { + "epoch": 0.18829, + "grad_norm": 0.6966940622137495, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 18829 + }, + { + "epoch": 0.1883, + "grad_norm": 0.6614036416998257, + "learning_rate": 0.003, + "loss": 4.054, + "step": 18830 + }, + { + "epoch": 0.18831, + "grad_norm": 0.715301254672398, + "learning_rate": 0.003, + "loss": 4.028, + "step": 18831 + }, + { + "epoch": 0.18832, + "grad_norm": 0.781847676703661, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 18832 + }, + { + "epoch": 0.18833, + "grad_norm": 0.7080776058645224, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 18833 + }, + { + "epoch": 0.18834, + "grad_norm": 0.7356251735610517, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 18834 + }, + { + "epoch": 0.18835, + "grad_norm": 0.7430063733504297, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 18835 + }, + { + "epoch": 0.18836, + "grad_norm": 0.8891537651152955, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 18836 + }, + { + "epoch": 0.18837, + "grad_norm": 1.0556734859556318, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 18837 + }, + { + "epoch": 0.18838, + "grad_norm": 1.1679233135151377, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 18838 + }, + { + "epoch": 0.18839, + "grad_norm": 0.8264179660628502, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 18839 + }, + { + "epoch": 0.1884, + "grad_norm": 0.7098483265548313, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18840 + }, + { + "epoch": 0.18841, + "grad_norm": 0.8054116842630247, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 18841 + }, + { + "epoch": 0.18842, + "grad_norm": 0.8509981670241051, + "learning_rate": 0.003, + "loss": 4.063, + "step": 18842 + }, + { + "epoch": 0.18843, + "grad_norm": 0.9969852262785717, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 18843 + }, + { + "epoch": 0.18844, + "grad_norm": 1.1413452406387146, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 18844 + }, + { + "epoch": 0.18845, + "grad_norm": 0.872671479987583, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18845 + }, + { + "epoch": 0.18846, + "grad_norm": 0.9264950201075592, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 18846 + }, + { + "epoch": 0.18847, + "grad_norm": 1.4893856769578935, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 18847 + }, + { + "epoch": 0.18848, + "grad_norm": 0.8722870420523543, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 18848 + }, + { + "epoch": 0.18849, + "grad_norm": 0.8718515176998175, + "learning_rate": 0.003, + "loss": 4.071, + "step": 18849 + }, + { + "epoch": 0.1885, + "grad_norm": 1.239750137077456, + "learning_rate": 0.003, + "loss": 4.078, + "step": 18850 + }, + { + "epoch": 0.18851, + "grad_norm": 1.085094743759517, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 18851 + }, + { + "epoch": 0.18852, + "grad_norm": 0.9221287522489386, + "learning_rate": 0.003, + "loss": 4.1007, + "step": 18852 + }, + { + "epoch": 0.18853, + "grad_norm": 0.9790516636979539, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 18853 + }, + { + "epoch": 0.18854, + "grad_norm": 1.0851939061736167, + "learning_rate": 0.003, + "loss": 4.093, + "step": 18854 + }, + { + "epoch": 0.18855, + "grad_norm": 0.8143372006760771, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 18855 + }, + { + "epoch": 0.18856, + "grad_norm": 0.8526868039812159, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 18856 + }, + { + "epoch": 0.18857, + "grad_norm": 0.934240988277304, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 18857 + }, + { + "epoch": 0.18858, + "grad_norm": 1.059774404285563, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18858 + }, + { + "epoch": 0.18859, + "grad_norm": 1.3744112044009085, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 18859 + }, + { + "epoch": 0.1886, + "grad_norm": 0.838191945956162, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 18860 + }, + { + "epoch": 0.18861, + "grad_norm": 0.9567723189309212, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 18861 + }, + { + "epoch": 0.18862, + "grad_norm": 0.9522209163282251, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 18862 + }, + { + "epoch": 0.18863, + "grad_norm": 0.9701645634719633, + "learning_rate": 0.003, + "loss": 4.048, + "step": 18863 + }, + { + "epoch": 0.18864, + "grad_norm": 0.8867468108089815, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 18864 + }, + { + "epoch": 0.18865, + "grad_norm": 0.9147378691920581, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 18865 + }, + { + "epoch": 0.18866, + "grad_norm": 0.9408687642786978, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 18866 + }, + { + "epoch": 0.18867, + "grad_norm": 0.8335230506646315, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 18867 + }, + { + "epoch": 0.18868, + "grad_norm": 0.8060490775288223, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 18868 + }, + { + "epoch": 0.18869, + "grad_norm": 0.970622923617127, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 18869 + }, + { + "epoch": 0.1887, + "grad_norm": 1.1854363082597448, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 18870 + }, + { + "epoch": 0.18871, + "grad_norm": 0.8204597507089216, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 18871 + }, + { + "epoch": 0.18872, + "grad_norm": 0.7326807644564894, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 18872 + }, + { + "epoch": 0.18873, + "grad_norm": 0.7012234089239755, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 18873 + }, + { + "epoch": 0.18874, + "grad_norm": 0.6900292394829466, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 18874 + }, + { + "epoch": 0.18875, + "grad_norm": 0.6939154791718721, + "learning_rate": 0.003, + "loss": 4.0963, + "step": 18875 + }, + { + "epoch": 0.18876, + "grad_norm": 0.7517473133778547, + "learning_rate": 0.003, + "loss": 4.056, + "step": 18876 + }, + { + "epoch": 0.18877, + "grad_norm": 0.8191317178861717, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 18877 + }, + { + "epoch": 0.18878, + "grad_norm": 0.8538127620028799, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 18878 + }, + { + "epoch": 0.18879, + "grad_norm": 0.932832724439279, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 18879 + }, + { + "epoch": 0.1888, + "grad_norm": 1.092720141013755, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 18880 + }, + { + "epoch": 0.18881, + "grad_norm": 1.242970126766121, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 18881 + }, + { + "epoch": 0.18882, + "grad_norm": 0.6911935609097055, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 18882 + }, + { + "epoch": 0.18883, + "grad_norm": 0.5855532035751623, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 18883 + }, + { + "epoch": 0.18884, + "grad_norm": 0.5788839021366634, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 18884 + }, + { + "epoch": 0.18885, + "grad_norm": 0.6723976432780131, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 18885 + }, + { + "epoch": 0.18886, + "grad_norm": 0.9613490012287368, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 18886 + }, + { + "epoch": 0.18887, + "grad_norm": 1.2156570563802014, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 18887 + }, + { + "epoch": 0.18888, + "grad_norm": 0.7026564730680053, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 18888 + }, + { + "epoch": 0.18889, + "grad_norm": 0.637218552771614, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 18889 + }, + { + "epoch": 0.1889, + "grad_norm": 0.7848841726408939, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 18890 + }, + { + "epoch": 0.18891, + "grad_norm": 0.8174618056874458, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 18891 + }, + { + "epoch": 0.18892, + "grad_norm": 0.82324037796606, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 18892 + }, + { + "epoch": 0.18893, + "grad_norm": 0.8989579567688616, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 18893 + }, + { + "epoch": 0.18894, + "grad_norm": 0.8170069921172262, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 18894 + }, + { + "epoch": 0.18895, + "grad_norm": 0.8709899818471913, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 18895 + }, + { + "epoch": 0.18896, + "grad_norm": 0.9816936074435, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 18896 + }, + { + "epoch": 0.18897, + "grad_norm": 0.9385300050005491, + "learning_rate": 0.003, + "loss": 4.043, + "step": 18897 + }, + { + "epoch": 0.18898, + "grad_norm": 1.0164258971081395, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 18898 + }, + { + "epoch": 0.18899, + "grad_norm": 1.2353664741512713, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 18899 + }, + { + "epoch": 0.189, + "grad_norm": 0.8115214863222378, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 18900 + }, + { + "epoch": 0.18901, + "grad_norm": 0.8702930207318287, + "learning_rate": 0.003, + "loss": 4.06, + "step": 18901 + }, + { + "epoch": 0.18902, + "grad_norm": 0.9074106996674826, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 18902 + }, + { + "epoch": 0.18903, + "grad_norm": 0.8769430132728124, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 18903 + }, + { + "epoch": 0.18904, + "grad_norm": 0.8357201714124332, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 18904 + }, + { + "epoch": 0.18905, + "grad_norm": 0.8568778260748834, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 18905 + }, + { + "epoch": 0.18906, + "grad_norm": 0.8773830592359707, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 18906 + }, + { + "epoch": 0.18907, + "grad_norm": 1.1018615697908245, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 18907 + }, + { + "epoch": 0.18908, + "grad_norm": 1.0768350176206178, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 18908 + }, + { + "epoch": 0.18909, + "grad_norm": 1.02958642003902, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 18909 + }, + { + "epoch": 0.1891, + "grad_norm": 1.109531743082689, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 18910 + }, + { + "epoch": 0.18911, + "grad_norm": 1.0706380059324208, + "learning_rate": 0.003, + "loss": 4.1006, + "step": 18911 + }, + { + "epoch": 0.18912, + "grad_norm": 0.9401072080081331, + "learning_rate": 0.003, + "loss": 4.081, + "step": 18912 + }, + { + "epoch": 0.18913, + "grad_norm": 0.9471800113328755, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 18913 + }, + { + "epoch": 0.18914, + "grad_norm": 1.03021998339916, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 18914 + }, + { + "epoch": 0.18915, + "grad_norm": 0.9926485053115832, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 18915 + }, + { + "epoch": 0.18916, + "grad_norm": 0.8649525785747464, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 18916 + }, + { + "epoch": 0.18917, + "grad_norm": 0.8716473557338101, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 18917 + }, + { + "epoch": 0.18918, + "grad_norm": 0.7974051544805679, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 18918 + }, + { + "epoch": 0.18919, + "grad_norm": 0.7486120150741057, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 18919 + }, + { + "epoch": 0.1892, + "grad_norm": 0.7471681148128011, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 18920 + }, + { + "epoch": 0.18921, + "grad_norm": 0.8481416100666542, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 18921 + }, + { + "epoch": 0.18922, + "grad_norm": 1.05609494877661, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 18922 + }, + { + "epoch": 0.18923, + "grad_norm": 1.0960752489006915, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 18923 + }, + { + "epoch": 0.18924, + "grad_norm": 0.7967767649772057, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 18924 + }, + { + "epoch": 0.18925, + "grad_norm": 0.7933314767515339, + "learning_rate": 0.003, + "loss": 4.047, + "step": 18925 + }, + { + "epoch": 0.18926, + "grad_norm": 0.8148930991589804, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 18926 + }, + { + "epoch": 0.18927, + "grad_norm": 1.0041858509230297, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 18927 + }, + { + "epoch": 0.18928, + "grad_norm": 1.250085685550684, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 18928 + }, + { + "epoch": 0.18929, + "grad_norm": 0.9377175281824044, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 18929 + }, + { + "epoch": 0.1893, + "grad_norm": 0.9908659703128931, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 18930 + }, + { + "epoch": 0.18931, + "grad_norm": 1.0835707397021723, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18931 + }, + { + "epoch": 0.18932, + "grad_norm": 0.8962878343047594, + "learning_rate": 0.003, + "loss": 4.1078, + "step": 18932 + }, + { + "epoch": 0.18933, + "grad_norm": 0.8173775100193572, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 18933 + }, + { + "epoch": 0.18934, + "grad_norm": 0.8729611047696794, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 18934 + }, + { + "epoch": 0.18935, + "grad_norm": 0.8113040778480568, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 18935 + }, + { + "epoch": 0.18936, + "grad_norm": 0.7988674454618783, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 18936 + }, + { + "epoch": 0.18937, + "grad_norm": 0.8254437408286549, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 18937 + }, + { + "epoch": 0.18938, + "grad_norm": 0.9520101741877295, + "learning_rate": 0.003, + "loss": 4.073, + "step": 18938 + }, + { + "epoch": 0.18939, + "grad_norm": 0.8857240409846306, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 18939 + }, + { + "epoch": 0.1894, + "grad_norm": 0.88749937816289, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 18940 + }, + { + "epoch": 0.18941, + "grad_norm": 1.1227583098496423, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 18941 + }, + { + "epoch": 0.18942, + "grad_norm": 0.9832114707116061, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 18942 + }, + { + "epoch": 0.18943, + "grad_norm": 0.9600468349870453, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 18943 + }, + { + "epoch": 0.18944, + "grad_norm": 0.8732775554293952, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 18944 + }, + { + "epoch": 0.18945, + "grad_norm": 0.7220890422982016, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 18945 + }, + { + "epoch": 0.18946, + "grad_norm": 0.6903304654162348, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 18946 + }, + { + "epoch": 0.18947, + "grad_norm": 0.6762756345509435, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 18947 + }, + { + "epoch": 0.18948, + "grad_norm": 0.6460604076303211, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 18948 + }, + { + "epoch": 0.18949, + "grad_norm": 0.6329574927541973, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 18949 + }, + { + "epoch": 0.1895, + "grad_norm": 0.6836089872978555, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 18950 + }, + { + "epoch": 0.18951, + "grad_norm": 0.7086644898134858, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 18951 + }, + { + "epoch": 0.18952, + "grad_norm": 0.7100978117855653, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 18952 + }, + { + "epoch": 0.18953, + "grad_norm": 0.7871203746614859, + "learning_rate": 0.003, + "loss": 4.052, + "step": 18953 + }, + { + "epoch": 0.18954, + "grad_norm": 0.8469184527638219, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 18954 + }, + { + "epoch": 0.18955, + "grad_norm": 1.0414248711497491, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 18955 + }, + { + "epoch": 0.18956, + "grad_norm": 1.3506103140542016, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 18956 + }, + { + "epoch": 0.18957, + "grad_norm": 0.6334204222274756, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 18957 + }, + { + "epoch": 0.18958, + "grad_norm": 0.6643756571034968, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 18958 + }, + { + "epoch": 0.18959, + "grad_norm": 0.7045097358355302, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 18959 + }, + { + "epoch": 0.1896, + "grad_norm": 0.7453331975404801, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 18960 + }, + { + "epoch": 0.18961, + "grad_norm": 0.8504204787573448, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 18961 + }, + { + "epoch": 0.18962, + "grad_norm": 1.013284928088702, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 18962 + }, + { + "epoch": 0.18963, + "grad_norm": 1.3091718330381101, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 18963 + }, + { + "epoch": 0.18964, + "grad_norm": 0.6574366733188126, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 18964 + }, + { + "epoch": 0.18965, + "grad_norm": 0.6169896902961652, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 18965 + }, + { + "epoch": 0.18966, + "grad_norm": 0.670031144439429, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 18966 + }, + { + "epoch": 0.18967, + "grad_norm": 0.6229251933542171, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 18967 + }, + { + "epoch": 0.18968, + "grad_norm": 0.6948496321264902, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 18968 + }, + { + "epoch": 0.18969, + "grad_norm": 0.8592221207104161, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 18969 + }, + { + "epoch": 0.1897, + "grad_norm": 0.9751045037276446, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 18970 + }, + { + "epoch": 0.18971, + "grad_norm": 1.086102405373793, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 18971 + }, + { + "epoch": 0.18972, + "grad_norm": 1.0940275559044008, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 18972 + }, + { + "epoch": 0.18973, + "grad_norm": 1.1265406620659937, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 18973 + }, + { + "epoch": 0.18974, + "grad_norm": 1.0123078102853043, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 18974 + }, + { + "epoch": 0.18975, + "grad_norm": 0.9758868356065785, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 18975 + }, + { + "epoch": 0.18976, + "grad_norm": 0.9080524279516271, + "learning_rate": 0.003, + "loss": 4.095, + "step": 18976 + }, + { + "epoch": 0.18977, + "grad_norm": 0.9830345863691641, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 18977 + }, + { + "epoch": 0.18978, + "grad_norm": 1.177799175192337, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 18978 + }, + { + "epoch": 0.18979, + "grad_norm": 1.0274277802455434, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 18979 + }, + { + "epoch": 0.1898, + "grad_norm": 1.02455739441151, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 18980 + }, + { + "epoch": 0.18981, + "grad_norm": 1.2845174388375342, + "learning_rate": 0.003, + "loss": 4.081, + "step": 18981 + }, + { + "epoch": 0.18982, + "grad_norm": 0.971770146010959, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 18982 + }, + { + "epoch": 0.18983, + "grad_norm": 0.9270480670885711, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 18983 + }, + { + "epoch": 0.18984, + "grad_norm": 0.9218668106555503, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 18984 + }, + { + "epoch": 0.18985, + "grad_norm": 0.868318247439268, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 18985 + }, + { + "epoch": 0.18986, + "grad_norm": 0.890714238448063, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 18986 + }, + { + "epoch": 0.18987, + "grad_norm": 0.9555878528790877, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 18987 + }, + { + "epoch": 0.18988, + "grad_norm": 1.0551855156304657, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 18988 + }, + { + "epoch": 0.18989, + "grad_norm": 1.12462282757074, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 18989 + }, + { + "epoch": 0.1899, + "grad_norm": 1.038697462277725, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 18990 + }, + { + "epoch": 0.18991, + "grad_norm": 1.1496875897116776, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 18991 + }, + { + "epoch": 0.18992, + "grad_norm": 1.1227898373814509, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 18992 + }, + { + "epoch": 0.18993, + "grad_norm": 0.7791332709741187, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 18993 + }, + { + "epoch": 0.18994, + "grad_norm": 0.5527281179829645, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 18994 + }, + { + "epoch": 0.18995, + "grad_norm": 0.604223751927832, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 18995 + }, + { + "epoch": 0.18996, + "grad_norm": 0.6747532465704158, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 18996 + }, + { + "epoch": 0.18997, + "grad_norm": 0.7207582894714029, + "learning_rate": 0.003, + "loss": 4.068, + "step": 18997 + }, + { + "epoch": 0.18998, + "grad_norm": 0.7449484313550031, + "learning_rate": 0.003, + "loss": 4.065, + "step": 18998 + }, + { + "epoch": 0.18999, + "grad_norm": 0.8533927002613969, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 18999 + }, + { + "epoch": 0.19, + "grad_norm": 0.9778404124788905, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 19000 + }, + { + "epoch": 0.19001, + "grad_norm": 1.1364391339030848, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 19001 + }, + { + "epoch": 0.19002, + "grad_norm": 0.6695573143213839, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 19002 + }, + { + "epoch": 0.19003, + "grad_norm": 0.6048424395466192, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 19003 + }, + { + "epoch": 0.19004, + "grad_norm": 0.6115529014494826, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 19004 + }, + { + "epoch": 0.19005, + "grad_norm": 0.5770869656635091, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 19005 + }, + { + "epoch": 0.19006, + "grad_norm": 0.5873184325931075, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 19006 + }, + { + "epoch": 0.19007, + "grad_norm": 0.6284217240533744, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 19007 + }, + { + "epoch": 0.19008, + "grad_norm": 0.6135687901151156, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 19008 + }, + { + "epoch": 0.19009, + "grad_norm": 0.6565765096883247, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 19009 + }, + { + "epoch": 0.1901, + "grad_norm": 0.7769475849315273, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 19010 + }, + { + "epoch": 0.19011, + "grad_norm": 0.8040936368011172, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 19011 + }, + { + "epoch": 0.19012, + "grad_norm": 0.8936636379105999, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 19012 + }, + { + "epoch": 0.19013, + "grad_norm": 0.9324952613877908, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 19013 + }, + { + "epoch": 0.19014, + "grad_norm": 1.13754471117426, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 19014 + }, + { + "epoch": 0.19015, + "grad_norm": 1.1953709908335939, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 19015 + }, + { + "epoch": 0.19016, + "grad_norm": 0.8246736475318042, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 19016 + }, + { + "epoch": 0.19017, + "grad_norm": 0.777573909964225, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 19017 + }, + { + "epoch": 0.19018, + "grad_norm": 0.9216623413003302, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 19018 + }, + { + "epoch": 0.19019, + "grad_norm": 1.0944538370069077, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 19019 + }, + { + "epoch": 0.1902, + "grad_norm": 0.9709440838124774, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 19020 + }, + { + "epoch": 0.19021, + "grad_norm": 0.9160250481035979, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 19021 + }, + { + "epoch": 0.19022, + "grad_norm": 0.9012521941774062, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 19022 + }, + { + "epoch": 0.19023, + "grad_norm": 0.8770374884453678, + "learning_rate": 0.003, + "loss": 4.106, + "step": 19023 + }, + { + "epoch": 0.19024, + "grad_norm": 0.9632582548635487, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 19024 + }, + { + "epoch": 0.19025, + "grad_norm": 0.9475115024002957, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 19025 + }, + { + "epoch": 0.19026, + "grad_norm": 1.0989703319915722, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 19026 + }, + { + "epoch": 0.19027, + "grad_norm": 1.1497766262185596, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 19027 + }, + { + "epoch": 0.19028, + "grad_norm": 0.9571602279212525, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 19028 + }, + { + "epoch": 0.19029, + "grad_norm": 1.0474889964290843, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 19029 + }, + { + "epoch": 0.1903, + "grad_norm": 1.0323725074430252, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 19030 + }, + { + "epoch": 0.19031, + "grad_norm": 0.8808059545512241, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19031 + }, + { + "epoch": 0.19032, + "grad_norm": 0.7104999259230595, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 19032 + }, + { + "epoch": 0.19033, + "grad_norm": 0.7214429505155346, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 19033 + }, + { + "epoch": 0.19034, + "grad_norm": 0.8412580038537382, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19034 + }, + { + "epoch": 0.19035, + "grad_norm": 1.0452842487812224, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 19035 + }, + { + "epoch": 0.19036, + "grad_norm": 1.079435052990799, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 19036 + }, + { + "epoch": 0.19037, + "grad_norm": 0.9397425655569261, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 19037 + }, + { + "epoch": 0.19038, + "grad_norm": 0.9761474114736021, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 19038 + }, + { + "epoch": 0.19039, + "grad_norm": 0.8420411401715001, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 19039 + }, + { + "epoch": 0.1904, + "grad_norm": 0.77521773509717, + "learning_rate": 0.003, + "loss": 4.036, + "step": 19040 + }, + { + "epoch": 0.19041, + "grad_norm": 0.8430973686243493, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 19041 + }, + { + "epoch": 0.19042, + "grad_norm": 1.0806468455160583, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 19042 + }, + { + "epoch": 0.19043, + "grad_norm": 1.123342672423004, + "learning_rate": 0.003, + "loss": 4.082, + "step": 19043 + }, + { + "epoch": 0.19044, + "grad_norm": 0.7778214511480973, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 19044 + }, + { + "epoch": 0.19045, + "grad_norm": 0.7600183814017205, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 19045 + }, + { + "epoch": 0.19046, + "grad_norm": 0.7241825916248691, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 19046 + }, + { + "epoch": 0.19047, + "grad_norm": 0.8160897029473771, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 19047 + }, + { + "epoch": 0.19048, + "grad_norm": 0.8226404474133778, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 19048 + }, + { + "epoch": 0.19049, + "grad_norm": 0.8002548929155533, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 19049 + }, + { + "epoch": 0.1905, + "grad_norm": 0.8369692255230874, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 19050 + }, + { + "epoch": 0.19051, + "grad_norm": 0.9504811808648432, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19051 + }, + { + "epoch": 0.19052, + "grad_norm": 0.9110330922214464, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 19052 + }, + { + "epoch": 0.19053, + "grad_norm": 1.0333947672152355, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 19053 + }, + { + "epoch": 0.19054, + "grad_norm": 1.0502654522138795, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 19054 + }, + { + "epoch": 0.19055, + "grad_norm": 0.8806951048462609, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 19055 + }, + { + "epoch": 0.19056, + "grad_norm": 0.7160675565186634, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 19056 + }, + { + "epoch": 0.19057, + "grad_norm": 0.800471976042363, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19057 + }, + { + "epoch": 0.19058, + "grad_norm": 0.9237572272122008, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 19058 + }, + { + "epoch": 0.19059, + "grad_norm": 1.0207160767994679, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 19059 + }, + { + "epoch": 0.1906, + "grad_norm": 1.0155380104867275, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 19060 + }, + { + "epoch": 0.19061, + "grad_norm": 0.9719302098268005, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 19061 + }, + { + "epoch": 0.19062, + "grad_norm": 0.8827863254829319, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 19062 + }, + { + "epoch": 0.19063, + "grad_norm": 0.735191162169673, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 19063 + }, + { + "epoch": 0.19064, + "grad_norm": 0.8052584913638133, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19064 + }, + { + "epoch": 0.19065, + "grad_norm": 0.8423568825317832, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 19065 + }, + { + "epoch": 0.19066, + "grad_norm": 0.9768182473156256, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19066 + }, + { + "epoch": 0.19067, + "grad_norm": 1.1381319150884115, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 19067 + }, + { + "epoch": 0.19068, + "grad_norm": 0.9586070130478144, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 19068 + }, + { + "epoch": 0.19069, + "grad_norm": 1.1867842705878802, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 19069 + }, + { + "epoch": 0.1907, + "grad_norm": 0.8853121649755085, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 19070 + }, + { + "epoch": 0.19071, + "grad_norm": 0.8364749876177177, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 19071 + }, + { + "epoch": 0.19072, + "grad_norm": 0.7779189562093047, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19072 + }, + { + "epoch": 0.19073, + "grad_norm": 0.7766213145962902, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 19073 + }, + { + "epoch": 0.19074, + "grad_norm": 0.776139594455867, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 19074 + }, + { + "epoch": 0.19075, + "grad_norm": 0.8693476495515625, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 19075 + }, + { + "epoch": 0.19076, + "grad_norm": 0.9944336576999422, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 19076 + }, + { + "epoch": 0.19077, + "grad_norm": 1.0381320802359264, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 19077 + }, + { + "epoch": 0.19078, + "grad_norm": 0.9919237266469918, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19078 + }, + { + "epoch": 0.19079, + "grad_norm": 1.0211300167597395, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 19079 + }, + { + "epoch": 0.1908, + "grad_norm": 0.8948322404700847, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 19080 + }, + { + "epoch": 0.19081, + "grad_norm": 0.801754753027131, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 19081 + }, + { + "epoch": 0.19082, + "grad_norm": 0.7455925499217908, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19082 + }, + { + "epoch": 0.19083, + "grad_norm": 0.7911114663148194, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 19083 + }, + { + "epoch": 0.19084, + "grad_norm": 0.8781286305127428, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 19084 + }, + { + "epoch": 0.19085, + "grad_norm": 0.9910629670424296, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 19085 + }, + { + "epoch": 0.19086, + "grad_norm": 1.1617636532351778, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 19086 + }, + { + "epoch": 0.19087, + "grad_norm": 0.9859533117451245, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19087 + }, + { + "epoch": 0.19088, + "grad_norm": 0.9392208421502648, + "learning_rate": 0.003, + "loss": 4.094, + "step": 19088 + }, + { + "epoch": 0.19089, + "grad_norm": 0.9997116388962914, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 19089 + }, + { + "epoch": 0.1909, + "grad_norm": 1.2245682599145473, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19090 + }, + { + "epoch": 0.19091, + "grad_norm": 0.8624504485237479, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 19091 + }, + { + "epoch": 0.19092, + "grad_norm": 0.7901900475596503, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 19092 + }, + { + "epoch": 0.19093, + "grad_norm": 0.7944658431544632, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19093 + }, + { + "epoch": 0.19094, + "grad_norm": 0.612304868461562, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 19094 + }, + { + "epoch": 0.19095, + "grad_norm": 0.6407830815765895, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 19095 + }, + { + "epoch": 0.19096, + "grad_norm": 0.5426360117642127, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 19096 + }, + { + "epoch": 0.19097, + "grad_norm": 0.5338474522388726, + "learning_rate": 0.003, + "loss": 4.065, + "step": 19097 + }, + { + "epoch": 0.19098, + "grad_norm": 0.5101727978090598, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 19098 + }, + { + "epoch": 0.19099, + "grad_norm": 0.5955354350848842, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19099 + }, + { + "epoch": 0.191, + "grad_norm": 0.8188553946198028, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 19100 + }, + { + "epoch": 0.19101, + "grad_norm": 1.0956303663338571, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 19101 + }, + { + "epoch": 0.19102, + "grad_norm": 1.2288352687088009, + "learning_rate": 0.003, + "loss": 4.046, + "step": 19102 + }, + { + "epoch": 0.19103, + "grad_norm": 0.6996486707725332, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 19103 + }, + { + "epoch": 0.19104, + "grad_norm": 0.6681169360270387, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19104 + }, + { + "epoch": 0.19105, + "grad_norm": 0.6994247802933407, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 19105 + }, + { + "epoch": 0.19106, + "grad_norm": 0.7142269955146352, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 19106 + }, + { + "epoch": 0.19107, + "grad_norm": 0.6740016205411797, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 19107 + }, + { + "epoch": 0.19108, + "grad_norm": 0.7858392813953062, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19108 + }, + { + "epoch": 0.19109, + "grad_norm": 0.9914323241276799, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 19109 + }, + { + "epoch": 0.1911, + "grad_norm": 1.201935745501774, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 19110 + }, + { + "epoch": 0.19111, + "grad_norm": 0.9219062035845695, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 19111 + }, + { + "epoch": 0.19112, + "grad_norm": 0.8901108010065315, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19112 + }, + { + "epoch": 0.19113, + "grad_norm": 1.0587914695573095, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 19113 + }, + { + "epoch": 0.19114, + "grad_norm": 1.044476999135324, + "learning_rate": 0.003, + "loss": 4.04, + "step": 19114 + }, + { + "epoch": 0.19115, + "grad_norm": 0.9053654633231307, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19115 + }, + { + "epoch": 0.19116, + "grad_norm": 0.8977068127848019, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 19116 + }, + { + "epoch": 0.19117, + "grad_norm": 0.8813162473390667, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 19117 + }, + { + "epoch": 0.19118, + "grad_norm": 0.8408219122726204, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 19118 + }, + { + "epoch": 0.19119, + "grad_norm": 0.7658415895624578, + "learning_rate": 0.003, + "loss": 4.0998, + "step": 19119 + }, + { + "epoch": 0.1912, + "grad_norm": 0.8065178753069221, + "learning_rate": 0.003, + "loss": 4.075, + "step": 19120 + }, + { + "epoch": 0.19121, + "grad_norm": 0.8650369341587718, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 19121 + }, + { + "epoch": 0.19122, + "grad_norm": 0.8291494114776464, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 19122 + }, + { + "epoch": 0.19123, + "grad_norm": 0.8914803818441838, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 19123 + }, + { + "epoch": 0.19124, + "grad_norm": 1.102372637490019, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 19124 + }, + { + "epoch": 0.19125, + "grad_norm": 1.1368227004790603, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 19125 + }, + { + "epoch": 0.19126, + "grad_norm": 0.7957796561014767, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 19126 + }, + { + "epoch": 0.19127, + "grad_norm": 0.6904629736784584, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 19127 + }, + { + "epoch": 0.19128, + "grad_norm": 0.7618235887730825, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19128 + }, + { + "epoch": 0.19129, + "grad_norm": 0.8652592177286308, + "learning_rate": 0.003, + "loss": 4.066, + "step": 19129 + }, + { + "epoch": 0.1913, + "grad_norm": 1.00902247292995, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 19130 + }, + { + "epoch": 0.19131, + "grad_norm": 1.1941754642577591, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19131 + }, + { + "epoch": 0.19132, + "grad_norm": 1.0522844401627696, + "learning_rate": 0.003, + "loss": 4.061, + "step": 19132 + }, + { + "epoch": 0.19133, + "grad_norm": 1.1078820619546768, + "learning_rate": 0.003, + "loss": 4.039, + "step": 19133 + }, + { + "epoch": 0.19134, + "grad_norm": 0.8916828609188207, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 19134 + }, + { + "epoch": 0.19135, + "grad_norm": 0.7758340235402817, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 19135 + }, + { + "epoch": 0.19136, + "grad_norm": 0.758903533761499, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 19136 + }, + { + "epoch": 0.19137, + "grad_norm": 0.7985332910244933, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 19137 + }, + { + "epoch": 0.19138, + "grad_norm": 0.8872785736897502, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 19138 + }, + { + "epoch": 0.19139, + "grad_norm": 1.1690459392571224, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 19139 + }, + { + "epoch": 0.1914, + "grad_norm": 1.0149515533241773, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 19140 + }, + { + "epoch": 0.19141, + "grad_norm": 0.9234684303330382, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 19141 + }, + { + "epoch": 0.19142, + "grad_norm": 0.9974718989558207, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19142 + }, + { + "epoch": 0.19143, + "grad_norm": 1.1275730156046364, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 19143 + }, + { + "epoch": 0.19144, + "grad_norm": 1.043525932775733, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19144 + }, + { + "epoch": 0.19145, + "grad_norm": 0.9635472066146746, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 19145 + }, + { + "epoch": 0.19146, + "grad_norm": 0.9366549747109736, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 19146 + }, + { + "epoch": 0.19147, + "grad_norm": 0.9556797261385593, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 19147 + }, + { + "epoch": 0.19148, + "grad_norm": 0.9968319204491339, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 19148 + }, + { + "epoch": 0.19149, + "grad_norm": 0.9789725136212473, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 19149 + }, + { + "epoch": 0.1915, + "grad_norm": 0.921798913910574, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 19150 + }, + { + "epoch": 0.19151, + "grad_norm": 0.8979370775907503, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 19151 + }, + { + "epoch": 0.19152, + "grad_norm": 0.9881899412066255, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 19152 + }, + { + "epoch": 0.19153, + "grad_norm": 1.1728827136099387, + "learning_rate": 0.003, + "loss": 4.078, + "step": 19153 + }, + { + "epoch": 0.19154, + "grad_norm": 0.8900696701292256, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 19154 + }, + { + "epoch": 0.19155, + "grad_norm": 0.7598033224415386, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 19155 + }, + { + "epoch": 0.19156, + "grad_norm": 0.6770261811478786, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 19156 + }, + { + "epoch": 0.19157, + "grad_norm": 0.6570370501712361, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 19157 + }, + { + "epoch": 0.19158, + "grad_norm": 0.6639133765216619, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 19158 + }, + { + "epoch": 0.19159, + "grad_norm": 0.6723059500159161, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 19159 + }, + { + "epoch": 0.1916, + "grad_norm": 0.8238752444239678, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 19160 + }, + { + "epoch": 0.19161, + "grad_norm": 1.0240555657257644, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 19161 + }, + { + "epoch": 0.19162, + "grad_norm": 1.1364190715089406, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 19162 + }, + { + "epoch": 0.19163, + "grad_norm": 1.0650643978733716, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 19163 + }, + { + "epoch": 0.19164, + "grad_norm": 0.924337511105464, + "learning_rate": 0.003, + "loss": 4.1097, + "step": 19164 + }, + { + "epoch": 0.19165, + "grad_norm": 0.7955589991008061, + "learning_rate": 0.003, + "loss": 4.043, + "step": 19165 + }, + { + "epoch": 0.19166, + "grad_norm": 0.778886009131878, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 19166 + }, + { + "epoch": 0.19167, + "grad_norm": 0.7292515234082881, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 19167 + }, + { + "epoch": 0.19168, + "grad_norm": 0.6684820134081613, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 19168 + }, + { + "epoch": 0.19169, + "grad_norm": 0.5816988168759027, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 19169 + }, + { + "epoch": 0.1917, + "grad_norm": 0.5620713084851875, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 19170 + }, + { + "epoch": 0.19171, + "grad_norm": 0.5974182691705774, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 19171 + }, + { + "epoch": 0.19172, + "grad_norm": 0.6767206688909244, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 19172 + }, + { + "epoch": 0.19173, + "grad_norm": 0.7746260951447896, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 19173 + }, + { + "epoch": 0.19174, + "grad_norm": 0.7642247781590363, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 19174 + }, + { + "epoch": 0.19175, + "grad_norm": 0.8457011397987604, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 19175 + }, + { + "epoch": 0.19176, + "grad_norm": 0.916909573871974, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 19176 + }, + { + "epoch": 0.19177, + "grad_norm": 0.9492617484767752, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 19177 + }, + { + "epoch": 0.19178, + "grad_norm": 1.0149750443993122, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 19178 + }, + { + "epoch": 0.19179, + "grad_norm": 1.0933227967891406, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 19179 + }, + { + "epoch": 0.1918, + "grad_norm": 0.9532154672568713, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 19180 + }, + { + "epoch": 0.19181, + "grad_norm": 1.4331795061474801, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 19181 + }, + { + "epoch": 0.19182, + "grad_norm": 0.8201519184611736, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19182 + }, + { + "epoch": 0.19183, + "grad_norm": 0.9013677698648515, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19183 + }, + { + "epoch": 0.19184, + "grad_norm": 0.943429964425819, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 19184 + }, + { + "epoch": 0.19185, + "grad_norm": 1.0073760741135105, + "learning_rate": 0.003, + "loss": 4.06, + "step": 19185 + }, + { + "epoch": 0.19186, + "grad_norm": 1.0929939784382448, + "learning_rate": 0.003, + "loss": 4.046, + "step": 19186 + }, + { + "epoch": 0.19187, + "grad_norm": 1.2413339685325235, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 19187 + }, + { + "epoch": 0.19188, + "grad_norm": 1.2020388720618786, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 19188 + }, + { + "epoch": 0.19189, + "grad_norm": 0.9481795929159604, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 19189 + }, + { + "epoch": 0.1919, + "grad_norm": 0.9919308863807638, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 19190 + }, + { + "epoch": 0.19191, + "grad_norm": 0.998820204590595, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19191 + }, + { + "epoch": 0.19192, + "grad_norm": 1.0408854899950093, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 19192 + }, + { + "epoch": 0.19193, + "grad_norm": 1.04311058320516, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 19193 + }, + { + "epoch": 0.19194, + "grad_norm": 1.1941497721833318, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 19194 + }, + { + "epoch": 0.19195, + "grad_norm": 0.920139316469817, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19195 + }, + { + "epoch": 0.19196, + "grad_norm": 0.9108519081658464, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 19196 + }, + { + "epoch": 0.19197, + "grad_norm": 0.9223702416502508, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 19197 + }, + { + "epoch": 0.19198, + "grad_norm": 0.8509627782828049, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 19198 + }, + { + "epoch": 0.19199, + "grad_norm": 0.7289220032264974, + "learning_rate": 0.003, + "loss": 4.088, + "step": 19199 + }, + { + "epoch": 0.192, + "grad_norm": 0.7224358854503841, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 19200 + }, + { + "epoch": 0.19201, + "grad_norm": 0.7544214058537576, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 19201 + }, + { + "epoch": 0.19202, + "grad_norm": 0.8092066777788773, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 19202 + }, + { + "epoch": 0.19203, + "grad_norm": 0.9368709707212719, + "learning_rate": 0.003, + "loss": 4.039, + "step": 19203 + }, + { + "epoch": 0.19204, + "grad_norm": 1.0931590885061948, + "learning_rate": 0.003, + "loss": 4.054, + "step": 19204 + }, + { + "epoch": 0.19205, + "grad_norm": 0.8770844795871859, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 19205 + }, + { + "epoch": 0.19206, + "grad_norm": 0.8306421817763099, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 19206 + }, + { + "epoch": 0.19207, + "grad_norm": 0.878648736336449, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19207 + }, + { + "epoch": 0.19208, + "grad_norm": 0.8875133742494693, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 19208 + }, + { + "epoch": 0.19209, + "grad_norm": 0.793238499807842, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 19209 + }, + { + "epoch": 0.1921, + "grad_norm": 0.7757701288960257, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 19210 + }, + { + "epoch": 0.19211, + "grad_norm": 0.7824645907529354, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 19211 + }, + { + "epoch": 0.19212, + "grad_norm": 0.8177183774141822, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 19212 + }, + { + "epoch": 0.19213, + "grad_norm": 0.929446946537238, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 19213 + }, + { + "epoch": 0.19214, + "grad_norm": 1.0867526246727846, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 19214 + }, + { + "epoch": 0.19215, + "grad_norm": 0.9013704811261157, + "learning_rate": 0.003, + "loss": 4.085, + "step": 19215 + }, + { + "epoch": 0.19216, + "grad_norm": 0.7802080780659087, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19216 + }, + { + "epoch": 0.19217, + "grad_norm": 0.7052820494757375, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 19217 + }, + { + "epoch": 0.19218, + "grad_norm": 0.6840988159643647, + "learning_rate": 0.003, + "loss": 4.1008, + "step": 19218 + }, + { + "epoch": 0.19219, + "grad_norm": 0.6891256990062292, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 19219 + }, + { + "epoch": 0.1922, + "grad_norm": 0.6592335983566835, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 19220 + }, + { + "epoch": 0.19221, + "grad_norm": 0.7092450067835178, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 19221 + }, + { + "epoch": 0.19222, + "grad_norm": 0.7737560399743477, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 19222 + }, + { + "epoch": 0.19223, + "grad_norm": 1.0353930285151072, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 19223 + }, + { + "epoch": 0.19224, + "grad_norm": 1.160953135931673, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 19224 + }, + { + "epoch": 0.19225, + "grad_norm": 1.0047042315574906, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 19225 + }, + { + "epoch": 0.19226, + "grad_norm": 1.094540519862959, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 19226 + }, + { + "epoch": 0.19227, + "grad_norm": 0.870391283859072, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 19227 + }, + { + "epoch": 0.19228, + "grad_norm": 0.8355690166155801, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 19228 + }, + { + "epoch": 0.19229, + "grad_norm": 0.8620252155285648, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 19229 + }, + { + "epoch": 0.1923, + "grad_norm": 0.9896399499042035, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 19230 + }, + { + "epoch": 0.19231, + "grad_norm": 0.9658354004622579, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 19231 + }, + { + "epoch": 0.19232, + "grad_norm": 0.9207134572200805, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 19232 + }, + { + "epoch": 0.19233, + "grad_norm": 0.9680384497789604, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 19233 + }, + { + "epoch": 0.19234, + "grad_norm": 0.979017491808022, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 19234 + }, + { + "epoch": 0.19235, + "grad_norm": 0.9468907270544002, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 19235 + }, + { + "epoch": 0.19236, + "grad_norm": 0.8974984596975909, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19236 + }, + { + "epoch": 0.19237, + "grad_norm": 0.9601350008999757, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 19237 + }, + { + "epoch": 0.19238, + "grad_norm": 0.9336776046445143, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 19238 + }, + { + "epoch": 0.19239, + "grad_norm": 1.0093740356448468, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 19239 + }, + { + "epoch": 0.1924, + "grad_norm": 1.2536772971899799, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 19240 + }, + { + "epoch": 0.19241, + "grad_norm": 1.0182498434339136, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 19241 + }, + { + "epoch": 0.19242, + "grad_norm": 1.0837510483224895, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 19242 + }, + { + "epoch": 0.19243, + "grad_norm": 0.9225747469867477, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 19243 + }, + { + "epoch": 0.19244, + "grad_norm": 0.7844361174545726, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 19244 + }, + { + "epoch": 0.19245, + "grad_norm": 0.7574314633985662, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 19245 + }, + { + "epoch": 0.19246, + "grad_norm": 0.8707291741019877, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 19246 + }, + { + "epoch": 0.19247, + "grad_norm": 1.09006320519354, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19247 + }, + { + "epoch": 0.19248, + "grad_norm": 1.0023955625391028, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 19248 + }, + { + "epoch": 0.19249, + "grad_norm": 0.9469880272587307, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 19249 + }, + { + "epoch": 0.1925, + "grad_norm": 0.9076851822165349, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 19250 + }, + { + "epoch": 0.19251, + "grad_norm": 0.9804263974244334, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 19251 + }, + { + "epoch": 0.19252, + "grad_norm": 1.0621242569370077, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19252 + }, + { + "epoch": 0.19253, + "grad_norm": 0.8462923391150671, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 19253 + }, + { + "epoch": 0.19254, + "grad_norm": 0.9115455068799846, + "learning_rate": 0.003, + "loss": 4.1128, + "step": 19254 + }, + { + "epoch": 0.19255, + "grad_norm": 1.0029584191222796, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 19255 + }, + { + "epoch": 0.19256, + "grad_norm": 0.9674100852940246, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 19256 + }, + { + "epoch": 0.19257, + "grad_norm": 0.8292408728437941, + "learning_rate": 0.003, + "loss": 4.036, + "step": 19257 + }, + { + "epoch": 0.19258, + "grad_norm": 1.009116301371756, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19258 + }, + { + "epoch": 0.19259, + "grad_norm": 1.1966204992888037, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 19259 + }, + { + "epoch": 0.1926, + "grad_norm": 0.7878949451380268, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 19260 + }, + { + "epoch": 0.19261, + "grad_norm": 0.7808985411678049, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 19261 + }, + { + "epoch": 0.19262, + "grad_norm": 0.8547357589002595, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 19262 + }, + { + "epoch": 0.19263, + "grad_norm": 0.718288020292227, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19263 + }, + { + "epoch": 0.19264, + "grad_norm": 0.6304679970206166, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 19264 + }, + { + "epoch": 0.19265, + "grad_norm": 0.724331469536231, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 19265 + }, + { + "epoch": 0.19266, + "grad_norm": 0.815068469050919, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 19266 + }, + { + "epoch": 0.19267, + "grad_norm": 0.8514287596550341, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 19267 + }, + { + "epoch": 0.19268, + "grad_norm": 0.9774310685272432, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 19268 + }, + { + "epoch": 0.19269, + "grad_norm": 1.0914727380265792, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 19269 + }, + { + "epoch": 0.1927, + "grad_norm": 0.9712674589547434, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19270 + }, + { + "epoch": 0.19271, + "grad_norm": 1.0115780669642893, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 19271 + }, + { + "epoch": 0.19272, + "grad_norm": 0.8872210112799476, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 19272 + }, + { + "epoch": 0.19273, + "grad_norm": 0.7578652090507335, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 19273 + }, + { + "epoch": 0.19274, + "grad_norm": 0.759825135477429, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 19274 + }, + { + "epoch": 0.19275, + "grad_norm": 0.7997469392525652, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 19275 + }, + { + "epoch": 0.19276, + "grad_norm": 0.7291875124795859, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19276 + }, + { + "epoch": 0.19277, + "grad_norm": 0.732910639686414, + "learning_rate": 0.003, + "loss": 4.093, + "step": 19277 + }, + { + "epoch": 0.19278, + "grad_norm": 0.7504022956990735, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 19278 + }, + { + "epoch": 0.19279, + "grad_norm": 0.7145183158646564, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 19279 + }, + { + "epoch": 0.1928, + "grad_norm": 0.6809551669791736, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 19280 + }, + { + "epoch": 0.19281, + "grad_norm": 0.7163662690691528, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 19281 + }, + { + "epoch": 0.19282, + "grad_norm": 0.7850505360953289, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 19282 + }, + { + "epoch": 0.19283, + "grad_norm": 0.9407779214143366, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 19283 + }, + { + "epoch": 0.19284, + "grad_norm": 1.4375717729943787, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 19284 + }, + { + "epoch": 0.19285, + "grad_norm": 0.684169909298226, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 19285 + }, + { + "epoch": 0.19286, + "grad_norm": 0.8695288130362363, + "learning_rate": 0.003, + "loss": 4.025, + "step": 19286 + }, + { + "epoch": 0.19287, + "grad_norm": 1.1242503477218868, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 19287 + }, + { + "epoch": 0.19288, + "grad_norm": 0.8466664857908782, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 19288 + }, + { + "epoch": 0.19289, + "grad_norm": 0.7283764504237402, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 19289 + }, + { + "epoch": 0.1929, + "grad_norm": 0.7640308214485745, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 19290 + }, + { + "epoch": 0.19291, + "grad_norm": 0.76545557633287, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 19291 + }, + { + "epoch": 0.19292, + "grad_norm": 0.8660208301542295, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19292 + }, + { + "epoch": 0.19293, + "grad_norm": 1.006380575264228, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 19293 + }, + { + "epoch": 0.19294, + "grad_norm": 1.0174367698461975, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 19294 + }, + { + "epoch": 0.19295, + "grad_norm": 0.9156945148357316, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 19295 + }, + { + "epoch": 0.19296, + "grad_norm": 0.8934904545697673, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 19296 + }, + { + "epoch": 0.19297, + "grad_norm": 0.8166606962145884, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 19297 + }, + { + "epoch": 0.19298, + "grad_norm": 0.8204643793301916, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 19298 + }, + { + "epoch": 0.19299, + "grad_norm": 0.7884797328812746, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 19299 + }, + { + "epoch": 0.193, + "grad_norm": 0.8848245411938087, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 19300 + }, + { + "epoch": 0.19301, + "grad_norm": 0.9491213559661399, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 19301 + }, + { + "epoch": 0.19302, + "grad_norm": 1.201532040807617, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 19302 + }, + { + "epoch": 0.19303, + "grad_norm": 0.8820087392952594, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 19303 + }, + { + "epoch": 0.19304, + "grad_norm": 0.9067089569250883, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 19304 + }, + { + "epoch": 0.19305, + "grad_norm": 1.0074690436179412, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 19305 + }, + { + "epoch": 0.19306, + "grad_norm": 0.9882078433848459, + "learning_rate": 0.003, + "loss": 4.069, + "step": 19306 + }, + { + "epoch": 0.19307, + "grad_norm": 1.0188381908844888, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 19307 + }, + { + "epoch": 0.19308, + "grad_norm": 0.9821704739833307, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 19308 + }, + { + "epoch": 0.19309, + "grad_norm": 0.942424683799336, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 19309 + }, + { + "epoch": 0.1931, + "grad_norm": 0.9334417165161468, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 19310 + }, + { + "epoch": 0.19311, + "grad_norm": 1.0651321222090708, + "learning_rate": 0.003, + "loss": 4.068, + "step": 19311 + }, + { + "epoch": 0.19312, + "grad_norm": 1.2103711850247247, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19312 + }, + { + "epoch": 0.19313, + "grad_norm": 0.8972642524800851, + "learning_rate": 0.003, + "loss": 4.1, + "step": 19313 + }, + { + "epoch": 0.19314, + "grad_norm": 0.8036964751306692, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 19314 + }, + { + "epoch": 0.19315, + "grad_norm": 0.726795320706453, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 19315 + }, + { + "epoch": 0.19316, + "grad_norm": 0.6811792939150739, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 19316 + }, + { + "epoch": 0.19317, + "grad_norm": 0.7024349095635795, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 19317 + }, + { + "epoch": 0.19318, + "grad_norm": 0.7572675667273833, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 19318 + }, + { + "epoch": 0.19319, + "grad_norm": 0.8285780744679329, + "learning_rate": 0.003, + "loss": 4.054, + "step": 19319 + }, + { + "epoch": 0.1932, + "grad_norm": 1.12929868918835, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19320 + }, + { + "epoch": 0.19321, + "grad_norm": 0.9536326676893999, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 19321 + }, + { + "epoch": 0.19322, + "grad_norm": 0.9668487379190761, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 19322 + }, + { + "epoch": 0.19323, + "grad_norm": 0.9753566245316622, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 19323 + }, + { + "epoch": 0.19324, + "grad_norm": 1.0201313993998238, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 19324 + }, + { + "epoch": 0.19325, + "grad_norm": 0.9187757221333906, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 19325 + }, + { + "epoch": 0.19326, + "grad_norm": 0.8503241399014173, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19326 + }, + { + "epoch": 0.19327, + "grad_norm": 0.6947144636157419, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 19327 + }, + { + "epoch": 0.19328, + "grad_norm": 0.6595103173716894, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 19328 + }, + { + "epoch": 0.19329, + "grad_norm": 0.7053577224401365, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 19329 + }, + { + "epoch": 0.1933, + "grad_norm": 0.8821891890970084, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 19330 + }, + { + "epoch": 0.19331, + "grad_norm": 0.9967821902363833, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 19331 + }, + { + "epoch": 0.19332, + "grad_norm": 1.1995869818275835, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 19332 + }, + { + "epoch": 0.19333, + "grad_norm": 0.9189640826374365, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 19333 + }, + { + "epoch": 0.19334, + "grad_norm": 1.0077012014502056, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 19334 + }, + { + "epoch": 0.19335, + "grad_norm": 0.9147865305334655, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 19335 + }, + { + "epoch": 0.19336, + "grad_norm": 0.9077212628452612, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19336 + }, + { + "epoch": 0.19337, + "grad_norm": 0.9862699628741036, + "learning_rate": 0.003, + "loss": 4.056, + "step": 19337 + }, + { + "epoch": 0.19338, + "grad_norm": 1.1451910546203383, + "learning_rate": 0.003, + "loss": 4.102, + "step": 19338 + }, + { + "epoch": 0.19339, + "grad_norm": 0.8767077068542017, + "learning_rate": 0.003, + "loss": 4.064, + "step": 19339 + }, + { + "epoch": 0.1934, + "grad_norm": 0.8457171250838031, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 19340 + }, + { + "epoch": 0.19341, + "grad_norm": 0.7592240416015384, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 19341 + }, + { + "epoch": 0.19342, + "grad_norm": 0.6983669343797456, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 19342 + }, + { + "epoch": 0.19343, + "grad_norm": 0.7785188933601789, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 19343 + }, + { + "epoch": 0.19344, + "grad_norm": 0.9046180073407409, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 19344 + }, + { + "epoch": 0.19345, + "grad_norm": 1.0366997635756199, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 19345 + }, + { + "epoch": 0.19346, + "grad_norm": 1.0162979133688614, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 19346 + }, + { + "epoch": 0.19347, + "grad_norm": 0.9060762641110706, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 19347 + }, + { + "epoch": 0.19348, + "grad_norm": 0.9442767671289194, + "learning_rate": 0.003, + "loss": 4.1066, + "step": 19348 + }, + { + "epoch": 0.19349, + "grad_norm": 0.9573196738046161, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 19349 + }, + { + "epoch": 0.1935, + "grad_norm": 1.026506296420836, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 19350 + }, + { + "epoch": 0.19351, + "grad_norm": 0.937231457026112, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 19351 + }, + { + "epoch": 0.19352, + "grad_norm": 1.0436098863612515, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 19352 + }, + { + "epoch": 0.19353, + "grad_norm": 1.1122056396611406, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 19353 + }, + { + "epoch": 0.19354, + "grad_norm": 0.7951754885001442, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 19354 + }, + { + "epoch": 0.19355, + "grad_norm": 0.731618306665379, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 19355 + }, + { + "epoch": 0.19356, + "grad_norm": 0.8169119441950538, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 19356 + }, + { + "epoch": 0.19357, + "grad_norm": 0.9951121658364063, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 19357 + }, + { + "epoch": 0.19358, + "grad_norm": 1.2601588573719498, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 19358 + }, + { + "epoch": 0.19359, + "grad_norm": 0.8177939686033251, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 19359 + }, + { + "epoch": 0.1936, + "grad_norm": 0.8505608578205448, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 19360 + }, + { + "epoch": 0.19361, + "grad_norm": 1.0998820770390672, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 19361 + }, + { + "epoch": 0.19362, + "grad_norm": 1.1157030478507055, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 19362 + }, + { + "epoch": 0.19363, + "grad_norm": 0.79935380789488, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 19363 + }, + { + "epoch": 0.19364, + "grad_norm": 0.7196371621609685, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 19364 + }, + { + "epoch": 0.19365, + "grad_norm": 0.7715266907600049, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 19365 + }, + { + "epoch": 0.19366, + "grad_norm": 0.644796637237514, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 19366 + }, + { + "epoch": 0.19367, + "grad_norm": 0.5818694184589748, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19367 + }, + { + "epoch": 0.19368, + "grad_norm": 0.5469812151673934, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 19368 + }, + { + "epoch": 0.19369, + "grad_norm": 0.5642327960132972, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 19369 + }, + { + "epoch": 0.1937, + "grad_norm": 0.5491507178522439, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 19370 + }, + { + "epoch": 0.19371, + "grad_norm": 0.6178386048368959, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 19371 + }, + { + "epoch": 0.19372, + "grad_norm": 0.6792661145423722, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 19372 + }, + { + "epoch": 0.19373, + "grad_norm": 0.7579709646077045, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 19373 + }, + { + "epoch": 0.19374, + "grad_norm": 0.8043491088295369, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 19374 + }, + { + "epoch": 0.19375, + "grad_norm": 0.9173096652133536, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19375 + }, + { + "epoch": 0.19376, + "grad_norm": 1.104770502970042, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 19376 + }, + { + "epoch": 0.19377, + "grad_norm": 0.9931044029743681, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 19377 + }, + { + "epoch": 0.19378, + "grad_norm": 1.2575693666209646, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 19378 + }, + { + "epoch": 0.19379, + "grad_norm": 0.7893045372481265, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 19379 + }, + { + "epoch": 0.1938, + "grad_norm": 0.7290089644437999, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 19380 + }, + { + "epoch": 0.19381, + "grad_norm": 0.7954150049719874, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 19381 + }, + { + "epoch": 0.19382, + "grad_norm": 0.9335877267014221, + "learning_rate": 0.003, + "loss": 4.072, + "step": 19382 + }, + { + "epoch": 0.19383, + "grad_norm": 1.1566640119912726, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 19383 + }, + { + "epoch": 0.19384, + "grad_norm": 0.9305220064139013, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 19384 + }, + { + "epoch": 0.19385, + "grad_norm": 0.9284443858567036, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19385 + }, + { + "epoch": 0.19386, + "grad_norm": 0.8698772978575846, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 19386 + }, + { + "epoch": 0.19387, + "grad_norm": 0.9039789823960264, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 19387 + }, + { + "epoch": 0.19388, + "grad_norm": 0.8938361441679321, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 19388 + }, + { + "epoch": 0.19389, + "grad_norm": 1.079251678335314, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 19389 + }, + { + "epoch": 0.1939, + "grad_norm": 0.9888540957536921, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 19390 + }, + { + "epoch": 0.19391, + "grad_norm": 0.8653988459312033, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19391 + }, + { + "epoch": 0.19392, + "grad_norm": 0.7387427348858798, + "learning_rate": 0.003, + "loss": 4.052, + "step": 19392 + }, + { + "epoch": 0.19393, + "grad_norm": 0.7195453908865173, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 19393 + }, + { + "epoch": 0.19394, + "grad_norm": 1.0132534843416474, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 19394 + }, + { + "epoch": 0.19395, + "grad_norm": 1.438402311165173, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 19395 + }, + { + "epoch": 0.19396, + "grad_norm": 0.6832755764345744, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 19396 + }, + { + "epoch": 0.19397, + "grad_norm": 0.7173595937643056, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 19397 + }, + { + "epoch": 0.19398, + "grad_norm": 0.7412058429957814, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 19398 + }, + { + "epoch": 0.19399, + "grad_norm": 0.7348205577739316, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 19399 + }, + { + "epoch": 0.194, + "grad_norm": 0.7589437145220229, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 19400 + }, + { + "epoch": 0.19401, + "grad_norm": 0.7744153690054988, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 19401 + }, + { + "epoch": 0.19402, + "grad_norm": 0.7842889187210925, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 19402 + }, + { + "epoch": 0.19403, + "grad_norm": 0.693093910166812, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 19403 + }, + { + "epoch": 0.19404, + "grad_norm": 0.6746526719944133, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 19404 + }, + { + "epoch": 0.19405, + "grad_norm": 0.6850649742412825, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 19405 + }, + { + "epoch": 0.19406, + "grad_norm": 0.8077518972655819, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 19406 + }, + { + "epoch": 0.19407, + "grad_norm": 0.9910010785169938, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 19407 + }, + { + "epoch": 0.19408, + "grad_norm": 1.2453570176589803, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 19408 + }, + { + "epoch": 0.19409, + "grad_norm": 0.8234145584135474, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 19409 + }, + { + "epoch": 0.1941, + "grad_norm": 0.8663858509496111, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 19410 + }, + { + "epoch": 0.19411, + "grad_norm": 1.0228186203156835, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 19411 + }, + { + "epoch": 0.19412, + "grad_norm": 1.184175206863987, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 19412 + }, + { + "epoch": 0.19413, + "grad_norm": 0.8128680746578928, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19413 + }, + { + "epoch": 0.19414, + "grad_norm": 0.8349677648661027, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 19414 + }, + { + "epoch": 0.19415, + "grad_norm": 0.839110250731865, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 19415 + }, + { + "epoch": 0.19416, + "grad_norm": 1.0530972382185582, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 19416 + }, + { + "epoch": 0.19417, + "grad_norm": 1.2071529196567012, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 19417 + }, + { + "epoch": 0.19418, + "grad_norm": 0.7532693429543044, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 19418 + }, + { + "epoch": 0.19419, + "grad_norm": 0.7025049287295203, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 19419 + }, + { + "epoch": 0.1942, + "grad_norm": 0.6617869994428833, + "learning_rate": 0.003, + "loss": 4.034, + "step": 19420 + }, + { + "epoch": 0.19421, + "grad_norm": 0.7107982239512384, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 19421 + }, + { + "epoch": 0.19422, + "grad_norm": 0.7731539221854351, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19422 + }, + { + "epoch": 0.19423, + "grad_norm": 0.7954650930536287, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 19423 + }, + { + "epoch": 0.19424, + "grad_norm": 0.744812551693108, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 19424 + }, + { + "epoch": 0.19425, + "grad_norm": 0.7845461767458478, + "learning_rate": 0.003, + "loss": 4.046, + "step": 19425 + }, + { + "epoch": 0.19426, + "grad_norm": 0.7963268991869785, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 19426 + }, + { + "epoch": 0.19427, + "grad_norm": 0.9046663135799429, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 19427 + }, + { + "epoch": 0.19428, + "grad_norm": 1.1986557570670113, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 19428 + }, + { + "epoch": 0.19429, + "grad_norm": 0.8848341229192824, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 19429 + }, + { + "epoch": 0.1943, + "grad_norm": 0.7520684600772797, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 19430 + }, + { + "epoch": 0.19431, + "grad_norm": 0.7599682318677224, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 19431 + }, + { + "epoch": 0.19432, + "grad_norm": 0.8557189101886336, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 19432 + }, + { + "epoch": 0.19433, + "grad_norm": 0.9104566496816495, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 19433 + }, + { + "epoch": 0.19434, + "grad_norm": 1.0351919580811069, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 19434 + }, + { + "epoch": 0.19435, + "grad_norm": 1.240388030759488, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 19435 + }, + { + "epoch": 0.19436, + "grad_norm": 1.1308522532143102, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 19436 + }, + { + "epoch": 0.19437, + "grad_norm": 1.106405125898721, + "learning_rate": 0.003, + "loss": 4.046, + "step": 19437 + }, + { + "epoch": 0.19438, + "grad_norm": 0.9453742851113698, + "learning_rate": 0.003, + "loss": 4.079, + "step": 19438 + }, + { + "epoch": 0.19439, + "grad_norm": 0.8606477423158553, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 19439 + }, + { + "epoch": 0.1944, + "grad_norm": 0.8429940189120442, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 19440 + }, + { + "epoch": 0.19441, + "grad_norm": 0.7464507941703405, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 19441 + }, + { + "epoch": 0.19442, + "grad_norm": 0.7801455651340992, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 19442 + }, + { + "epoch": 0.19443, + "grad_norm": 0.792522750942894, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 19443 + }, + { + "epoch": 0.19444, + "grad_norm": 0.788629505743987, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 19444 + }, + { + "epoch": 0.19445, + "grad_norm": 0.8301473853232875, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 19445 + }, + { + "epoch": 0.19446, + "grad_norm": 0.8608611945764904, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 19446 + }, + { + "epoch": 0.19447, + "grad_norm": 0.9974219292655547, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 19447 + }, + { + "epoch": 0.19448, + "grad_norm": 1.318589946379268, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19448 + }, + { + "epoch": 0.19449, + "grad_norm": 0.7471531950751678, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 19449 + }, + { + "epoch": 0.1945, + "grad_norm": 0.8234966459974362, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 19450 + }, + { + "epoch": 0.19451, + "grad_norm": 0.8365819171105646, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19451 + }, + { + "epoch": 0.19452, + "grad_norm": 0.9296322955615434, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 19452 + }, + { + "epoch": 0.19453, + "grad_norm": 1.0848704779287814, + "learning_rate": 0.003, + "loss": 4.0912, + "step": 19453 + }, + { + "epoch": 0.19454, + "grad_norm": 0.8588849849266655, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 19454 + }, + { + "epoch": 0.19455, + "grad_norm": 0.8562132116638607, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19455 + }, + { + "epoch": 0.19456, + "grad_norm": 0.8576836587299588, + "learning_rate": 0.003, + "loss": 4.0946, + "step": 19456 + }, + { + "epoch": 0.19457, + "grad_norm": 1.0178256252910618, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 19457 + }, + { + "epoch": 0.19458, + "grad_norm": 1.0673845665305683, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 19458 + }, + { + "epoch": 0.19459, + "grad_norm": 0.9526930015094522, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 19459 + }, + { + "epoch": 0.1946, + "grad_norm": 1.0773885084350963, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 19460 + }, + { + "epoch": 0.19461, + "grad_norm": 1.020877896151739, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 19461 + }, + { + "epoch": 0.19462, + "grad_norm": 1.0136060535129237, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 19462 + }, + { + "epoch": 0.19463, + "grad_norm": 0.9954529145757499, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 19463 + }, + { + "epoch": 0.19464, + "grad_norm": 1.0596088310554157, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 19464 + }, + { + "epoch": 0.19465, + "grad_norm": 1.1154382364130757, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 19465 + }, + { + "epoch": 0.19466, + "grad_norm": 1.1784204764828055, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 19466 + }, + { + "epoch": 0.19467, + "grad_norm": 0.9259931971512808, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 19467 + }, + { + "epoch": 0.19468, + "grad_norm": 0.9416683844112438, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 19468 + }, + { + "epoch": 0.19469, + "grad_norm": 1.2413378354355256, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 19469 + }, + { + "epoch": 0.1947, + "grad_norm": 0.9270447216921386, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 19470 + }, + { + "epoch": 0.19471, + "grad_norm": 0.9545181686146227, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 19471 + }, + { + "epoch": 0.19472, + "grad_norm": 1.0339319753001825, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 19472 + }, + { + "epoch": 0.19473, + "grad_norm": 1.000844535995447, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 19473 + }, + { + "epoch": 0.19474, + "grad_norm": 0.9143181943814181, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 19474 + }, + { + "epoch": 0.19475, + "grad_norm": 0.8602119219325912, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 19475 + }, + { + "epoch": 0.19476, + "grad_norm": 0.9540118603831539, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19476 + }, + { + "epoch": 0.19477, + "grad_norm": 0.9782597139022668, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 19477 + }, + { + "epoch": 0.19478, + "grad_norm": 0.9962543688764389, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 19478 + }, + { + "epoch": 0.19479, + "grad_norm": 0.7779573812208588, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19479 + }, + { + "epoch": 0.1948, + "grad_norm": 0.7301011933193419, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19480 + }, + { + "epoch": 0.19481, + "grad_norm": 0.7707368473789574, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19481 + }, + { + "epoch": 0.19482, + "grad_norm": 0.8470135701871124, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19482 + }, + { + "epoch": 0.19483, + "grad_norm": 0.8458066880417292, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 19483 + }, + { + "epoch": 0.19484, + "grad_norm": 0.8337140274876754, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 19484 + }, + { + "epoch": 0.19485, + "grad_norm": 0.7926767981455103, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 19485 + }, + { + "epoch": 0.19486, + "grad_norm": 0.8656239751429737, + "learning_rate": 0.003, + "loss": 4.071, + "step": 19486 + }, + { + "epoch": 0.19487, + "grad_norm": 0.9838859096652176, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 19487 + }, + { + "epoch": 0.19488, + "grad_norm": 0.9871605794357838, + "learning_rate": 0.003, + "loss": 4.048, + "step": 19488 + }, + { + "epoch": 0.19489, + "grad_norm": 1.0088434336621808, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19489 + }, + { + "epoch": 0.1949, + "grad_norm": 1.1324089063831135, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 19490 + }, + { + "epoch": 0.19491, + "grad_norm": 0.9202283600730529, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 19491 + }, + { + "epoch": 0.19492, + "grad_norm": 0.7821970084215664, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 19492 + }, + { + "epoch": 0.19493, + "grad_norm": 0.6611199839192118, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 19493 + }, + { + "epoch": 0.19494, + "grad_norm": 0.6393568576028075, + "learning_rate": 0.003, + "loss": 4.068, + "step": 19494 + }, + { + "epoch": 0.19495, + "grad_norm": 0.7050742852054983, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 19495 + }, + { + "epoch": 0.19496, + "grad_norm": 0.6817189794167758, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 19496 + }, + { + "epoch": 0.19497, + "grad_norm": 0.7594130011999084, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 19497 + }, + { + "epoch": 0.19498, + "grad_norm": 0.7333688561420428, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 19498 + }, + { + "epoch": 0.19499, + "grad_norm": 0.7123112115752411, + "learning_rate": 0.003, + "loss": 4.07, + "step": 19499 + }, + { + "epoch": 0.195, + "grad_norm": 0.7042225273617304, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 19500 + }, + { + "epoch": 0.19501, + "grad_norm": 0.8029016193437605, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 19501 + }, + { + "epoch": 0.19502, + "grad_norm": 0.9922773101680152, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 19502 + }, + { + "epoch": 0.19503, + "grad_norm": 1.1466301986118883, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 19503 + }, + { + "epoch": 0.19504, + "grad_norm": 0.8807567321460502, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 19504 + }, + { + "epoch": 0.19505, + "grad_norm": 1.0379231428547009, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 19505 + }, + { + "epoch": 0.19506, + "grad_norm": 1.164717902234979, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 19506 + }, + { + "epoch": 0.19507, + "grad_norm": 0.8425390274820713, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19507 + }, + { + "epoch": 0.19508, + "grad_norm": 0.7959424547295146, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 19508 + }, + { + "epoch": 0.19509, + "grad_norm": 0.8632302355867209, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 19509 + }, + { + "epoch": 0.1951, + "grad_norm": 0.881691131147382, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19510 + }, + { + "epoch": 0.19511, + "grad_norm": 0.9925927341296379, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 19511 + }, + { + "epoch": 0.19512, + "grad_norm": 1.28965793777758, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 19512 + }, + { + "epoch": 0.19513, + "grad_norm": 0.7905992924507371, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 19513 + }, + { + "epoch": 0.19514, + "grad_norm": 0.6384576253160198, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 19514 + }, + { + "epoch": 0.19515, + "grad_norm": 0.6166443571201118, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19515 + }, + { + "epoch": 0.19516, + "grad_norm": 0.6867575151399782, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 19516 + }, + { + "epoch": 0.19517, + "grad_norm": 0.7685770719860647, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 19517 + }, + { + "epoch": 0.19518, + "grad_norm": 0.8594828737342208, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 19518 + }, + { + "epoch": 0.19519, + "grad_norm": 0.9872709054688854, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 19519 + }, + { + "epoch": 0.1952, + "grad_norm": 1.2125394175737512, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 19520 + }, + { + "epoch": 0.19521, + "grad_norm": 0.886659689134626, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 19521 + }, + { + "epoch": 0.19522, + "grad_norm": 0.9636041826651245, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 19522 + }, + { + "epoch": 0.19523, + "grad_norm": 1.205013653960159, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 19523 + }, + { + "epoch": 0.19524, + "grad_norm": 0.9724836120974313, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 19524 + }, + { + "epoch": 0.19525, + "grad_norm": 1.0118739956056824, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 19525 + }, + { + "epoch": 0.19526, + "grad_norm": 1.0064466822541676, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 19526 + }, + { + "epoch": 0.19527, + "grad_norm": 0.9651436841120309, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 19527 + }, + { + "epoch": 0.19528, + "grad_norm": 0.8937188861082559, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19528 + }, + { + "epoch": 0.19529, + "grad_norm": 0.8821529187386105, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19529 + }, + { + "epoch": 0.1953, + "grad_norm": 0.8252793530031995, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 19530 + }, + { + "epoch": 0.19531, + "grad_norm": 0.7651803369307546, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19531 + }, + { + "epoch": 0.19532, + "grad_norm": 0.8058864113766969, + "learning_rate": 0.003, + "loss": 4.1026, + "step": 19532 + }, + { + "epoch": 0.19533, + "grad_norm": 1.0366595185288165, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 19533 + }, + { + "epoch": 0.19534, + "grad_norm": 1.051722844597907, + "learning_rate": 0.003, + "loss": 4.0981, + "step": 19534 + }, + { + "epoch": 0.19535, + "grad_norm": 0.9434461648697393, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 19535 + }, + { + "epoch": 0.19536, + "grad_norm": 0.9725778695812081, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 19536 + }, + { + "epoch": 0.19537, + "grad_norm": 1.017590227573199, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19537 + }, + { + "epoch": 0.19538, + "grad_norm": 0.9968575972285362, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 19538 + }, + { + "epoch": 0.19539, + "grad_norm": 0.9359579432709345, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 19539 + }, + { + "epoch": 0.1954, + "grad_norm": 0.8686704791280276, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 19540 + }, + { + "epoch": 0.19541, + "grad_norm": 0.897106152786685, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 19541 + }, + { + "epoch": 0.19542, + "grad_norm": 1.0077546120042802, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 19542 + }, + { + "epoch": 0.19543, + "grad_norm": 1.0636340344146884, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 19543 + }, + { + "epoch": 0.19544, + "grad_norm": 1.0095013765477652, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 19544 + }, + { + "epoch": 0.19545, + "grad_norm": 1.030678871683277, + "learning_rate": 0.003, + "loss": 4.075, + "step": 19545 + }, + { + "epoch": 0.19546, + "grad_norm": 1.0129705246150944, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 19546 + }, + { + "epoch": 0.19547, + "grad_norm": 0.9631765361708327, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 19547 + }, + { + "epoch": 0.19548, + "grad_norm": 0.963864305064408, + "learning_rate": 0.003, + "loss": 4.076, + "step": 19548 + }, + { + "epoch": 0.19549, + "grad_norm": 0.9961369640847921, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 19549 + }, + { + "epoch": 0.1955, + "grad_norm": 0.9152450673234607, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 19550 + }, + { + "epoch": 0.19551, + "grad_norm": 0.8857343149036193, + "learning_rate": 0.003, + "loss": 4.092, + "step": 19551 + }, + { + "epoch": 0.19552, + "grad_norm": 0.9627084866512174, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 19552 + }, + { + "epoch": 0.19553, + "grad_norm": 1.0490876132405937, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19553 + }, + { + "epoch": 0.19554, + "grad_norm": 0.8631105090005501, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 19554 + }, + { + "epoch": 0.19555, + "grad_norm": 0.7983585693124079, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 19555 + }, + { + "epoch": 0.19556, + "grad_norm": 0.911049293813308, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 19556 + }, + { + "epoch": 0.19557, + "grad_norm": 1.0179910902311773, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 19557 + }, + { + "epoch": 0.19558, + "grad_norm": 0.9814181656212816, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19558 + }, + { + "epoch": 0.19559, + "grad_norm": 0.9199296198007286, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 19559 + }, + { + "epoch": 0.1956, + "grad_norm": 0.8530861861107941, + "learning_rate": 0.003, + "loss": 4.0956, + "step": 19560 + }, + { + "epoch": 0.19561, + "grad_norm": 0.8013947064723252, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 19561 + }, + { + "epoch": 0.19562, + "grad_norm": 0.8313204093885591, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 19562 + }, + { + "epoch": 0.19563, + "grad_norm": 0.8264158893586153, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 19563 + }, + { + "epoch": 0.19564, + "grad_norm": 0.8277466957586677, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 19564 + }, + { + "epoch": 0.19565, + "grad_norm": 0.9135284567351104, + "learning_rate": 0.003, + "loss": 4.035, + "step": 19565 + }, + { + "epoch": 0.19566, + "grad_norm": 1.0679433935045328, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 19566 + }, + { + "epoch": 0.19567, + "grad_norm": 0.9851444389545205, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19567 + }, + { + "epoch": 0.19568, + "grad_norm": 0.8402413726976194, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 19568 + }, + { + "epoch": 0.19569, + "grad_norm": 0.9457773671925149, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 19569 + }, + { + "epoch": 0.1957, + "grad_norm": 1.1822131417765054, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19570 + }, + { + "epoch": 0.19571, + "grad_norm": 0.7814700340209935, + "learning_rate": 0.003, + "loss": 4.064, + "step": 19571 + }, + { + "epoch": 0.19572, + "grad_norm": 0.7549355950312976, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 19572 + }, + { + "epoch": 0.19573, + "grad_norm": 0.8191030608931872, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 19573 + }, + { + "epoch": 0.19574, + "grad_norm": 0.8167295038611587, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 19574 + }, + { + "epoch": 0.19575, + "grad_norm": 0.8155217987813517, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 19575 + }, + { + "epoch": 0.19576, + "grad_norm": 0.7620612768135239, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 19576 + }, + { + "epoch": 0.19577, + "grad_norm": 0.7508875732334481, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 19577 + }, + { + "epoch": 0.19578, + "grad_norm": 0.6591786381246394, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 19578 + }, + { + "epoch": 0.19579, + "grad_norm": 0.6665442541428873, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 19579 + }, + { + "epoch": 0.1958, + "grad_norm": 0.7184883388085611, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 19580 + }, + { + "epoch": 0.19581, + "grad_norm": 0.6716101243677017, + "learning_rate": 0.003, + "loss": 4.005, + "step": 19581 + }, + { + "epoch": 0.19582, + "grad_norm": 0.5931393714437542, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 19582 + }, + { + "epoch": 0.19583, + "grad_norm": 0.5237593632232761, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 19583 + }, + { + "epoch": 0.19584, + "grad_norm": 0.5447536023317237, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 19584 + }, + { + "epoch": 0.19585, + "grad_norm": 0.5514066401220156, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 19585 + }, + { + "epoch": 0.19586, + "grad_norm": 0.5876951881811275, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 19586 + }, + { + "epoch": 0.19587, + "grad_norm": 0.7803656706981374, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19587 + }, + { + "epoch": 0.19588, + "grad_norm": 1.0531089546767054, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 19588 + }, + { + "epoch": 0.19589, + "grad_norm": 1.2906770227877205, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19589 + }, + { + "epoch": 0.1959, + "grad_norm": 0.640725072928441, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 19590 + }, + { + "epoch": 0.19591, + "grad_norm": 0.765682506484728, + "learning_rate": 0.003, + "loss": 4.073, + "step": 19591 + }, + { + "epoch": 0.19592, + "grad_norm": 0.8051515711380889, + "learning_rate": 0.003, + "loss": 4.066, + "step": 19592 + }, + { + "epoch": 0.19593, + "grad_norm": 0.8090570490624589, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 19593 + }, + { + "epoch": 0.19594, + "grad_norm": 0.9148053656372326, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19594 + }, + { + "epoch": 0.19595, + "grad_norm": 0.9588129993589732, + "learning_rate": 0.003, + "loss": 4.047, + "step": 19595 + }, + { + "epoch": 0.19596, + "grad_norm": 1.0845422131169817, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 19596 + }, + { + "epoch": 0.19597, + "grad_norm": 1.2405161970936593, + "learning_rate": 0.003, + "loss": 4.087, + "step": 19597 + }, + { + "epoch": 0.19598, + "grad_norm": 0.8763254362895997, + "learning_rate": 0.003, + "loss": 4.039, + "step": 19598 + }, + { + "epoch": 0.19599, + "grad_norm": 0.8397382645251689, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 19599 + }, + { + "epoch": 0.196, + "grad_norm": 0.8324375316149644, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 19600 + }, + { + "epoch": 0.19601, + "grad_norm": 0.8671578177626034, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 19601 + }, + { + "epoch": 0.19602, + "grad_norm": 0.8984845395036228, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 19602 + }, + { + "epoch": 0.19603, + "grad_norm": 0.9138350481391048, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 19603 + }, + { + "epoch": 0.19604, + "grad_norm": 0.8679316485996372, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19604 + }, + { + "epoch": 0.19605, + "grad_norm": 1.0142894792470354, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 19605 + }, + { + "epoch": 0.19606, + "grad_norm": 1.1463352220278804, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 19606 + }, + { + "epoch": 0.19607, + "grad_norm": 0.9032755723109397, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 19607 + }, + { + "epoch": 0.19608, + "grad_norm": 0.9756126186131024, + "learning_rate": 0.003, + "loss": 4.054, + "step": 19608 + }, + { + "epoch": 0.19609, + "grad_norm": 1.0252085137602285, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 19609 + }, + { + "epoch": 0.1961, + "grad_norm": 1.0275481471237282, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 19610 + }, + { + "epoch": 0.19611, + "grad_norm": 1.1091289142283627, + "learning_rate": 0.003, + "loss": 4.072, + "step": 19611 + }, + { + "epoch": 0.19612, + "grad_norm": 1.0041349989429535, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 19612 + }, + { + "epoch": 0.19613, + "grad_norm": 1.0752228600198706, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 19613 + }, + { + "epoch": 0.19614, + "grad_norm": 0.7712905298214182, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 19614 + }, + { + "epoch": 0.19615, + "grad_norm": 0.75575047929902, + "learning_rate": 0.003, + "loss": 4.049, + "step": 19615 + }, + { + "epoch": 0.19616, + "grad_norm": 0.8029741166981483, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19616 + }, + { + "epoch": 0.19617, + "grad_norm": 0.8626511772695846, + "learning_rate": 0.003, + "loss": 4.088, + "step": 19617 + }, + { + "epoch": 0.19618, + "grad_norm": 0.8672605456043144, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 19618 + }, + { + "epoch": 0.19619, + "grad_norm": 0.906518472965422, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19619 + }, + { + "epoch": 0.1962, + "grad_norm": 0.978468408039395, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 19620 + }, + { + "epoch": 0.19621, + "grad_norm": 1.14111150112848, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 19621 + }, + { + "epoch": 0.19622, + "grad_norm": 0.8650014457284066, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 19622 + }, + { + "epoch": 0.19623, + "grad_norm": 0.9636491762678555, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 19623 + }, + { + "epoch": 0.19624, + "grad_norm": 1.0588919339234388, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 19624 + }, + { + "epoch": 0.19625, + "grad_norm": 0.863972905355138, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 19625 + }, + { + "epoch": 0.19626, + "grad_norm": 0.834111566098492, + "learning_rate": 0.003, + "loss": 4.077, + "step": 19626 + }, + { + "epoch": 0.19627, + "grad_norm": 0.7487609764255219, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 19627 + }, + { + "epoch": 0.19628, + "grad_norm": 0.6943484442750629, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 19628 + }, + { + "epoch": 0.19629, + "grad_norm": 0.70303815508834, + "learning_rate": 0.003, + "loss": 4.044, + "step": 19629 + }, + { + "epoch": 0.1963, + "grad_norm": 0.7445017089096984, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 19630 + }, + { + "epoch": 0.19631, + "grad_norm": 0.8521821108803652, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 19631 + }, + { + "epoch": 0.19632, + "grad_norm": 1.0450800327161265, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 19632 + }, + { + "epoch": 0.19633, + "grad_norm": 1.3012511420639266, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 19633 + }, + { + "epoch": 0.19634, + "grad_norm": 0.6218729762126922, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19634 + }, + { + "epoch": 0.19635, + "grad_norm": 0.775632845745399, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19635 + }, + { + "epoch": 0.19636, + "grad_norm": 0.8638470958774185, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 19636 + }, + { + "epoch": 0.19637, + "grad_norm": 0.9111695509287965, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 19637 + }, + { + "epoch": 0.19638, + "grad_norm": 0.9149375831134107, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 19638 + }, + { + "epoch": 0.19639, + "grad_norm": 0.9230680816971693, + "learning_rate": 0.003, + "loss": 4.0926, + "step": 19639 + }, + { + "epoch": 0.1964, + "grad_norm": 0.9303667561199962, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 19640 + }, + { + "epoch": 0.19641, + "grad_norm": 0.9743412657753244, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 19641 + }, + { + "epoch": 0.19642, + "grad_norm": 1.0421841288257094, + "learning_rate": 0.003, + "loss": 4.053, + "step": 19642 + }, + { + "epoch": 0.19643, + "grad_norm": 1.0021674751672769, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 19643 + }, + { + "epoch": 0.19644, + "grad_norm": 1.0675909299862363, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 19644 + }, + { + "epoch": 0.19645, + "grad_norm": 1.0107519134522043, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 19645 + }, + { + "epoch": 0.19646, + "grad_norm": 1.1025867901172475, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 19646 + }, + { + "epoch": 0.19647, + "grad_norm": 0.8284739080966057, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 19647 + }, + { + "epoch": 0.19648, + "grad_norm": 0.7270951908958995, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 19648 + }, + { + "epoch": 0.19649, + "grad_norm": 0.7710490808011464, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 19649 + }, + { + "epoch": 0.1965, + "grad_norm": 0.8848802236200047, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 19650 + }, + { + "epoch": 0.19651, + "grad_norm": 1.0774926919785093, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 19651 + }, + { + "epoch": 0.19652, + "grad_norm": 1.1362621198906069, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 19652 + }, + { + "epoch": 0.19653, + "grad_norm": 0.9247064662537829, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 19653 + }, + { + "epoch": 0.19654, + "grad_norm": 0.8936436484304016, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 19654 + }, + { + "epoch": 0.19655, + "grad_norm": 0.98716827264183, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 19655 + }, + { + "epoch": 0.19656, + "grad_norm": 1.0106779134206885, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 19656 + }, + { + "epoch": 0.19657, + "grad_norm": 0.8389634395452044, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 19657 + }, + { + "epoch": 0.19658, + "grad_norm": 0.8126126389489013, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 19658 + }, + { + "epoch": 0.19659, + "grad_norm": 0.7590071171626337, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 19659 + }, + { + "epoch": 0.1966, + "grad_norm": 0.7992150930481975, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 19660 + }, + { + "epoch": 0.19661, + "grad_norm": 0.9637472510188058, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 19661 + }, + { + "epoch": 0.19662, + "grad_norm": 1.111990045897436, + "learning_rate": 0.003, + "loss": 4.087, + "step": 19662 + }, + { + "epoch": 0.19663, + "grad_norm": 0.9616634759268835, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19663 + }, + { + "epoch": 0.19664, + "grad_norm": 1.0109756472601052, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 19664 + }, + { + "epoch": 0.19665, + "grad_norm": 0.9784657538349492, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 19665 + }, + { + "epoch": 0.19666, + "grad_norm": 1.0233782450256468, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 19666 + }, + { + "epoch": 0.19667, + "grad_norm": 1.0384438121030655, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 19667 + }, + { + "epoch": 0.19668, + "grad_norm": 1.0661705947840365, + "learning_rate": 0.003, + "loss": 4.054, + "step": 19668 + }, + { + "epoch": 0.19669, + "grad_norm": 0.9395447683901564, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 19669 + }, + { + "epoch": 0.1967, + "grad_norm": 0.9558765034267854, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 19670 + }, + { + "epoch": 0.19671, + "grad_norm": 1.1608338068984734, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 19671 + }, + { + "epoch": 0.19672, + "grad_norm": 0.751740136424847, + "learning_rate": 0.003, + "loss": 4.034, + "step": 19672 + }, + { + "epoch": 0.19673, + "grad_norm": 0.6821824535053846, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 19673 + }, + { + "epoch": 0.19674, + "grad_norm": 0.6404662033617751, + "learning_rate": 0.003, + "loss": 4.032, + "step": 19674 + }, + { + "epoch": 0.19675, + "grad_norm": 0.7087708448421819, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 19675 + }, + { + "epoch": 0.19676, + "grad_norm": 0.9362332098376248, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 19676 + }, + { + "epoch": 0.19677, + "grad_norm": 1.1325899664530585, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 19677 + }, + { + "epoch": 0.19678, + "grad_norm": 0.9202287386628409, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19678 + }, + { + "epoch": 0.19679, + "grad_norm": 0.8831079853084167, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 19679 + }, + { + "epoch": 0.1968, + "grad_norm": 0.8468692919029652, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 19680 + }, + { + "epoch": 0.19681, + "grad_norm": 0.7368182715685371, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 19681 + }, + { + "epoch": 0.19682, + "grad_norm": 0.7474720761224553, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 19682 + }, + { + "epoch": 0.19683, + "grad_norm": 0.7968755783528199, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19683 + }, + { + "epoch": 0.19684, + "grad_norm": 0.8302497756578229, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 19684 + }, + { + "epoch": 0.19685, + "grad_norm": 0.8083807470510587, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 19685 + }, + { + "epoch": 0.19686, + "grad_norm": 0.6840671889461788, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19686 + }, + { + "epoch": 0.19687, + "grad_norm": 0.6379636062580218, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 19687 + }, + { + "epoch": 0.19688, + "grad_norm": 0.6755767538822668, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19688 + }, + { + "epoch": 0.19689, + "grad_norm": 0.6155383828889599, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 19689 + }, + { + "epoch": 0.1969, + "grad_norm": 0.611227770767368, + "learning_rate": 0.003, + "loss": 4.072, + "step": 19690 + }, + { + "epoch": 0.19691, + "grad_norm": 0.8032269452447673, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 19691 + }, + { + "epoch": 0.19692, + "grad_norm": 1.198196493639563, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 19692 + }, + { + "epoch": 0.19693, + "grad_norm": 1.1955061577906123, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19693 + }, + { + "epoch": 0.19694, + "grad_norm": 0.8579327325889964, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 19694 + }, + { + "epoch": 0.19695, + "grad_norm": 0.7537554169929473, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 19695 + }, + { + "epoch": 0.19696, + "grad_norm": 0.7589037687157838, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 19696 + }, + { + "epoch": 0.19697, + "grad_norm": 0.810450154172506, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 19697 + }, + { + "epoch": 0.19698, + "grad_norm": 0.7526534961346523, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 19698 + }, + { + "epoch": 0.19699, + "grad_norm": 0.6561645834753397, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 19699 + }, + { + "epoch": 0.197, + "grad_norm": 0.8700640046075444, + "learning_rate": 0.003, + "loss": 4.059, + "step": 19700 + }, + { + "epoch": 0.19701, + "grad_norm": 1.258262587953789, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 19701 + }, + { + "epoch": 0.19702, + "grad_norm": 1.0040667850909977, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 19702 + }, + { + "epoch": 0.19703, + "grad_norm": 0.7556615313818323, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 19703 + }, + { + "epoch": 0.19704, + "grad_norm": 0.5976184252835506, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 19704 + }, + { + "epoch": 0.19705, + "grad_norm": 0.650572677181974, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 19705 + }, + { + "epoch": 0.19706, + "grad_norm": 0.7443194742393572, + "learning_rate": 0.003, + "loss": 4.014, + "step": 19706 + }, + { + "epoch": 0.19707, + "grad_norm": 0.9616357129827846, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 19707 + }, + { + "epoch": 0.19708, + "grad_norm": 1.1756003143638607, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 19708 + }, + { + "epoch": 0.19709, + "grad_norm": 0.8357305268406774, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 19709 + }, + { + "epoch": 0.1971, + "grad_norm": 0.824451659941323, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19710 + }, + { + "epoch": 0.19711, + "grad_norm": 0.7773752404696846, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 19711 + }, + { + "epoch": 0.19712, + "grad_norm": 0.8012507574969546, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 19712 + }, + { + "epoch": 0.19713, + "grad_norm": 0.9489867745855541, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19713 + }, + { + "epoch": 0.19714, + "grad_norm": 1.2844531029883166, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 19714 + }, + { + "epoch": 0.19715, + "grad_norm": 0.8908589160181729, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 19715 + }, + { + "epoch": 0.19716, + "grad_norm": 0.9580833767775246, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 19716 + }, + { + "epoch": 0.19717, + "grad_norm": 1.1680224981426544, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 19717 + }, + { + "epoch": 0.19718, + "grad_norm": 0.9140874492684244, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 19718 + }, + { + "epoch": 0.19719, + "grad_norm": 0.919575110954966, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 19719 + }, + { + "epoch": 0.1972, + "grad_norm": 1.0296479937908731, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 19720 + }, + { + "epoch": 0.19721, + "grad_norm": 1.073998288315665, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 19721 + }, + { + "epoch": 0.19722, + "grad_norm": 0.9543246554641899, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 19722 + }, + { + "epoch": 0.19723, + "grad_norm": 1.028807164252435, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 19723 + }, + { + "epoch": 0.19724, + "grad_norm": 1.150989933268326, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 19724 + }, + { + "epoch": 0.19725, + "grad_norm": 0.8941327796303788, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 19725 + }, + { + "epoch": 0.19726, + "grad_norm": 0.8978539867956097, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 19726 + }, + { + "epoch": 0.19727, + "grad_norm": 0.9735069665033468, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 19727 + }, + { + "epoch": 0.19728, + "grad_norm": 1.0393834033535847, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 19728 + }, + { + "epoch": 0.19729, + "grad_norm": 1.0101489645821173, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 19729 + }, + { + "epoch": 0.1973, + "grad_norm": 0.8446477851013826, + "learning_rate": 0.003, + "loss": 4.074, + "step": 19730 + }, + { + "epoch": 0.19731, + "grad_norm": 0.8508381578061052, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 19731 + }, + { + "epoch": 0.19732, + "grad_norm": 0.8852094200145928, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 19732 + }, + { + "epoch": 0.19733, + "grad_norm": 0.8712907379011499, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 19733 + }, + { + "epoch": 0.19734, + "grad_norm": 0.9957658734056823, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 19734 + }, + { + "epoch": 0.19735, + "grad_norm": 1.1041746410416118, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 19735 + }, + { + "epoch": 0.19736, + "grad_norm": 1.0030570397593415, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 19736 + }, + { + "epoch": 0.19737, + "grad_norm": 1.0849763446626417, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 19737 + }, + { + "epoch": 0.19738, + "grad_norm": 1.0089892081159848, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 19738 + }, + { + "epoch": 0.19739, + "grad_norm": 0.9486442335835147, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 19739 + }, + { + "epoch": 0.1974, + "grad_norm": 0.9928907586099743, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 19740 + }, + { + "epoch": 0.19741, + "grad_norm": 0.9317527123033049, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 19741 + }, + { + "epoch": 0.19742, + "grad_norm": 0.9604994076147799, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19742 + }, + { + "epoch": 0.19743, + "grad_norm": 0.9723372857862396, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 19743 + }, + { + "epoch": 0.19744, + "grad_norm": 1.003076679951651, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 19744 + }, + { + "epoch": 0.19745, + "grad_norm": 0.9745238325892046, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 19745 + }, + { + "epoch": 0.19746, + "grad_norm": 0.952883463657387, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 19746 + }, + { + "epoch": 0.19747, + "grad_norm": 0.7611905659456139, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 19747 + }, + { + "epoch": 0.19748, + "grad_norm": 0.6188209555231131, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 19748 + }, + { + "epoch": 0.19749, + "grad_norm": 0.7040836631450551, + "learning_rate": 0.003, + "loss": 4.058, + "step": 19749 + }, + { + "epoch": 0.1975, + "grad_norm": 0.6293627760012084, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 19750 + }, + { + "epoch": 0.19751, + "grad_norm": 0.5887160146363231, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 19751 + }, + { + "epoch": 0.19752, + "grad_norm": 0.559532344798392, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 19752 + }, + { + "epoch": 0.19753, + "grad_norm": 0.5352111996411758, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 19753 + }, + { + "epoch": 0.19754, + "grad_norm": 0.6066195478172508, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 19754 + }, + { + "epoch": 0.19755, + "grad_norm": 0.6629454218326679, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 19755 + }, + { + "epoch": 0.19756, + "grad_norm": 0.781823368244423, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 19756 + }, + { + "epoch": 0.19757, + "grad_norm": 0.9193894567332758, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 19757 + }, + { + "epoch": 0.19758, + "grad_norm": 1.1038639820589233, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 19758 + }, + { + "epoch": 0.19759, + "grad_norm": 0.9780960388395343, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 19759 + }, + { + "epoch": 0.1976, + "grad_norm": 0.9927967189575263, + "learning_rate": 0.003, + "loss": 4.1021, + "step": 19760 + }, + { + "epoch": 0.19761, + "grad_norm": 1.1180090639873808, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 19761 + }, + { + "epoch": 0.19762, + "grad_norm": 0.9732835515636639, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 19762 + }, + { + "epoch": 0.19763, + "grad_norm": 0.9175308228768257, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 19763 + }, + { + "epoch": 0.19764, + "grad_norm": 0.8394048070192874, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 19764 + }, + { + "epoch": 0.19765, + "grad_norm": 0.9225166966022803, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 19765 + }, + { + "epoch": 0.19766, + "grad_norm": 1.009090741829306, + "learning_rate": 0.003, + "loss": 4.0951, + "step": 19766 + }, + { + "epoch": 0.19767, + "grad_norm": 1.2512525657798192, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 19767 + }, + { + "epoch": 0.19768, + "grad_norm": 1.0024586200818144, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 19768 + }, + { + "epoch": 0.19769, + "grad_norm": 0.9202985172242644, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 19769 + }, + { + "epoch": 0.1977, + "grad_norm": 0.8682879533663649, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 19770 + }, + { + "epoch": 0.19771, + "grad_norm": 0.8300298620182743, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 19771 + }, + { + "epoch": 0.19772, + "grad_norm": 0.8936461475002073, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19772 + }, + { + "epoch": 0.19773, + "grad_norm": 0.9616423495584655, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 19773 + }, + { + "epoch": 0.19774, + "grad_norm": 0.8663505610530289, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 19774 + }, + { + "epoch": 0.19775, + "grad_norm": 0.88791514089382, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19775 + }, + { + "epoch": 0.19776, + "grad_norm": 0.9869297135387154, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19776 + }, + { + "epoch": 0.19777, + "grad_norm": 0.9167075446875883, + "learning_rate": 0.003, + "loss": 4.038, + "step": 19777 + }, + { + "epoch": 0.19778, + "grad_norm": 0.7190250293344528, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 19778 + }, + { + "epoch": 0.19779, + "grad_norm": 0.6441673767575137, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 19779 + }, + { + "epoch": 0.1978, + "grad_norm": 0.643983411523129, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 19780 + }, + { + "epoch": 0.19781, + "grad_norm": 0.7175178029511101, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19781 + }, + { + "epoch": 0.19782, + "grad_norm": 0.8329549253153612, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 19782 + }, + { + "epoch": 0.19783, + "grad_norm": 0.9424467432848166, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 19783 + }, + { + "epoch": 0.19784, + "grad_norm": 1.1536430734054381, + "learning_rate": 0.003, + "loss": 4.074, + "step": 19784 + }, + { + "epoch": 0.19785, + "grad_norm": 0.8073132228902993, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 19785 + }, + { + "epoch": 0.19786, + "grad_norm": 0.8199887374947842, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19786 + }, + { + "epoch": 0.19787, + "grad_norm": 0.8804475781197203, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19787 + }, + { + "epoch": 0.19788, + "grad_norm": 1.031976531151439, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 19788 + }, + { + "epoch": 0.19789, + "grad_norm": 1.287596305378352, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 19789 + }, + { + "epoch": 0.1979, + "grad_norm": 0.6316533332578996, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 19790 + }, + { + "epoch": 0.19791, + "grad_norm": 0.7994082318587541, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 19791 + }, + { + "epoch": 0.19792, + "grad_norm": 0.9152851017072172, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 19792 + }, + { + "epoch": 0.19793, + "grad_norm": 0.9140567040182891, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19793 + }, + { + "epoch": 0.19794, + "grad_norm": 0.8925892623813582, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 19794 + }, + { + "epoch": 0.19795, + "grad_norm": 0.9151546478166942, + "learning_rate": 0.003, + "loss": 4.065, + "step": 19795 + }, + { + "epoch": 0.19796, + "grad_norm": 1.046776094430734, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 19796 + }, + { + "epoch": 0.19797, + "grad_norm": 1.0861390961888773, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 19797 + }, + { + "epoch": 0.19798, + "grad_norm": 1.0465579906692883, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 19798 + }, + { + "epoch": 0.19799, + "grad_norm": 1.0146469760986077, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 19799 + }, + { + "epoch": 0.198, + "grad_norm": 0.9964749861423164, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 19800 + }, + { + "epoch": 0.19801, + "grad_norm": 0.9823527901657623, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 19801 + }, + { + "epoch": 0.19802, + "grad_norm": 0.9879118978824735, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 19802 + }, + { + "epoch": 0.19803, + "grad_norm": 1.0018465737365319, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 19803 + }, + { + "epoch": 0.19804, + "grad_norm": 1.0920266169684745, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 19804 + }, + { + "epoch": 0.19805, + "grad_norm": 0.9443044636960449, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 19805 + }, + { + "epoch": 0.19806, + "grad_norm": 0.9999826116147256, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 19806 + }, + { + "epoch": 0.19807, + "grad_norm": 1.0618088952745237, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 19807 + }, + { + "epoch": 0.19808, + "grad_norm": 1.001497051288458, + "learning_rate": 0.003, + "loss": 4.1156, + "step": 19808 + }, + { + "epoch": 0.19809, + "grad_norm": 1.0528528445282366, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19809 + }, + { + "epoch": 0.1981, + "grad_norm": 0.8955237049017132, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 19810 + }, + { + "epoch": 0.19811, + "grad_norm": 0.7835393392828128, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 19811 + }, + { + "epoch": 0.19812, + "grad_norm": 0.8973380312443668, + "learning_rate": 0.003, + "loss": 4.043, + "step": 19812 + }, + { + "epoch": 0.19813, + "grad_norm": 1.081027770683145, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 19813 + }, + { + "epoch": 0.19814, + "grad_norm": 1.141195561051589, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 19814 + }, + { + "epoch": 0.19815, + "grad_norm": 0.844089372192303, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19815 + }, + { + "epoch": 0.19816, + "grad_norm": 0.7989452218297074, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 19816 + }, + { + "epoch": 0.19817, + "grad_norm": 0.8138069963141369, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 19817 + }, + { + "epoch": 0.19818, + "grad_norm": 0.8574226539254926, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 19818 + }, + { + "epoch": 0.19819, + "grad_norm": 0.7977803039052026, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 19819 + }, + { + "epoch": 0.1982, + "grad_norm": 0.752007945666223, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 19820 + }, + { + "epoch": 0.19821, + "grad_norm": 0.8180053670746039, + "learning_rate": 0.003, + "loss": 4.063, + "step": 19821 + }, + { + "epoch": 0.19822, + "grad_norm": 0.9991391048961866, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 19822 + }, + { + "epoch": 0.19823, + "grad_norm": 1.1112560788388506, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 19823 + }, + { + "epoch": 0.19824, + "grad_norm": 1.0015471589835643, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 19824 + }, + { + "epoch": 0.19825, + "grad_norm": 0.8825345688164498, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 19825 + }, + { + "epoch": 0.19826, + "grad_norm": 0.6901532947233038, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19826 + }, + { + "epoch": 0.19827, + "grad_norm": 0.6033120091223885, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 19827 + }, + { + "epoch": 0.19828, + "grad_norm": 0.6079825563637126, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 19828 + }, + { + "epoch": 0.19829, + "grad_norm": 0.6561029843983504, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 19829 + }, + { + "epoch": 0.1983, + "grad_norm": 0.7168356167290718, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 19830 + }, + { + "epoch": 0.19831, + "grad_norm": 0.8027757272656064, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 19831 + }, + { + "epoch": 0.19832, + "grad_norm": 0.7531498764422621, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 19832 + }, + { + "epoch": 0.19833, + "grad_norm": 0.5724504129836073, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 19833 + }, + { + "epoch": 0.19834, + "grad_norm": 0.5417026494312349, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 19834 + }, + { + "epoch": 0.19835, + "grad_norm": 0.6664803251147037, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 19835 + }, + { + "epoch": 0.19836, + "grad_norm": 0.8605774550359709, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 19836 + }, + { + "epoch": 0.19837, + "grad_norm": 0.9613068224672485, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 19837 + }, + { + "epoch": 0.19838, + "grad_norm": 1.034405634544546, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 19838 + }, + { + "epoch": 0.19839, + "grad_norm": 1.218292370650084, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 19839 + }, + { + "epoch": 0.1984, + "grad_norm": 0.7228430867683359, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 19840 + }, + { + "epoch": 0.19841, + "grad_norm": 0.7155503591441723, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 19841 + }, + { + "epoch": 0.19842, + "grad_norm": 0.7949137251277599, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 19842 + }, + { + "epoch": 0.19843, + "grad_norm": 1.0480441969647263, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 19843 + }, + { + "epoch": 0.19844, + "grad_norm": 1.1111687513827093, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 19844 + }, + { + "epoch": 0.19845, + "grad_norm": 0.8451794233928357, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 19845 + }, + { + "epoch": 0.19846, + "grad_norm": 0.8571616278383336, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19846 + }, + { + "epoch": 0.19847, + "grad_norm": 0.8407128714816388, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 19847 + }, + { + "epoch": 0.19848, + "grad_norm": 1.0110269823089972, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 19848 + }, + { + "epoch": 0.19849, + "grad_norm": 1.2944775759063354, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 19849 + }, + { + "epoch": 0.1985, + "grad_norm": 0.7967670539514138, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 19850 + }, + { + "epoch": 0.19851, + "grad_norm": 0.8151984717720282, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19851 + }, + { + "epoch": 0.19852, + "grad_norm": 0.9428974553924526, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 19852 + }, + { + "epoch": 0.19853, + "grad_norm": 1.0652837128197106, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 19853 + }, + { + "epoch": 0.19854, + "grad_norm": 0.9955810564174555, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 19854 + }, + { + "epoch": 0.19855, + "grad_norm": 1.0103477292459686, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19855 + }, + { + "epoch": 0.19856, + "grad_norm": 0.9197432984947649, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 19856 + }, + { + "epoch": 0.19857, + "grad_norm": 0.9606376147101987, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 19857 + }, + { + "epoch": 0.19858, + "grad_norm": 1.0384020553475217, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 19858 + }, + { + "epoch": 0.19859, + "grad_norm": 1.0988358737331458, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19859 + }, + { + "epoch": 0.1986, + "grad_norm": 1.3451903501613105, + "learning_rate": 0.003, + "loss": 4.079, + "step": 19860 + }, + { + "epoch": 0.19861, + "grad_norm": 0.8881360761180166, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 19861 + }, + { + "epoch": 0.19862, + "grad_norm": 0.8660717523920124, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 19862 + }, + { + "epoch": 0.19863, + "grad_norm": 0.9976570646768376, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 19863 + }, + { + "epoch": 0.19864, + "grad_norm": 0.9848723273395584, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 19864 + }, + { + "epoch": 0.19865, + "grad_norm": 0.8934798896098098, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 19865 + }, + { + "epoch": 0.19866, + "grad_norm": 0.8913654835852564, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 19866 + }, + { + "epoch": 0.19867, + "grad_norm": 0.9182084049802524, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 19867 + }, + { + "epoch": 0.19868, + "grad_norm": 0.9657264721270611, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 19868 + }, + { + "epoch": 0.19869, + "grad_norm": 0.957293500686211, + "learning_rate": 0.003, + "loss": 4.076, + "step": 19869 + }, + { + "epoch": 0.1987, + "grad_norm": 1.0365374160034644, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 19870 + }, + { + "epoch": 0.19871, + "grad_norm": 1.179155935998019, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 19871 + }, + { + "epoch": 0.19872, + "grad_norm": 0.9142861225462765, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 19872 + }, + { + "epoch": 0.19873, + "grad_norm": 0.7806002618662166, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 19873 + }, + { + "epoch": 0.19874, + "grad_norm": 0.6421677922073148, + "learning_rate": 0.003, + "loss": 4.066, + "step": 19874 + }, + { + "epoch": 0.19875, + "grad_norm": 0.5749383292252597, + "learning_rate": 0.003, + "loss": 4.042, + "step": 19875 + }, + { + "epoch": 0.19876, + "grad_norm": 0.6521724037453608, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 19876 + }, + { + "epoch": 0.19877, + "grad_norm": 0.8131557022513219, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 19877 + }, + { + "epoch": 0.19878, + "grad_norm": 0.8139523164329852, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 19878 + }, + { + "epoch": 0.19879, + "grad_norm": 0.8487393099543807, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 19879 + }, + { + "epoch": 0.1988, + "grad_norm": 0.8453183984430512, + "learning_rate": 0.003, + "loss": 4.055, + "step": 19880 + }, + { + "epoch": 0.19881, + "grad_norm": 0.8421102454998768, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 19881 + }, + { + "epoch": 0.19882, + "grad_norm": 0.71154179744596, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19882 + }, + { + "epoch": 0.19883, + "grad_norm": 0.8159281335825941, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 19883 + }, + { + "epoch": 0.19884, + "grad_norm": 1.0723499246928387, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 19884 + }, + { + "epoch": 0.19885, + "grad_norm": 1.2251720973359979, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19885 + }, + { + "epoch": 0.19886, + "grad_norm": 0.7352708327150798, + "learning_rate": 0.003, + "loss": 4.037, + "step": 19886 + }, + { + "epoch": 0.19887, + "grad_norm": 0.6677081852101372, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 19887 + }, + { + "epoch": 0.19888, + "grad_norm": 0.8420646776254377, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 19888 + }, + { + "epoch": 0.19889, + "grad_norm": 1.0181240476962217, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 19889 + }, + { + "epoch": 0.1989, + "grad_norm": 1.0490571071709465, + "learning_rate": 0.003, + "loss": 4.076, + "step": 19890 + }, + { + "epoch": 0.19891, + "grad_norm": 0.8728379339581817, + "learning_rate": 0.003, + "loss": 4.057, + "step": 19891 + }, + { + "epoch": 0.19892, + "grad_norm": 0.8813000974001235, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 19892 + }, + { + "epoch": 0.19893, + "grad_norm": 0.7264968031599618, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 19893 + }, + { + "epoch": 0.19894, + "grad_norm": 0.7250654055700293, + "learning_rate": 0.003, + "loss": 4.05, + "step": 19894 + }, + { + "epoch": 0.19895, + "grad_norm": 0.7702901389350747, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19895 + }, + { + "epoch": 0.19896, + "grad_norm": 0.8909511427953801, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 19896 + }, + { + "epoch": 0.19897, + "grad_norm": 0.9173829257595774, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 19897 + }, + { + "epoch": 0.19898, + "grad_norm": 0.8842631804974519, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 19898 + }, + { + "epoch": 0.19899, + "grad_norm": 0.7923682624901234, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 19899 + }, + { + "epoch": 0.199, + "grad_norm": 0.7539960971375723, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 19900 + }, + { + "epoch": 0.19901, + "grad_norm": 0.703005987116502, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 19901 + }, + { + "epoch": 0.19902, + "grad_norm": 0.8009496531249488, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 19902 + }, + { + "epoch": 0.19903, + "grad_norm": 0.9067062801110316, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 19903 + }, + { + "epoch": 0.19904, + "grad_norm": 1.1786764927253293, + "learning_rate": 0.003, + "loss": 4.066, + "step": 19904 + }, + { + "epoch": 0.19905, + "grad_norm": 1.1361924670089292, + "learning_rate": 0.003, + "loss": 4.067, + "step": 19905 + }, + { + "epoch": 0.19906, + "grad_norm": 1.0529521477198889, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 19906 + }, + { + "epoch": 0.19907, + "grad_norm": 1.0046040807412195, + "learning_rate": 0.003, + "loss": 4.071, + "step": 19907 + }, + { + "epoch": 0.19908, + "grad_norm": 1.0326223487037012, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 19908 + }, + { + "epoch": 0.19909, + "grad_norm": 0.8737179046502894, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 19909 + }, + { + "epoch": 0.1991, + "grad_norm": 0.890611080041377, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 19910 + }, + { + "epoch": 0.19911, + "grad_norm": 1.0069355533898776, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 19911 + }, + { + "epoch": 0.19912, + "grad_norm": 1.2056728238597598, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 19912 + }, + { + "epoch": 0.19913, + "grad_norm": 0.9177299198604985, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 19913 + }, + { + "epoch": 0.19914, + "grad_norm": 1.1338328387087382, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 19914 + }, + { + "epoch": 0.19915, + "grad_norm": 1.136958933495833, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 19915 + }, + { + "epoch": 0.19916, + "grad_norm": 0.8991149258341761, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 19916 + }, + { + "epoch": 0.19917, + "grad_norm": 0.8335027715715886, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19917 + }, + { + "epoch": 0.19918, + "grad_norm": 0.9043135139359022, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 19918 + }, + { + "epoch": 0.19919, + "grad_norm": 0.9987739405236165, + "learning_rate": 0.003, + "loss": 4.064, + "step": 19919 + }, + { + "epoch": 0.1992, + "grad_norm": 0.9957258338930796, + "learning_rate": 0.003, + "loss": 4.082, + "step": 19920 + }, + { + "epoch": 0.19921, + "grad_norm": 0.8952638043759665, + "learning_rate": 0.003, + "loss": 4.076, + "step": 19921 + }, + { + "epoch": 0.19922, + "grad_norm": 0.8106292391350349, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19922 + }, + { + "epoch": 0.19923, + "grad_norm": 0.7809987620556, + "learning_rate": 0.003, + "loss": 4.079, + "step": 19923 + }, + { + "epoch": 0.19924, + "grad_norm": 0.635369095839311, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 19924 + }, + { + "epoch": 0.19925, + "grad_norm": 0.585805181251373, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 19925 + }, + { + "epoch": 0.19926, + "grad_norm": 0.5323816818131343, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 19926 + }, + { + "epoch": 0.19927, + "grad_norm": 0.49650199814277274, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 19927 + }, + { + "epoch": 0.19928, + "grad_norm": 0.479325318995144, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 19928 + }, + { + "epoch": 0.19929, + "grad_norm": 0.5299288043658947, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 19929 + }, + { + "epoch": 0.1993, + "grad_norm": 0.6228564791277241, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 19930 + }, + { + "epoch": 0.19931, + "grad_norm": 0.8395177636113068, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 19931 + }, + { + "epoch": 0.19932, + "grad_norm": 1.2770943951623002, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 19932 + }, + { + "epoch": 0.19933, + "grad_norm": 0.7990604435104527, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 19933 + }, + { + "epoch": 0.19934, + "grad_norm": 0.7105284669586823, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 19934 + }, + { + "epoch": 0.19935, + "grad_norm": 0.861766218815007, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 19935 + }, + { + "epoch": 0.19936, + "grad_norm": 1.0003200038958466, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 19936 + }, + { + "epoch": 0.19937, + "grad_norm": 1.1483690283132395, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19937 + }, + { + "epoch": 0.19938, + "grad_norm": 0.9464365396234878, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 19938 + }, + { + "epoch": 0.19939, + "grad_norm": 0.8942595361201251, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 19939 + }, + { + "epoch": 0.1994, + "grad_norm": 0.8395261997307861, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 19940 + }, + { + "epoch": 0.19941, + "grad_norm": 0.8439607063196037, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 19941 + }, + { + "epoch": 0.19942, + "grad_norm": 0.7949157567469377, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 19942 + }, + { + "epoch": 0.19943, + "grad_norm": 0.7837447744102928, + "learning_rate": 0.003, + "loss": 4.077, + "step": 19943 + }, + { + "epoch": 0.19944, + "grad_norm": 0.8196648127077458, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 19944 + }, + { + "epoch": 0.19945, + "grad_norm": 0.847908100014604, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 19945 + }, + { + "epoch": 0.19946, + "grad_norm": 0.7954309923061973, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 19946 + }, + { + "epoch": 0.19947, + "grad_norm": 0.8829725129318641, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 19947 + }, + { + "epoch": 0.19948, + "grad_norm": 1.154184820109059, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 19948 + }, + { + "epoch": 0.19949, + "grad_norm": 1.2474068658566835, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 19949 + }, + { + "epoch": 0.1995, + "grad_norm": 0.7853262660096543, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 19950 + }, + { + "epoch": 0.19951, + "grad_norm": 0.6203693143297675, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 19951 + }, + { + "epoch": 0.19952, + "grad_norm": 0.641965893367166, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 19952 + }, + { + "epoch": 0.19953, + "grad_norm": 0.7600255241031556, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 19953 + }, + { + "epoch": 0.19954, + "grad_norm": 0.9000486167165967, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 19954 + }, + { + "epoch": 0.19955, + "grad_norm": 1.0178304320588876, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 19955 + }, + { + "epoch": 0.19956, + "grad_norm": 1.0722741876495072, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 19956 + }, + { + "epoch": 0.19957, + "grad_norm": 0.8604334677789589, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 19957 + }, + { + "epoch": 0.19958, + "grad_norm": 0.725246645570351, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 19958 + }, + { + "epoch": 0.19959, + "grad_norm": 0.8575132762287329, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 19959 + }, + { + "epoch": 0.1996, + "grad_norm": 1.0150048930227236, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 19960 + }, + { + "epoch": 0.19961, + "grad_norm": 1.0465999661583107, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 19961 + }, + { + "epoch": 0.19962, + "grad_norm": 0.954332165206631, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 19962 + }, + { + "epoch": 0.19963, + "grad_norm": 0.9826237662598315, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 19963 + }, + { + "epoch": 0.19964, + "grad_norm": 1.0495364764278192, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 19964 + }, + { + "epoch": 0.19965, + "grad_norm": 0.9288997451281783, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 19965 + }, + { + "epoch": 0.19966, + "grad_norm": 0.924625144905568, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 19966 + }, + { + "epoch": 0.19967, + "grad_norm": 0.7039083317237521, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 19967 + }, + { + "epoch": 0.19968, + "grad_norm": 0.7667125873925056, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 19968 + }, + { + "epoch": 0.19969, + "grad_norm": 0.8032642139561513, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 19969 + }, + { + "epoch": 0.1997, + "grad_norm": 0.8634620299362555, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 19970 + }, + { + "epoch": 0.19971, + "grad_norm": 0.8711120377179361, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 19971 + }, + { + "epoch": 0.19972, + "grad_norm": 0.8219934537551538, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 19972 + }, + { + "epoch": 0.19973, + "grad_norm": 0.9277090498631383, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 19973 + }, + { + "epoch": 0.19974, + "grad_norm": 1.0010322521579693, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 19974 + }, + { + "epoch": 0.19975, + "grad_norm": 1.0754248434811629, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 19975 + }, + { + "epoch": 0.19976, + "grad_norm": 0.9996759123531492, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 19976 + }, + { + "epoch": 0.19977, + "grad_norm": 1.0835730332822093, + "learning_rate": 0.003, + "loss": 4.029, + "step": 19977 + }, + { + "epoch": 0.19978, + "grad_norm": 1.1182571439294586, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 19978 + }, + { + "epoch": 0.19979, + "grad_norm": 0.9474921357436006, + "learning_rate": 0.003, + "loss": 4.064, + "step": 19979 + }, + { + "epoch": 0.1998, + "grad_norm": 1.0688651065177377, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 19980 + }, + { + "epoch": 0.19981, + "grad_norm": 1.0509467419193252, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 19981 + }, + { + "epoch": 0.19982, + "grad_norm": 1.0361447387297629, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 19982 + }, + { + "epoch": 0.19983, + "grad_norm": 1.0118352507968307, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 19983 + }, + { + "epoch": 0.19984, + "grad_norm": 0.7224112297187693, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 19984 + }, + { + "epoch": 0.19985, + "grad_norm": 0.6682364334954591, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 19985 + }, + { + "epoch": 0.19986, + "grad_norm": 0.7095733709866191, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 19986 + }, + { + "epoch": 0.19987, + "grad_norm": 0.6131987616027459, + "learning_rate": 0.003, + "loss": 4.023, + "step": 19987 + }, + { + "epoch": 0.19988, + "grad_norm": 0.6676935999382909, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 19988 + }, + { + "epoch": 0.19989, + "grad_norm": 0.7968219762064099, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 19989 + }, + { + "epoch": 0.1999, + "grad_norm": 0.9445274498411697, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 19990 + }, + { + "epoch": 0.19991, + "grad_norm": 1.0782156526501272, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 19991 + }, + { + "epoch": 0.19992, + "grad_norm": 0.8426083373781742, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 19992 + }, + { + "epoch": 0.19993, + "grad_norm": 0.8047996806530708, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 19993 + }, + { + "epoch": 0.19994, + "grad_norm": 0.856081906243602, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 19994 + }, + { + "epoch": 0.19995, + "grad_norm": 0.8403147233563304, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 19995 + }, + { + "epoch": 0.19996, + "grad_norm": 0.845467173435506, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 19996 + }, + { + "epoch": 0.19997, + "grad_norm": 0.6845837412725747, + "learning_rate": 0.003, + "loss": 4.031, + "step": 19997 + }, + { + "epoch": 0.19998, + "grad_norm": 0.6789030617477443, + "learning_rate": 0.003, + "loss": 4.065, + "step": 19998 + }, + { + "epoch": 0.19999, + "grad_norm": 0.5958043457351749, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 19999 + }, + { + "epoch": 0.2, + "grad_norm": 0.6026949561857949, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 20000 + }, + { + "epoch": 0.20001, + "grad_norm": 0.6327553979490099, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 20001 + }, + { + "epoch": 0.20002, + "grad_norm": 0.7678679790990631, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 20002 + }, + { + "epoch": 0.20003, + "grad_norm": 1.1010005031709966, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 20003 + }, + { + "epoch": 0.20004, + "grad_norm": 1.1403031258403544, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 20004 + }, + { + "epoch": 0.20005, + "grad_norm": 0.9623742236138284, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 20005 + }, + { + "epoch": 0.20006, + "grad_norm": 0.9806403495556745, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 20006 + }, + { + "epoch": 0.20007, + "grad_norm": 1.0725467394031813, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 20007 + }, + { + "epoch": 0.20008, + "grad_norm": 1.0881138286320984, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20008 + }, + { + "epoch": 0.20009, + "grad_norm": 1.0718917931095369, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 20009 + }, + { + "epoch": 0.2001, + "grad_norm": 0.914771954112482, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 20010 + }, + { + "epoch": 0.20011, + "grad_norm": 0.9102671702500844, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 20011 + }, + { + "epoch": 0.20012, + "grad_norm": 0.985939701703711, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 20012 + }, + { + "epoch": 0.20013, + "grad_norm": 1.0456172407348348, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 20013 + }, + { + "epoch": 0.20014, + "grad_norm": 0.9093804870690748, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 20014 + }, + { + "epoch": 0.20015, + "grad_norm": 0.9709485483908908, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20015 + }, + { + "epoch": 0.20016, + "grad_norm": 1.0131604480684195, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 20016 + }, + { + "epoch": 0.20017, + "grad_norm": 1.2173170340003923, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 20017 + }, + { + "epoch": 0.20018, + "grad_norm": 0.9647964027838721, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 20018 + }, + { + "epoch": 0.20019, + "grad_norm": 0.9754702722627663, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 20019 + }, + { + "epoch": 0.2002, + "grad_norm": 1.1954329106768795, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 20020 + }, + { + "epoch": 0.20021, + "grad_norm": 0.8274657349245227, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 20021 + }, + { + "epoch": 0.20022, + "grad_norm": 0.6823130708264915, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 20022 + }, + { + "epoch": 0.20023, + "grad_norm": 0.7110441144050427, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 20023 + }, + { + "epoch": 0.20024, + "grad_norm": 0.7601759184448256, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 20024 + }, + { + "epoch": 0.20025, + "grad_norm": 0.7828937579717817, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 20025 + }, + { + "epoch": 0.20026, + "grad_norm": 0.7425929457343321, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 20026 + }, + { + "epoch": 0.20027, + "grad_norm": 0.6487724460167308, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 20027 + }, + { + "epoch": 0.20028, + "grad_norm": 0.7666537863228957, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 20028 + }, + { + "epoch": 0.20029, + "grad_norm": 0.9761623573020568, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 20029 + }, + { + "epoch": 0.2003, + "grad_norm": 1.0205228779533229, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 20030 + }, + { + "epoch": 0.20031, + "grad_norm": 1.064011516280232, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 20031 + }, + { + "epoch": 0.20032, + "grad_norm": 1.070761784756568, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20032 + }, + { + "epoch": 0.20033, + "grad_norm": 0.9109460012142986, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 20033 + }, + { + "epoch": 0.20034, + "grad_norm": 0.8399057251916916, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 20034 + }, + { + "epoch": 0.20035, + "grad_norm": 0.7273898263902544, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 20035 + }, + { + "epoch": 0.20036, + "grad_norm": 0.727788067922131, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 20036 + }, + { + "epoch": 0.20037, + "grad_norm": 0.7829675425223467, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 20037 + }, + { + "epoch": 0.20038, + "grad_norm": 0.7330935305324451, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 20038 + }, + { + "epoch": 0.20039, + "grad_norm": 0.7200919666213671, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 20039 + }, + { + "epoch": 0.2004, + "grad_norm": 0.8196275115286166, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 20040 + }, + { + "epoch": 0.20041, + "grad_norm": 0.928633667376276, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20041 + }, + { + "epoch": 0.20042, + "grad_norm": 0.9762241536343469, + "learning_rate": 0.003, + "loss": 4.097, + "step": 20042 + }, + { + "epoch": 0.20043, + "grad_norm": 1.3046399047911965, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 20043 + }, + { + "epoch": 0.20044, + "grad_norm": 0.8898628750453046, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 20044 + }, + { + "epoch": 0.20045, + "grad_norm": 0.8665306581762685, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 20045 + }, + { + "epoch": 0.20046, + "grad_norm": 0.8626539112573827, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 20046 + }, + { + "epoch": 0.20047, + "grad_norm": 0.8990512212145599, + "learning_rate": 0.003, + "loss": 4.034, + "step": 20047 + }, + { + "epoch": 0.20048, + "grad_norm": 1.0182375116981142, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 20048 + }, + { + "epoch": 0.20049, + "grad_norm": 1.0085981594424702, + "learning_rate": 0.003, + "loss": 4.048, + "step": 20049 + }, + { + "epoch": 0.2005, + "grad_norm": 1.056161106794359, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 20050 + }, + { + "epoch": 0.20051, + "grad_norm": 0.980458659881766, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 20051 + }, + { + "epoch": 0.20052, + "grad_norm": 0.8637915953946603, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 20052 + }, + { + "epoch": 0.20053, + "grad_norm": 0.7982949488395766, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 20053 + }, + { + "epoch": 0.20054, + "grad_norm": 0.793427421107944, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 20054 + }, + { + "epoch": 0.20055, + "grad_norm": 0.8668356587530134, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 20055 + }, + { + "epoch": 0.20056, + "grad_norm": 0.8650457817405732, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 20056 + }, + { + "epoch": 0.20057, + "grad_norm": 0.9942552617357852, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 20057 + }, + { + "epoch": 0.20058, + "grad_norm": 1.0241433142808658, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20058 + }, + { + "epoch": 0.20059, + "grad_norm": 1.0345981695309152, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 20059 + }, + { + "epoch": 0.2006, + "grad_norm": 0.977356780743453, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 20060 + }, + { + "epoch": 0.20061, + "grad_norm": 0.8544746775243427, + "learning_rate": 0.003, + "loss": 4.077, + "step": 20061 + }, + { + "epoch": 0.20062, + "grad_norm": 0.8178153506969954, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 20062 + }, + { + "epoch": 0.20063, + "grad_norm": 0.865809879265382, + "learning_rate": 0.003, + "loss": 4.0978, + "step": 20063 + }, + { + "epoch": 0.20064, + "grad_norm": 0.832081819032671, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 20064 + }, + { + "epoch": 0.20065, + "grad_norm": 0.7994013015175561, + "learning_rate": 0.003, + "loss": 4.073, + "step": 20065 + }, + { + "epoch": 0.20066, + "grad_norm": 0.8227977247961066, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 20066 + }, + { + "epoch": 0.20067, + "grad_norm": 0.8007848694667724, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20067 + }, + { + "epoch": 0.20068, + "grad_norm": 0.9951273072755855, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 20068 + }, + { + "epoch": 0.20069, + "grad_norm": 1.2166984111567118, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 20069 + }, + { + "epoch": 0.2007, + "grad_norm": 0.9194056298310658, + "learning_rate": 0.003, + "loss": 4.088, + "step": 20070 + }, + { + "epoch": 0.20071, + "grad_norm": 1.2245614817544845, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20071 + }, + { + "epoch": 0.20072, + "grad_norm": 1.0794408169196437, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 20072 + }, + { + "epoch": 0.20073, + "grad_norm": 1.0177467952505859, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 20073 + }, + { + "epoch": 0.20074, + "grad_norm": 0.8641970977521174, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 20074 + }, + { + "epoch": 0.20075, + "grad_norm": 0.800882456548685, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 20075 + }, + { + "epoch": 0.20076, + "grad_norm": 0.8988311905748049, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 20076 + }, + { + "epoch": 0.20077, + "grad_norm": 0.966361374738778, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 20077 + }, + { + "epoch": 0.20078, + "grad_norm": 1.096992534192089, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 20078 + }, + { + "epoch": 0.20079, + "grad_norm": 0.9949261719149465, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 20079 + }, + { + "epoch": 0.2008, + "grad_norm": 0.9586607130399144, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 20080 + }, + { + "epoch": 0.20081, + "grad_norm": 0.9217083099859508, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 20081 + }, + { + "epoch": 0.20082, + "grad_norm": 0.8472852150035282, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 20082 + }, + { + "epoch": 0.20083, + "grad_norm": 0.7953677107390348, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 20083 + }, + { + "epoch": 0.20084, + "grad_norm": 0.680053715728845, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 20084 + }, + { + "epoch": 0.20085, + "grad_norm": 0.6585833058754239, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 20085 + }, + { + "epoch": 0.20086, + "grad_norm": 0.6413314084931637, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 20086 + }, + { + "epoch": 0.20087, + "grad_norm": 0.7212903936324376, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 20087 + }, + { + "epoch": 0.20088, + "grad_norm": 0.7845166610714689, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 20088 + }, + { + "epoch": 0.20089, + "grad_norm": 0.8679374978768807, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 20089 + }, + { + "epoch": 0.2009, + "grad_norm": 1.1454127256309485, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 20090 + }, + { + "epoch": 0.20091, + "grad_norm": 1.0632061020326555, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 20091 + }, + { + "epoch": 0.20092, + "grad_norm": 0.9064590068000318, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 20092 + }, + { + "epoch": 0.20093, + "grad_norm": 0.7263723515361191, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 20093 + }, + { + "epoch": 0.20094, + "grad_norm": 0.692241184637872, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 20094 + }, + { + "epoch": 0.20095, + "grad_norm": 0.7333801983406207, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 20095 + }, + { + "epoch": 0.20096, + "grad_norm": 0.794351284519141, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 20096 + }, + { + "epoch": 0.20097, + "grad_norm": 0.8389976820726525, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 20097 + }, + { + "epoch": 0.20098, + "grad_norm": 0.9503021293429343, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 20098 + }, + { + "epoch": 0.20099, + "grad_norm": 0.8926167320432626, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 20099 + }, + { + "epoch": 0.201, + "grad_norm": 1.0433992107615353, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 20100 + }, + { + "epoch": 0.20101, + "grad_norm": 1.0482758149776457, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 20101 + }, + { + "epoch": 0.20102, + "grad_norm": 1.0039374065501359, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 20102 + }, + { + "epoch": 0.20103, + "grad_norm": 1.1933143529367924, + "learning_rate": 0.003, + "loss": 4.072, + "step": 20103 + }, + { + "epoch": 0.20104, + "grad_norm": 0.9896487352504916, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 20104 + }, + { + "epoch": 0.20105, + "grad_norm": 1.2045369043709517, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 20105 + }, + { + "epoch": 0.20106, + "grad_norm": 0.9997509169271485, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 20106 + }, + { + "epoch": 0.20107, + "grad_norm": 1.062355001915111, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20107 + }, + { + "epoch": 0.20108, + "grad_norm": 0.9383116181994937, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 20108 + }, + { + "epoch": 0.20109, + "grad_norm": 0.802970277897396, + "learning_rate": 0.003, + "loss": 4.075, + "step": 20109 + }, + { + "epoch": 0.2011, + "grad_norm": 0.8553015733607501, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 20110 + }, + { + "epoch": 0.20111, + "grad_norm": 0.9761260316599434, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 20111 + }, + { + "epoch": 0.20112, + "grad_norm": 1.1648781140489337, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 20112 + }, + { + "epoch": 0.20113, + "grad_norm": 0.884934332495618, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20113 + }, + { + "epoch": 0.20114, + "grad_norm": 0.6920056517435634, + "learning_rate": 0.003, + "loss": 4.045, + "step": 20114 + }, + { + "epoch": 0.20115, + "grad_norm": 0.6035490806756311, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 20115 + }, + { + "epoch": 0.20116, + "grad_norm": 0.5843184361926623, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 20116 + }, + { + "epoch": 0.20117, + "grad_norm": 0.7327969657916124, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 20117 + }, + { + "epoch": 0.20118, + "grad_norm": 0.8113752494147356, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 20118 + }, + { + "epoch": 0.20119, + "grad_norm": 0.9418203424481393, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20119 + }, + { + "epoch": 0.2012, + "grad_norm": 1.0395802115500303, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 20120 + }, + { + "epoch": 0.20121, + "grad_norm": 0.9336540311815786, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 20121 + }, + { + "epoch": 0.20122, + "grad_norm": 0.8491760428753012, + "learning_rate": 0.003, + "loss": 4.089, + "step": 20122 + }, + { + "epoch": 0.20123, + "grad_norm": 0.7735078244816492, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 20123 + }, + { + "epoch": 0.20124, + "grad_norm": 0.7511367894849116, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 20124 + }, + { + "epoch": 0.20125, + "grad_norm": 0.6969386030563182, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 20125 + }, + { + "epoch": 0.20126, + "grad_norm": 0.6999763773198828, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 20126 + }, + { + "epoch": 0.20127, + "grad_norm": 0.7262174914142938, + "learning_rate": 0.003, + "loss": 4.042, + "step": 20127 + }, + { + "epoch": 0.20128, + "grad_norm": 0.8739883979365946, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 20128 + }, + { + "epoch": 0.20129, + "grad_norm": 0.972758276404007, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 20129 + }, + { + "epoch": 0.2013, + "grad_norm": 1.0737773194009104, + "learning_rate": 0.003, + "loss": 4.08, + "step": 20130 + }, + { + "epoch": 0.20131, + "grad_norm": 0.9937402491948394, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 20131 + }, + { + "epoch": 0.20132, + "grad_norm": 0.9111727913298527, + "learning_rate": 0.003, + "loss": 4.076, + "step": 20132 + }, + { + "epoch": 0.20133, + "grad_norm": 0.7512459276066992, + "learning_rate": 0.003, + "loss": 4.049, + "step": 20133 + }, + { + "epoch": 0.20134, + "grad_norm": 0.7969020965012391, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 20134 + }, + { + "epoch": 0.20135, + "grad_norm": 0.7631114134969357, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20135 + }, + { + "epoch": 0.20136, + "grad_norm": 0.8350155763338005, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20136 + }, + { + "epoch": 0.20137, + "grad_norm": 0.9764309152862, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 20137 + }, + { + "epoch": 0.20138, + "grad_norm": 1.0119550487547355, + "learning_rate": 0.003, + "loss": 4.073, + "step": 20138 + }, + { + "epoch": 0.20139, + "grad_norm": 0.951657988230318, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 20139 + }, + { + "epoch": 0.2014, + "grad_norm": 0.9788969500181236, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 20140 + }, + { + "epoch": 0.20141, + "grad_norm": 0.9689570898898988, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 20141 + }, + { + "epoch": 0.20142, + "grad_norm": 1.300572130299451, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 20142 + }, + { + "epoch": 0.20143, + "grad_norm": 1.006274161308575, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 20143 + }, + { + "epoch": 0.20144, + "grad_norm": 0.8999385868838741, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 20144 + }, + { + "epoch": 0.20145, + "grad_norm": 0.9353114000308074, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 20145 + }, + { + "epoch": 0.20146, + "grad_norm": 1.0665073005313308, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 20146 + }, + { + "epoch": 0.20147, + "grad_norm": 1.0174843957750728, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 20147 + }, + { + "epoch": 0.20148, + "grad_norm": 0.9992685927973987, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 20148 + }, + { + "epoch": 0.20149, + "grad_norm": 0.9767025424438643, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20149 + }, + { + "epoch": 0.2015, + "grad_norm": 1.001394768461815, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 20150 + }, + { + "epoch": 0.20151, + "grad_norm": 1.1170636149419098, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 20151 + }, + { + "epoch": 0.20152, + "grad_norm": 0.9104064337618925, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 20152 + }, + { + "epoch": 0.20153, + "grad_norm": 0.8642714880219691, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 20153 + }, + { + "epoch": 0.20154, + "grad_norm": 1.088692879262389, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 20154 + }, + { + "epoch": 0.20155, + "grad_norm": 1.0857414998387973, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 20155 + }, + { + "epoch": 0.20156, + "grad_norm": 1.157432911663517, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 20156 + }, + { + "epoch": 0.20157, + "grad_norm": 0.8833071071180393, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 20157 + }, + { + "epoch": 0.20158, + "grad_norm": 0.7915807017373842, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 20158 + }, + { + "epoch": 0.20159, + "grad_norm": 0.7512532350134129, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 20159 + }, + { + "epoch": 0.2016, + "grad_norm": 0.6932030260042161, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 20160 + }, + { + "epoch": 0.20161, + "grad_norm": 0.7001222311666129, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 20161 + }, + { + "epoch": 0.20162, + "grad_norm": 0.739884028849956, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 20162 + }, + { + "epoch": 0.20163, + "grad_norm": 0.8449774946380476, + "learning_rate": 0.003, + "loss": 4.07, + "step": 20163 + }, + { + "epoch": 0.20164, + "grad_norm": 0.9717015055538465, + "learning_rate": 0.003, + "loss": 4.098, + "step": 20164 + }, + { + "epoch": 0.20165, + "grad_norm": 1.1412913747142812, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 20165 + }, + { + "epoch": 0.20166, + "grad_norm": 1.0779748856212756, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 20166 + }, + { + "epoch": 0.20167, + "grad_norm": 1.0759243555954132, + "learning_rate": 0.003, + "loss": 4.04, + "step": 20167 + }, + { + "epoch": 0.20168, + "grad_norm": 1.0073448137041712, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 20168 + }, + { + "epoch": 0.20169, + "grad_norm": 0.9629958690270425, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 20169 + }, + { + "epoch": 0.2017, + "grad_norm": 0.9124721385627201, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 20170 + }, + { + "epoch": 0.20171, + "grad_norm": 0.8481950211652999, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 20171 + }, + { + "epoch": 0.20172, + "grad_norm": 0.8660288587365784, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 20172 + }, + { + "epoch": 0.20173, + "grad_norm": 0.9027104326908204, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 20173 + }, + { + "epoch": 0.20174, + "grad_norm": 0.854115571736065, + "learning_rate": 0.003, + "loss": 4.033, + "step": 20174 + }, + { + "epoch": 0.20175, + "grad_norm": 0.922048558283872, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 20175 + }, + { + "epoch": 0.20176, + "grad_norm": 0.8879094371694463, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 20176 + }, + { + "epoch": 0.20177, + "grad_norm": 0.8079154857394067, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 20177 + }, + { + "epoch": 0.20178, + "grad_norm": 0.70027015338177, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 20178 + }, + { + "epoch": 0.20179, + "grad_norm": 0.6990237671533888, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 20179 + }, + { + "epoch": 0.2018, + "grad_norm": 0.719796276637802, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 20180 + }, + { + "epoch": 0.20181, + "grad_norm": 0.7717792879200722, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 20181 + }, + { + "epoch": 0.20182, + "grad_norm": 0.8100546162235994, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20182 + }, + { + "epoch": 0.20183, + "grad_norm": 0.8883890842393081, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 20183 + }, + { + "epoch": 0.20184, + "grad_norm": 1.0540406723671027, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 20184 + }, + { + "epoch": 0.20185, + "grad_norm": 1.073450857768077, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 20185 + }, + { + "epoch": 0.20186, + "grad_norm": 1.0054829814140989, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 20186 + }, + { + "epoch": 0.20187, + "grad_norm": 1.035973238825491, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 20187 + }, + { + "epoch": 0.20188, + "grad_norm": 0.8482124744198021, + "learning_rate": 0.003, + "loss": 4.075, + "step": 20188 + }, + { + "epoch": 0.20189, + "grad_norm": 0.8146430944762663, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 20189 + }, + { + "epoch": 0.2019, + "grad_norm": 0.6385110414005808, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 20190 + }, + { + "epoch": 0.20191, + "grad_norm": 0.6441925369623385, + "learning_rate": 0.003, + "loss": 4.075, + "step": 20191 + }, + { + "epoch": 0.20192, + "grad_norm": 0.6724475842316535, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 20192 + }, + { + "epoch": 0.20193, + "grad_norm": 0.6739740191097858, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 20193 + }, + { + "epoch": 0.20194, + "grad_norm": 0.6068180262602129, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20194 + }, + { + "epoch": 0.20195, + "grad_norm": 0.6362317453360848, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 20195 + }, + { + "epoch": 0.20196, + "grad_norm": 0.6490550355193817, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 20196 + }, + { + "epoch": 0.20197, + "grad_norm": 0.7900011403127508, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 20197 + }, + { + "epoch": 0.20198, + "grad_norm": 0.9316803893003885, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 20198 + }, + { + "epoch": 0.20199, + "grad_norm": 1.1392797863745374, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 20199 + }, + { + "epoch": 0.202, + "grad_norm": 0.8636391596598383, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 20200 + }, + { + "epoch": 0.20201, + "grad_norm": 0.7846545207634401, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 20201 + }, + { + "epoch": 0.20202, + "grad_norm": 0.7852673460348296, + "learning_rate": 0.003, + "loss": 4.091, + "step": 20202 + }, + { + "epoch": 0.20203, + "grad_norm": 0.8523936286459818, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 20203 + }, + { + "epoch": 0.20204, + "grad_norm": 1.0220483032405032, + "learning_rate": 0.003, + "loss": 4.079, + "step": 20204 + }, + { + "epoch": 0.20205, + "grad_norm": 1.157570379991195, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 20205 + }, + { + "epoch": 0.20206, + "grad_norm": 1.0744460553145834, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 20206 + }, + { + "epoch": 0.20207, + "grad_norm": 1.039895689176448, + "learning_rate": 0.003, + "loss": 4.049, + "step": 20207 + }, + { + "epoch": 0.20208, + "grad_norm": 1.0432754314459542, + "learning_rate": 0.003, + "loss": 4.038, + "step": 20208 + }, + { + "epoch": 0.20209, + "grad_norm": 1.1489355832747565, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 20209 + }, + { + "epoch": 0.2021, + "grad_norm": 1.0043874100022252, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20210 + }, + { + "epoch": 0.20211, + "grad_norm": 0.9361477626640528, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 20211 + }, + { + "epoch": 0.20212, + "grad_norm": 0.937678561593136, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 20212 + }, + { + "epoch": 0.20213, + "grad_norm": 0.9656190352445899, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 20213 + }, + { + "epoch": 0.20214, + "grad_norm": 1.002992366793816, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 20214 + }, + { + "epoch": 0.20215, + "grad_norm": 1.11072175656836, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 20215 + }, + { + "epoch": 0.20216, + "grad_norm": 0.8193804166077443, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 20216 + }, + { + "epoch": 0.20217, + "grad_norm": 0.8839116205628246, + "learning_rate": 0.003, + "loss": 4.047, + "step": 20217 + }, + { + "epoch": 0.20218, + "grad_norm": 0.836534442731943, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 20218 + }, + { + "epoch": 0.20219, + "grad_norm": 0.9775258920919472, + "learning_rate": 0.003, + "loss": 4.045, + "step": 20219 + }, + { + "epoch": 0.2022, + "grad_norm": 1.0743878029089904, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 20220 + }, + { + "epoch": 0.20221, + "grad_norm": 0.7787746213572123, + "learning_rate": 0.003, + "loss": 4.063, + "step": 20221 + }, + { + "epoch": 0.20222, + "grad_norm": 0.7756070732132835, + "learning_rate": 0.003, + "loss": 4.058, + "step": 20222 + }, + { + "epoch": 0.20223, + "grad_norm": 0.8074855184620184, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 20223 + }, + { + "epoch": 0.20224, + "grad_norm": 0.8148244426685318, + "learning_rate": 0.003, + "loss": 4.0937, + "step": 20224 + }, + { + "epoch": 0.20225, + "grad_norm": 0.9368431478590745, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 20225 + }, + { + "epoch": 0.20226, + "grad_norm": 1.2806381041315096, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 20226 + }, + { + "epoch": 0.20227, + "grad_norm": 0.9253337166534994, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 20227 + }, + { + "epoch": 0.20228, + "grad_norm": 0.8986734494879194, + "learning_rate": 0.003, + "loss": 4.037, + "step": 20228 + }, + { + "epoch": 0.20229, + "grad_norm": 0.8509111303238335, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 20229 + }, + { + "epoch": 0.2023, + "grad_norm": 0.7731995600417705, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 20230 + }, + { + "epoch": 0.20231, + "grad_norm": 0.7066170308016798, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 20231 + }, + { + "epoch": 0.20232, + "grad_norm": 0.7137967052910865, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 20232 + }, + { + "epoch": 0.20233, + "grad_norm": 0.8261074430325647, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 20233 + }, + { + "epoch": 0.20234, + "grad_norm": 0.9237243484655616, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 20234 + }, + { + "epoch": 0.20235, + "grad_norm": 0.9572598754813575, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 20235 + }, + { + "epoch": 0.20236, + "grad_norm": 0.8782449084523822, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20236 + }, + { + "epoch": 0.20237, + "grad_norm": 0.8087711252183536, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 20237 + }, + { + "epoch": 0.20238, + "grad_norm": 0.6463272323360605, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 20238 + }, + { + "epoch": 0.20239, + "grad_norm": 0.715465232808237, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 20239 + }, + { + "epoch": 0.2024, + "grad_norm": 0.6632546418356409, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 20240 + }, + { + "epoch": 0.20241, + "grad_norm": 0.7370390154758478, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 20241 + }, + { + "epoch": 0.20242, + "grad_norm": 0.8813729339328653, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 20242 + }, + { + "epoch": 0.20243, + "grad_norm": 0.975946981254012, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 20243 + }, + { + "epoch": 0.20244, + "grad_norm": 1.0454259863575712, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 20244 + }, + { + "epoch": 0.20245, + "grad_norm": 0.9757392887520446, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 20245 + }, + { + "epoch": 0.20246, + "grad_norm": 0.9733053043852302, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 20246 + }, + { + "epoch": 0.20247, + "grad_norm": 0.9515100434968863, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 20247 + }, + { + "epoch": 0.20248, + "grad_norm": 1.0458783512042407, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 20248 + }, + { + "epoch": 0.20249, + "grad_norm": 0.8950611747541216, + "learning_rate": 0.003, + "loss": 4.063, + "step": 20249 + }, + { + "epoch": 0.2025, + "grad_norm": 0.9453026772316225, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 20250 + }, + { + "epoch": 0.20251, + "grad_norm": 0.9299160323287975, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 20251 + }, + { + "epoch": 0.20252, + "grad_norm": 0.8638490249225725, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 20252 + }, + { + "epoch": 0.20253, + "grad_norm": 0.8863765910978502, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 20253 + }, + { + "epoch": 0.20254, + "grad_norm": 1.0156510446622808, + "learning_rate": 0.003, + "loss": 4.1059, + "step": 20254 + }, + { + "epoch": 0.20255, + "grad_norm": 1.2454649637167758, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20255 + }, + { + "epoch": 0.20256, + "grad_norm": 0.9437518589174526, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 20256 + }, + { + "epoch": 0.20257, + "grad_norm": 1.2598994010738565, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 20257 + }, + { + "epoch": 0.20258, + "grad_norm": 0.8993262476436259, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 20258 + }, + { + "epoch": 0.20259, + "grad_norm": 0.9342959878453537, + "learning_rate": 0.003, + "loss": 4.073, + "step": 20259 + }, + { + "epoch": 0.2026, + "grad_norm": 1.20464026324837, + "learning_rate": 0.003, + "loss": 4.074, + "step": 20260 + }, + { + "epoch": 0.20261, + "grad_norm": 1.1732592843323337, + "learning_rate": 0.003, + "loss": 4.0989, + "step": 20261 + }, + { + "epoch": 0.20262, + "grad_norm": 1.0258714972568241, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 20262 + }, + { + "epoch": 0.20263, + "grad_norm": 1.0385998257771591, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20263 + }, + { + "epoch": 0.20264, + "grad_norm": 0.8650415025419821, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 20264 + }, + { + "epoch": 0.20265, + "grad_norm": 0.8201303882430809, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 20265 + }, + { + "epoch": 0.20266, + "grad_norm": 0.6738704437704791, + "learning_rate": 0.003, + "loss": 4.046, + "step": 20266 + }, + { + "epoch": 0.20267, + "grad_norm": 0.6896656664142933, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 20267 + }, + { + "epoch": 0.20268, + "grad_norm": 0.7790935277931689, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 20268 + }, + { + "epoch": 0.20269, + "grad_norm": 0.8953200803606676, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20269 + }, + { + "epoch": 0.2027, + "grad_norm": 0.8901116875320236, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 20270 + }, + { + "epoch": 0.20271, + "grad_norm": 1.0079781240388708, + "learning_rate": 0.003, + "loss": 4.087, + "step": 20271 + }, + { + "epoch": 0.20272, + "grad_norm": 1.112553755440471, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 20272 + }, + { + "epoch": 0.20273, + "grad_norm": 1.0474804000177926, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 20273 + }, + { + "epoch": 0.20274, + "grad_norm": 0.9931938834025967, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 20274 + }, + { + "epoch": 0.20275, + "grad_norm": 0.9872937962963939, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 20275 + }, + { + "epoch": 0.20276, + "grad_norm": 1.0901152333849533, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20276 + }, + { + "epoch": 0.20277, + "grad_norm": 0.9215653619752749, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 20277 + }, + { + "epoch": 0.20278, + "grad_norm": 0.7634688018371093, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 20278 + }, + { + "epoch": 0.20279, + "grad_norm": 0.7297325604080956, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 20279 + }, + { + "epoch": 0.2028, + "grad_norm": 0.69564024170904, + "learning_rate": 0.003, + "loss": 4.016, + "step": 20280 + }, + { + "epoch": 0.20281, + "grad_norm": 0.7179749761523376, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 20281 + }, + { + "epoch": 0.20282, + "grad_norm": 0.6844268746086973, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 20282 + }, + { + "epoch": 0.20283, + "grad_norm": 0.5854488752875975, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 20283 + }, + { + "epoch": 0.20284, + "grad_norm": 0.5963854325655069, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 20284 + }, + { + "epoch": 0.20285, + "grad_norm": 0.6098938917123762, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 20285 + }, + { + "epoch": 0.20286, + "grad_norm": 0.5962729372147907, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 20286 + }, + { + "epoch": 0.20287, + "grad_norm": 0.6072888650026789, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 20287 + }, + { + "epoch": 0.20288, + "grad_norm": 0.6699623530934661, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 20288 + }, + { + "epoch": 0.20289, + "grad_norm": 0.5807723898612621, + "learning_rate": 0.003, + "loss": 4.059, + "step": 20289 + }, + { + "epoch": 0.2029, + "grad_norm": 0.6953945688022403, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 20290 + }, + { + "epoch": 0.20291, + "grad_norm": 0.8432955118802364, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 20291 + }, + { + "epoch": 0.20292, + "grad_norm": 0.7813820299037514, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 20292 + }, + { + "epoch": 0.20293, + "grad_norm": 0.7398672292874032, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 20293 + }, + { + "epoch": 0.20294, + "grad_norm": 0.9661154390823129, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 20294 + }, + { + "epoch": 0.20295, + "grad_norm": 1.4038009561188358, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 20295 + }, + { + "epoch": 0.20296, + "grad_norm": 1.0482101762314422, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 20296 + }, + { + "epoch": 0.20297, + "grad_norm": 1.0796339006398183, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 20297 + }, + { + "epoch": 0.20298, + "grad_norm": 1.0218724339767467, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 20298 + }, + { + "epoch": 0.20299, + "grad_norm": 1.014017049916483, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 20299 + }, + { + "epoch": 0.203, + "grad_norm": 0.9514457008536702, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 20300 + }, + { + "epoch": 0.20301, + "grad_norm": 0.9451901872687277, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 20301 + }, + { + "epoch": 0.20302, + "grad_norm": 0.9336624962525484, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 20302 + }, + { + "epoch": 0.20303, + "grad_norm": 0.9104782036823139, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 20303 + }, + { + "epoch": 0.20304, + "grad_norm": 0.8793264935377031, + "learning_rate": 0.003, + "loss": 4.047, + "step": 20304 + }, + { + "epoch": 0.20305, + "grad_norm": 0.920158024134499, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 20305 + }, + { + "epoch": 0.20306, + "grad_norm": 0.9829195665231453, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 20306 + }, + { + "epoch": 0.20307, + "grad_norm": 0.9827866420816818, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 20307 + }, + { + "epoch": 0.20308, + "grad_norm": 1.009209299906646, + "learning_rate": 0.003, + "loss": 4.06, + "step": 20308 + }, + { + "epoch": 0.20309, + "grad_norm": 0.9132899948677269, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 20309 + }, + { + "epoch": 0.2031, + "grad_norm": 0.830284613812082, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 20310 + }, + { + "epoch": 0.20311, + "grad_norm": 0.9944728139270856, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 20311 + }, + { + "epoch": 0.20312, + "grad_norm": 1.058719052336157, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 20312 + }, + { + "epoch": 0.20313, + "grad_norm": 0.9071972229971375, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 20313 + }, + { + "epoch": 0.20314, + "grad_norm": 0.9422127166926286, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 20314 + }, + { + "epoch": 0.20315, + "grad_norm": 0.9936909107378495, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 20315 + }, + { + "epoch": 0.20316, + "grad_norm": 1.162536321472046, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 20316 + }, + { + "epoch": 0.20317, + "grad_norm": 0.9194171985720333, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20317 + }, + { + "epoch": 0.20318, + "grad_norm": 0.9935602341141521, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20318 + }, + { + "epoch": 0.20319, + "grad_norm": 1.0851518539442337, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20319 + }, + { + "epoch": 0.2032, + "grad_norm": 1.035486145364161, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 20320 + }, + { + "epoch": 0.20321, + "grad_norm": 1.091192956432024, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20321 + }, + { + "epoch": 0.20322, + "grad_norm": 1.0004206355223928, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 20322 + }, + { + "epoch": 0.20323, + "grad_norm": 1.2288960990384825, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 20323 + }, + { + "epoch": 0.20324, + "grad_norm": 0.8304791348400947, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 20324 + }, + { + "epoch": 0.20325, + "grad_norm": 0.8064271454877502, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 20325 + }, + { + "epoch": 0.20326, + "grad_norm": 0.800986609943331, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 20326 + }, + { + "epoch": 0.20327, + "grad_norm": 0.6831816181323651, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 20327 + }, + { + "epoch": 0.20328, + "grad_norm": 0.6698244473488945, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 20328 + }, + { + "epoch": 0.20329, + "grad_norm": 0.6836107974459831, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 20329 + }, + { + "epoch": 0.2033, + "grad_norm": 0.6670341468558074, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 20330 + }, + { + "epoch": 0.20331, + "grad_norm": 0.6964106515599698, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 20331 + }, + { + "epoch": 0.20332, + "grad_norm": 0.8326064308556393, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 20332 + }, + { + "epoch": 0.20333, + "grad_norm": 0.8854471703948902, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 20333 + }, + { + "epoch": 0.20334, + "grad_norm": 0.9832363699873181, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 20334 + }, + { + "epoch": 0.20335, + "grad_norm": 0.9749256269296941, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 20335 + }, + { + "epoch": 0.20336, + "grad_norm": 0.9308474509699046, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 20336 + }, + { + "epoch": 0.20337, + "grad_norm": 0.8646646010288128, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 20337 + }, + { + "epoch": 0.20338, + "grad_norm": 0.7213441182344377, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 20338 + }, + { + "epoch": 0.20339, + "grad_norm": 0.8554332295457631, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 20339 + }, + { + "epoch": 0.2034, + "grad_norm": 1.150390161504409, + "learning_rate": 0.003, + "loss": 4.0891, + "step": 20340 + }, + { + "epoch": 0.20341, + "grad_norm": 0.9841655065329525, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 20341 + }, + { + "epoch": 0.20342, + "grad_norm": 0.9712187061482087, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 20342 + }, + { + "epoch": 0.20343, + "grad_norm": 1.1508129114703876, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 20343 + }, + { + "epoch": 0.20344, + "grad_norm": 0.8801541375228003, + "learning_rate": 0.003, + "loss": 4.051, + "step": 20344 + }, + { + "epoch": 0.20345, + "grad_norm": 0.8338339714908491, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 20345 + }, + { + "epoch": 0.20346, + "grad_norm": 0.8981142243378957, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 20346 + }, + { + "epoch": 0.20347, + "grad_norm": 1.10750992003621, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 20347 + }, + { + "epoch": 0.20348, + "grad_norm": 0.9262372518515493, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 20348 + }, + { + "epoch": 0.20349, + "grad_norm": 0.9083627863692587, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 20349 + }, + { + "epoch": 0.2035, + "grad_norm": 0.9669370176657444, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 20350 + }, + { + "epoch": 0.20351, + "grad_norm": 0.9371760903200274, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 20351 + }, + { + "epoch": 0.20352, + "grad_norm": 1.0764525814282575, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 20352 + }, + { + "epoch": 0.20353, + "grad_norm": 0.9451992015799721, + "learning_rate": 0.003, + "loss": 4.059, + "step": 20353 + }, + { + "epoch": 0.20354, + "grad_norm": 0.9475795627950271, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 20354 + }, + { + "epoch": 0.20355, + "grad_norm": 0.8889366126293408, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 20355 + }, + { + "epoch": 0.20356, + "grad_norm": 0.8715422943535401, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 20356 + }, + { + "epoch": 0.20357, + "grad_norm": 0.8724064789591822, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 20357 + }, + { + "epoch": 0.20358, + "grad_norm": 0.7577581334178127, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 20358 + }, + { + "epoch": 0.20359, + "grad_norm": 0.849673720907657, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 20359 + }, + { + "epoch": 0.2036, + "grad_norm": 0.9464323635169056, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20360 + }, + { + "epoch": 0.20361, + "grad_norm": 1.0132644342636135, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 20361 + }, + { + "epoch": 0.20362, + "grad_norm": 1.113709142377863, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 20362 + }, + { + "epoch": 0.20363, + "grad_norm": 1.096318058425732, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 20363 + }, + { + "epoch": 0.20364, + "grad_norm": 0.8918511224263473, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 20364 + }, + { + "epoch": 0.20365, + "grad_norm": 0.8814368778264271, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 20365 + }, + { + "epoch": 0.20366, + "grad_norm": 1.008547111121039, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20366 + }, + { + "epoch": 0.20367, + "grad_norm": 0.9860100692413479, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 20367 + }, + { + "epoch": 0.20368, + "grad_norm": 0.8759594785251976, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 20368 + }, + { + "epoch": 0.20369, + "grad_norm": 0.9018001691983991, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 20369 + }, + { + "epoch": 0.2037, + "grad_norm": 0.9071969496138035, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 20370 + }, + { + "epoch": 0.20371, + "grad_norm": 0.9574620976549254, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 20371 + }, + { + "epoch": 0.20372, + "grad_norm": 1.1102602667409915, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 20372 + }, + { + "epoch": 0.20373, + "grad_norm": 0.8446084170221155, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 20373 + }, + { + "epoch": 0.20374, + "grad_norm": 0.7107658690378214, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 20374 + }, + { + "epoch": 0.20375, + "grad_norm": 0.6622757320982561, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 20375 + }, + { + "epoch": 0.20376, + "grad_norm": 0.7660040640261622, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 20376 + }, + { + "epoch": 0.20377, + "grad_norm": 0.8934638908111394, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 20377 + }, + { + "epoch": 0.20378, + "grad_norm": 1.0627034179668389, + "learning_rate": 0.003, + "loss": 4.06, + "step": 20378 + }, + { + "epoch": 0.20379, + "grad_norm": 1.0354579035375133, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 20379 + }, + { + "epoch": 0.2038, + "grad_norm": 0.9626189025237234, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 20380 + }, + { + "epoch": 0.20381, + "grad_norm": 0.8320616410557015, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 20381 + }, + { + "epoch": 0.20382, + "grad_norm": 0.7570190863218998, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 20382 + }, + { + "epoch": 0.20383, + "grad_norm": 0.8588888356312102, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 20383 + }, + { + "epoch": 0.20384, + "grad_norm": 0.8352489656803421, + "learning_rate": 0.003, + "loss": 4.089, + "step": 20384 + }, + { + "epoch": 0.20385, + "grad_norm": 0.8449320318114233, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 20385 + }, + { + "epoch": 0.20386, + "grad_norm": 0.9043727654539504, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 20386 + }, + { + "epoch": 0.20387, + "grad_norm": 0.9262840876713996, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20387 + }, + { + "epoch": 0.20388, + "grad_norm": 0.7633299488947757, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 20388 + }, + { + "epoch": 0.20389, + "grad_norm": 0.7089061109988785, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 20389 + }, + { + "epoch": 0.2039, + "grad_norm": 0.6852516371589236, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 20390 + }, + { + "epoch": 0.20391, + "grad_norm": 0.6890443833152825, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 20391 + }, + { + "epoch": 0.20392, + "grad_norm": 0.6950555176149104, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 20392 + }, + { + "epoch": 0.20393, + "grad_norm": 0.7488331772603604, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 20393 + }, + { + "epoch": 0.20394, + "grad_norm": 0.9987496497416537, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20394 + }, + { + "epoch": 0.20395, + "grad_norm": 1.4270099794959688, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 20395 + }, + { + "epoch": 0.20396, + "grad_norm": 0.6885336538394853, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 20396 + }, + { + "epoch": 0.20397, + "grad_norm": 0.7090730847797369, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 20397 + }, + { + "epoch": 0.20398, + "grad_norm": 0.6994073842065481, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 20398 + }, + { + "epoch": 0.20399, + "grad_norm": 0.8458455248697703, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 20399 + }, + { + "epoch": 0.204, + "grad_norm": 0.9695888822578032, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 20400 + }, + { + "epoch": 0.20401, + "grad_norm": 1.078946888911514, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 20401 + }, + { + "epoch": 0.20402, + "grad_norm": 1.1196792709555141, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 20402 + }, + { + "epoch": 0.20403, + "grad_norm": 1.3348823087617145, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 20403 + }, + { + "epoch": 0.20404, + "grad_norm": 0.8009892149025997, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 20404 + }, + { + "epoch": 0.20405, + "grad_norm": 0.7065100835983656, + "learning_rate": 0.003, + "loss": 4.038, + "step": 20405 + }, + { + "epoch": 0.20406, + "grad_norm": 0.7573713286699112, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 20406 + }, + { + "epoch": 0.20407, + "grad_norm": 0.6967802330299365, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20407 + }, + { + "epoch": 0.20408, + "grad_norm": 0.8271654350425968, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 20408 + }, + { + "epoch": 0.20409, + "grad_norm": 0.9871896047370817, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 20409 + }, + { + "epoch": 0.2041, + "grad_norm": 1.0198483403852818, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 20410 + }, + { + "epoch": 0.20411, + "grad_norm": 0.9487269969845029, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 20411 + }, + { + "epoch": 0.20412, + "grad_norm": 0.8475615412902817, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 20412 + }, + { + "epoch": 0.20413, + "grad_norm": 0.759605745291794, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20413 + }, + { + "epoch": 0.20414, + "grad_norm": 0.695891917218297, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 20414 + }, + { + "epoch": 0.20415, + "grad_norm": 0.6323612963788035, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 20415 + }, + { + "epoch": 0.20416, + "grad_norm": 0.8703389776793776, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 20416 + }, + { + "epoch": 0.20417, + "grad_norm": 1.3567765720103246, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 20417 + }, + { + "epoch": 0.20418, + "grad_norm": 0.9088617391430327, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 20418 + }, + { + "epoch": 0.20419, + "grad_norm": 0.9305037687687804, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 20419 + }, + { + "epoch": 0.2042, + "grad_norm": 0.9393247490018447, + "learning_rate": 0.003, + "loss": 4.047, + "step": 20420 + }, + { + "epoch": 0.20421, + "grad_norm": 1.084251376364569, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 20421 + }, + { + "epoch": 0.20422, + "grad_norm": 1.0480779731881558, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 20422 + }, + { + "epoch": 0.20423, + "grad_norm": 1.096185344863322, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 20423 + }, + { + "epoch": 0.20424, + "grad_norm": 0.8554250545045515, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 20424 + }, + { + "epoch": 0.20425, + "grad_norm": 0.7402055273951321, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 20425 + }, + { + "epoch": 0.20426, + "grad_norm": 0.7666717015350778, + "learning_rate": 0.003, + "loss": 4.056, + "step": 20426 + }, + { + "epoch": 0.20427, + "grad_norm": 0.8699294213206213, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 20427 + }, + { + "epoch": 0.20428, + "grad_norm": 1.036872305261095, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 20428 + }, + { + "epoch": 0.20429, + "grad_norm": 1.0979866792095243, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 20429 + }, + { + "epoch": 0.2043, + "grad_norm": 0.7783377055456764, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 20430 + }, + { + "epoch": 0.20431, + "grad_norm": 0.5795730695623161, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 20431 + }, + { + "epoch": 0.20432, + "grad_norm": 0.5990744608085088, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 20432 + }, + { + "epoch": 0.20433, + "grad_norm": 0.8101839593489611, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 20433 + }, + { + "epoch": 0.20434, + "grad_norm": 0.9516803451186477, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 20434 + }, + { + "epoch": 0.20435, + "grad_norm": 0.9590346770043936, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 20435 + }, + { + "epoch": 0.20436, + "grad_norm": 0.8573206563355135, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 20436 + }, + { + "epoch": 0.20437, + "grad_norm": 0.789035545952203, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 20437 + }, + { + "epoch": 0.20438, + "grad_norm": 0.8466330985259529, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 20438 + }, + { + "epoch": 0.20439, + "grad_norm": 0.8619669794560092, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 20439 + }, + { + "epoch": 0.2044, + "grad_norm": 1.0426690094942102, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 20440 + }, + { + "epoch": 0.20441, + "grad_norm": 1.116082469843466, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 20441 + }, + { + "epoch": 0.20442, + "grad_norm": 0.9464521441924234, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 20442 + }, + { + "epoch": 0.20443, + "grad_norm": 0.9802945819296564, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 20443 + }, + { + "epoch": 0.20444, + "grad_norm": 1.0553713145146217, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 20444 + }, + { + "epoch": 0.20445, + "grad_norm": 0.9954685649700676, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 20445 + }, + { + "epoch": 0.20446, + "grad_norm": 0.9494693143464139, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20446 + }, + { + "epoch": 0.20447, + "grad_norm": 0.833211628258708, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 20447 + }, + { + "epoch": 0.20448, + "grad_norm": 0.7456359889608268, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 20448 + }, + { + "epoch": 0.20449, + "grad_norm": 0.8680749039270492, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 20449 + }, + { + "epoch": 0.2045, + "grad_norm": 1.0406741653471223, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 20450 + }, + { + "epoch": 0.20451, + "grad_norm": 1.105849498852683, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 20451 + }, + { + "epoch": 0.20452, + "grad_norm": 0.9190408394695084, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 20452 + }, + { + "epoch": 0.20453, + "grad_norm": 0.7934413235091964, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 20453 + }, + { + "epoch": 0.20454, + "grad_norm": 0.6756722314647762, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20454 + }, + { + "epoch": 0.20455, + "grad_norm": 0.7120986098490817, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 20455 + }, + { + "epoch": 0.20456, + "grad_norm": 0.7610572564116208, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 20456 + }, + { + "epoch": 0.20457, + "grad_norm": 0.7434780570804814, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 20457 + }, + { + "epoch": 0.20458, + "grad_norm": 0.7141520120292778, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 20458 + }, + { + "epoch": 0.20459, + "grad_norm": 0.7490897191636853, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 20459 + }, + { + "epoch": 0.2046, + "grad_norm": 0.7523301229952136, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 20460 + }, + { + "epoch": 0.20461, + "grad_norm": 0.7648327990348492, + "learning_rate": 0.003, + "loss": 4.054, + "step": 20461 + }, + { + "epoch": 0.20462, + "grad_norm": 0.7794387966822662, + "learning_rate": 0.003, + "loss": 4.049, + "step": 20462 + }, + { + "epoch": 0.20463, + "grad_norm": 0.9098949286973006, + "learning_rate": 0.003, + "loss": 4.042, + "step": 20463 + }, + { + "epoch": 0.20464, + "grad_norm": 1.1543422982397333, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 20464 + }, + { + "epoch": 0.20465, + "grad_norm": 1.0639337478565065, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 20465 + }, + { + "epoch": 0.20466, + "grad_norm": 0.9650600560385093, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 20466 + }, + { + "epoch": 0.20467, + "grad_norm": 0.8787347611544989, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 20467 + }, + { + "epoch": 0.20468, + "grad_norm": 0.8979324706387466, + "learning_rate": 0.003, + "loss": 4.067, + "step": 20468 + }, + { + "epoch": 0.20469, + "grad_norm": 0.9565197110482284, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 20469 + }, + { + "epoch": 0.2047, + "grad_norm": 1.001766424582976, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 20470 + }, + { + "epoch": 0.20471, + "grad_norm": 0.905690577277381, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 20471 + }, + { + "epoch": 0.20472, + "grad_norm": 0.8582802231298753, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 20472 + }, + { + "epoch": 0.20473, + "grad_norm": 1.0099994274527537, + "learning_rate": 0.003, + "loss": 4.068, + "step": 20473 + }, + { + "epoch": 0.20474, + "grad_norm": 1.1946003596306265, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 20474 + }, + { + "epoch": 0.20475, + "grad_norm": 0.8619692680160503, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 20475 + }, + { + "epoch": 0.20476, + "grad_norm": 0.9559448796469671, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 20476 + }, + { + "epoch": 0.20477, + "grad_norm": 0.9519911928111551, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 20477 + }, + { + "epoch": 0.20478, + "grad_norm": 1.1173372621957183, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 20478 + }, + { + "epoch": 0.20479, + "grad_norm": 1.060764951724005, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 20479 + }, + { + "epoch": 0.2048, + "grad_norm": 0.9867478296946318, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20480 + }, + { + "epoch": 0.20481, + "grad_norm": 0.8170563073509959, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 20481 + }, + { + "epoch": 0.20482, + "grad_norm": 0.6254490750945555, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 20482 + }, + { + "epoch": 0.20483, + "grad_norm": 0.7207533839917315, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 20483 + }, + { + "epoch": 0.20484, + "grad_norm": 0.7418929333969281, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 20484 + }, + { + "epoch": 0.20485, + "grad_norm": 0.7465021535974078, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 20485 + }, + { + "epoch": 0.20486, + "grad_norm": 0.7769528365946727, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 20486 + }, + { + "epoch": 0.20487, + "grad_norm": 0.8130562766781182, + "learning_rate": 0.003, + "loss": 4.069, + "step": 20487 + }, + { + "epoch": 0.20488, + "grad_norm": 0.983521172208961, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 20488 + }, + { + "epoch": 0.20489, + "grad_norm": 1.0477977456666285, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 20489 + }, + { + "epoch": 0.2049, + "grad_norm": 0.846817393957543, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 20490 + }, + { + "epoch": 0.20491, + "grad_norm": 0.8217040692277149, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 20491 + }, + { + "epoch": 0.20492, + "grad_norm": 0.8152888096564821, + "learning_rate": 0.003, + "loss": 4.054, + "step": 20492 + }, + { + "epoch": 0.20493, + "grad_norm": 0.8988752345047367, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 20493 + }, + { + "epoch": 0.20494, + "grad_norm": 0.8345402495927507, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 20494 + }, + { + "epoch": 0.20495, + "grad_norm": 0.8456013999689221, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 20495 + }, + { + "epoch": 0.20496, + "grad_norm": 0.9663470825220668, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 20496 + }, + { + "epoch": 0.20497, + "grad_norm": 0.9229345421813118, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 20497 + }, + { + "epoch": 0.20498, + "grad_norm": 1.0108427865567675, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 20498 + }, + { + "epoch": 0.20499, + "grad_norm": 1.1721512182378613, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 20499 + }, + { + "epoch": 0.205, + "grad_norm": 1.2164014831498786, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 20500 + }, + { + "epoch": 0.20501, + "grad_norm": 0.7620436039619853, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 20501 + }, + { + "epoch": 0.20502, + "grad_norm": 0.6769871006699449, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 20502 + }, + { + "epoch": 0.20503, + "grad_norm": 0.754603163994319, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 20503 + }, + { + "epoch": 0.20504, + "grad_norm": 0.730669852587436, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 20504 + }, + { + "epoch": 0.20505, + "grad_norm": 0.7129773647468785, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 20505 + }, + { + "epoch": 0.20506, + "grad_norm": 0.6374569852668859, + "learning_rate": 0.003, + "loss": 4.1124, + "step": 20506 + }, + { + "epoch": 0.20507, + "grad_norm": 0.594683379131078, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 20507 + }, + { + "epoch": 0.20508, + "grad_norm": 0.6885703086274533, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 20508 + }, + { + "epoch": 0.20509, + "grad_norm": 0.7470187540028812, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 20509 + }, + { + "epoch": 0.2051, + "grad_norm": 0.6680552314952584, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 20510 + }, + { + "epoch": 0.20511, + "grad_norm": 0.7365770870646581, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 20511 + }, + { + "epoch": 0.20512, + "grad_norm": 1.1206830302869843, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 20512 + }, + { + "epoch": 0.20513, + "grad_norm": 1.3611316699735958, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 20513 + }, + { + "epoch": 0.20514, + "grad_norm": 0.7911528604006666, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 20514 + }, + { + "epoch": 0.20515, + "grad_norm": 0.7922819361015677, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 20515 + }, + { + "epoch": 0.20516, + "grad_norm": 0.8012931274001179, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 20516 + }, + { + "epoch": 0.20517, + "grad_norm": 0.8098025953248225, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 20517 + }, + { + "epoch": 0.20518, + "grad_norm": 0.8653460020279925, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 20518 + }, + { + "epoch": 0.20519, + "grad_norm": 0.9427554891941522, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20519 + }, + { + "epoch": 0.2052, + "grad_norm": 1.228778011738753, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 20520 + }, + { + "epoch": 0.20521, + "grad_norm": 0.8243281934386077, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 20521 + }, + { + "epoch": 0.20522, + "grad_norm": 0.7788285962108208, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 20522 + }, + { + "epoch": 0.20523, + "grad_norm": 0.9339471235267384, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20523 + }, + { + "epoch": 0.20524, + "grad_norm": 1.0396932139833395, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20524 + }, + { + "epoch": 0.20525, + "grad_norm": 1.005402172635994, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 20525 + }, + { + "epoch": 0.20526, + "grad_norm": 1.053508768873512, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 20526 + }, + { + "epoch": 0.20527, + "grad_norm": 1.2508069802693835, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 20527 + }, + { + "epoch": 0.20528, + "grad_norm": 0.8663705990405522, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 20528 + }, + { + "epoch": 0.20529, + "grad_norm": 0.8818611675931979, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 20529 + }, + { + "epoch": 0.2053, + "grad_norm": 0.9200535337245385, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 20530 + }, + { + "epoch": 0.20531, + "grad_norm": 0.9898780949855865, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 20531 + }, + { + "epoch": 0.20532, + "grad_norm": 0.9578834118378134, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 20532 + }, + { + "epoch": 0.20533, + "grad_norm": 0.9052817941059021, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 20533 + }, + { + "epoch": 0.20534, + "grad_norm": 0.779722377291353, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 20534 + }, + { + "epoch": 0.20535, + "grad_norm": 0.7346532313137373, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 20535 + }, + { + "epoch": 0.20536, + "grad_norm": 0.7093779802165369, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 20536 + }, + { + "epoch": 0.20537, + "grad_norm": 0.7930251293994007, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 20537 + }, + { + "epoch": 0.20538, + "grad_norm": 0.8530749088036973, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 20538 + }, + { + "epoch": 0.20539, + "grad_norm": 0.8372602973870564, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20539 + }, + { + "epoch": 0.2054, + "grad_norm": 0.863324991148601, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 20540 + }, + { + "epoch": 0.20541, + "grad_norm": 0.9112910016192655, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 20541 + }, + { + "epoch": 0.20542, + "grad_norm": 0.9433012968095559, + "learning_rate": 0.003, + "loss": 4.1106, + "step": 20542 + }, + { + "epoch": 0.20543, + "grad_norm": 1.0913271019690083, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 20543 + }, + { + "epoch": 0.20544, + "grad_norm": 0.8549544332057314, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 20544 + }, + { + "epoch": 0.20545, + "grad_norm": 0.914582378517783, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 20545 + }, + { + "epoch": 0.20546, + "grad_norm": 0.9273838748901237, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 20546 + }, + { + "epoch": 0.20547, + "grad_norm": 0.8948707723708691, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 20547 + }, + { + "epoch": 0.20548, + "grad_norm": 0.8302326146067596, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 20548 + }, + { + "epoch": 0.20549, + "grad_norm": 0.7696476080082122, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 20549 + }, + { + "epoch": 0.2055, + "grad_norm": 0.8407035755703616, + "learning_rate": 0.003, + "loss": 4.071, + "step": 20550 + }, + { + "epoch": 0.20551, + "grad_norm": 0.9257701985007101, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 20551 + }, + { + "epoch": 0.20552, + "grad_norm": 1.1374062132549967, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 20552 + }, + { + "epoch": 0.20553, + "grad_norm": 1.2789927383290687, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 20553 + }, + { + "epoch": 0.20554, + "grad_norm": 1.00202249688981, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 20554 + }, + { + "epoch": 0.20555, + "grad_norm": 0.9794066785980535, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 20555 + }, + { + "epoch": 0.20556, + "grad_norm": 0.9513503323769013, + "learning_rate": 0.003, + "loss": 4.045, + "step": 20556 + }, + { + "epoch": 0.20557, + "grad_norm": 0.9914266893871104, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 20557 + }, + { + "epoch": 0.20558, + "grad_norm": 0.9730358479260954, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 20558 + }, + { + "epoch": 0.20559, + "grad_norm": 0.9337728936224277, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 20559 + }, + { + "epoch": 0.2056, + "grad_norm": 0.8117965590914733, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 20560 + }, + { + "epoch": 0.20561, + "grad_norm": 0.6904545134598877, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 20561 + }, + { + "epoch": 0.20562, + "grad_norm": 0.6025300536075463, + "learning_rate": 0.003, + "loss": 4.057, + "step": 20562 + }, + { + "epoch": 0.20563, + "grad_norm": 0.6224085371276316, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 20563 + }, + { + "epoch": 0.20564, + "grad_norm": 0.6953086951954939, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 20564 + }, + { + "epoch": 0.20565, + "grad_norm": 0.8842491341529575, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 20565 + }, + { + "epoch": 0.20566, + "grad_norm": 1.1292077988719111, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 20566 + }, + { + "epoch": 0.20567, + "grad_norm": 0.9055261812823389, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 20567 + }, + { + "epoch": 0.20568, + "grad_norm": 0.8591778891675724, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 20568 + }, + { + "epoch": 0.20569, + "grad_norm": 0.7893300512642796, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 20569 + }, + { + "epoch": 0.2057, + "grad_norm": 0.7695695498222233, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 20570 + }, + { + "epoch": 0.20571, + "grad_norm": 0.7953962449919407, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 20571 + }, + { + "epoch": 0.20572, + "grad_norm": 0.7986956299090745, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 20572 + }, + { + "epoch": 0.20573, + "grad_norm": 0.7623400127898218, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 20573 + }, + { + "epoch": 0.20574, + "grad_norm": 0.8318733489796745, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20574 + }, + { + "epoch": 0.20575, + "grad_norm": 0.8858423352603446, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 20575 + }, + { + "epoch": 0.20576, + "grad_norm": 0.8172688016100488, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 20576 + }, + { + "epoch": 0.20577, + "grad_norm": 0.7901722138410501, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 20577 + }, + { + "epoch": 0.20578, + "grad_norm": 0.9796718420688237, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 20578 + }, + { + "epoch": 0.20579, + "grad_norm": 1.2726068969477138, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 20579 + }, + { + "epoch": 0.2058, + "grad_norm": 0.9736309551465265, + "learning_rate": 0.003, + "loss": 4.074, + "step": 20580 + }, + { + "epoch": 0.20581, + "grad_norm": 1.0251965502061382, + "learning_rate": 0.003, + "loss": 4.052, + "step": 20581 + }, + { + "epoch": 0.20582, + "grad_norm": 0.9175721249187118, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 20582 + }, + { + "epoch": 0.20583, + "grad_norm": 1.0244517007583627, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 20583 + }, + { + "epoch": 0.20584, + "grad_norm": 1.0322391854094188, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 20584 + }, + { + "epoch": 0.20585, + "grad_norm": 1.0344090146461153, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 20585 + }, + { + "epoch": 0.20586, + "grad_norm": 0.9806018690955354, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 20586 + }, + { + "epoch": 0.20587, + "grad_norm": 0.7852323290753973, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 20587 + }, + { + "epoch": 0.20588, + "grad_norm": 0.7715173013911564, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 20588 + }, + { + "epoch": 0.20589, + "grad_norm": 0.6826629659862694, + "learning_rate": 0.003, + "loss": 4.047, + "step": 20589 + }, + { + "epoch": 0.2059, + "grad_norm": 0.7476244055476694, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20590 + }, + { + "epoch": 0.20591, + "grad_norm": 0.6936211816688176, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 20591 + }, + { + "epoch": 0.20592, + "grad_norm": 0.7362555338656742, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 20592 + }, + { + "epoch": 0.20593, + "grad_norm": 0.7607218171203771, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 20593 + }, + { + "epoch": 0.20594, + "grad_norm": 0.7840412765664337, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20594 + }, + { + "epoch": 0.20595, + "grad_norm": 0.8361335106376782, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 20595 + }, + { + "epoch": 0.20596, + "grad_norm": 1.077492727067051, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 20596 + }, + { + "epoch": 0.20597, + "grad_norm": 1.3908877765779695, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 20597 + }, + { + "epoch": 0.20598, + "grad_norm": 0.7605470713242518, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 20598 + }, + { + "epoch": 0.20599, + "grad_norm": 0.6904481401622857, + "learning_rate": 0.003, + "loss": 4.1014, + "step": 20599 + }, + { + "epoch": 0.206, + "grad_norm": 0.6344621316372291, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 20600 + }, + { + "epoch": 0.20601, + "grad_norm": 0.6366942136686657, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 20601 + }, + { + "epoch": 0.20602, + "grad_norm": 0.6067438234087, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 20602 + }, + { + "epoch": 0.20603, + "grad_norm": 0.7246511436585634, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 20603 + }, + { + "epoch": 0.20604, + "grad_norm": 1.0320882304582961, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20604 + }, + { + "epoch": 0.20605, + "grad_norm": 1.2606764251968554, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20605 + }, + { + "epoch": 0.20606, + "grad_norm": 0.7289654659007048, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 20606 + }, + { + "epoch": 0.20607, + "grad_norm": 0.718277453279945, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 20607 + }, + { + "epoch": 0.20608, + "grad_norm": 0.790622781305276, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 20608 + }, + { + "epoch": 0.20609, + "grad_norm": 0.9594703957779677, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 20609 + }, + { + "epoch": 0.2061, + "grad_norm": 1.2055963398714216, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 20610 + }, + { + "epoch": 0.20611, + "grad_norm": 1.019951400337982, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 20611 + }, + { + "epoch": 0.20612, + "grad_norm": 1.0161482326347722, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 20612 + }, + { + "epoch": 0.20613, + "grad_norm": 0.9709674734422439, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 20613 + }, + { + "epoch": 0.20614, + "grad_norm": 0.9265254257637886, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 20614 + }, + { + "epoch": 0.20615, + "grad_norm": 0.9423442343289178, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 20615 + }, + { + "epoch": 0.20616, + "grad_norm": 1.0135090207025705, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 20616 + }, + { + "epoch": 0.20617, + "grad_norm": 1.1006216032314815, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 20617 + }, + { + "epoch": 0.20618, + "grad_norm": 0.954842336561467, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 20618 + }, + { + "epoch": 0.20619, + "grad_norm": 0.7835498651032616, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 20619 + }, + { + "epoch": 0.2062, + "grad_norm": 0.807016408459512, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 20620 + }, + { + "epoch": 0.20621, + "grad_norm": 1.0172214859491475, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 20621 + }, + { + "epoch": 0.20622, + "grad_norm": 1.2010884728781617, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 20622 + }, + { + "epoch": 0.20623, + "grad_norm": 0.9414728170219171, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 20623 + }, + { + "epoch": 0.20624, + "grad_norm": 1.0085687448405847, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 20624 + }, + { + "epoch": 0.20625, + "grad_norm": 0.9953683575782653, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 20625 + }, + { + "epoch": 0.20626, + "grad_norm": 0.9928068630689973, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 20626 + }, + { + "epoch": 0.20627, + "grad_norm": 0.9844217324248021, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 20627 + }, + { + "epoch": 0.20628, + "grad_norm": 1.0190362124410843, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 20628 + }, + { + "epoch": 0.20629, + "grad_norm": 0.9439978860033111, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 20629 + }, + { + "epoch": 0.2063, + "grad_norm": 0.9253956720686918, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 20630 + }, + { + "epoch": 0.20631, + "grad_norm": 1.0515591053619255, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20631 + }, + { + "epoch": 0.20632, + "grad_norm": 1.023671020246924, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 20632 + }, + { + "epoch": 0.20633, + "grad_norm": 0.86648143766903, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 20633 + }, + { + "epoch": 0.20634, + "grad_norm": 0.8333367278901644, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 20634 + }, + { + "epoch": 0.20635, + "grad_norm": 0.8647866587051434, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 20635 + }, + { + "epoch": 0.20636, + "grad_norm": 1.020750856654751, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 20636 + }, + { + "epoch": 0.20637, + "grad_norm": 1.204925371161771, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 20637 + }, + { + "epoch": 0.20638, + "grad_norm": 0.7568887978599166, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 20638 + }, + { + "epoch": 0.20639, + "grad_norm": 0.7852977782044938, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 20639 + }, + { + "epoch": 0.2064, + "grad_norm": 0.8388052677801658, + "learning_rate": 0.003, + "loss": 4.073, + "step": 20640 + }, + { + "epoch": 0.20641, + "grad_norm": 0.891758605286488, + "learning_rate": 0.003, + "loss": 4.059, + "step": 20641 + }, + { + "epoch": 0.20642, + "grad_norm": 0.9364290039788794, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 20642 + }, + { + "epoch": 0.20643, + "grad_norm": 1.1980112051647338, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20643 + }, + { + "epoch": 0.20644, + "grad_norm": 1.0793532517196802, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 20644 + }, + { + "epoch": 0.20645, + "grad_norm": 0.965015504488339, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 20645 + }, + { + "epoch": 0.20646, + "grad_norm": 1.0813336793108117, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 20646 + }, + { + "epoch": 0.20647, + "grad_norm": 0.9981493057543689, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 20647 + }, + { + "epoch": 0.20648, + "grad_norm": 0.8428348987809836, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 20648 + }, + { + "epoch": 0.20649, + "grad_norm": 0.6849644189236285, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 20649 + }, + { + "epoch": 0.2065, + "grad_norm": 0.6017190142529987, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 20650 + }, + { + "epoch": 0.20651, + "grad_norm": 0.6250684217321145, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 20651 + }, + { + "epoch": 0.20652, + "grad_norm": 0.6188968935601046, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 20652 + }, + { + "epoch": 0.20653, + "grad_norm": 0.5363667060974162, + "learning_rate": 0.003, + "loss": 4.046, + "step": 20653 + }, + { + "epoch": 0.20654, + "grad_norm": 0.6347384871845065, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 20654 + }, + { + "epoch": 0.20655, + "grad_norm": 0.8032137065026761, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 20655 + }, + { + "epoch": 0.20656, + "grad_norm": 1.0406968752417656, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 20656 + }, + { + "epoch": 0.20657, + "grad_norm": 1.0931001468798138, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 20657 + }, + { + "epoch": 0.20658, + "grad_norm": 0.8447870136672336, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 20658 + }, + { + "epoch": 0.20659, + "grad_norm": 0.741803508048347, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 20659 + }, + { + "epoch": 0.2066, + "grad_norm": 0.7609061056370786, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 20660 + }, + { + "epoch": 0.20661, + "grad_norm": 0.8243425693126477, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 20661 + }, + { + "epoch": 0.20662, + "grad_norm": 0.8062762459485772, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 20662 + }, + { + "epoch": 0.20663, + "grad_norm": 0.9261261062417931, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 20663 + }, + { + "epoch": 0.20664, + "grad_norm": 1.2020025510664534, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 20664 + }, + { + "epoch": 0.20665, + "grad_norm": 0.9958478965450076, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 20665 + }, + { + "epoch": 0.20666, + "grad_norm": 0.9435912414102534, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 20666 + }, + { + "epoch": 0.20667, + "grad_norm": 0.7709152620398154, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 20667 + }, + { + "epoch": 0.20668, + "grad_norm": 0.6865848655986527, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 20668 + }, + { + "epoch": 0.20669, + "grad_norm": 0.7534612856236368, + "learning_rate": 0.003, + "loss": 4.016, + "step": 20669 + }, + { + "epoch": 0.2067, + "grad_norm": 0.7970881081505984, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 20670 + }, + { + "epoch": 0.20671, + "grad_norm": 0.9914107819593648, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20671 + }, + { + "epoch": 0.20672, + "grad_norm": 1.4812493275344005, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 20672 + }, + { + "epoch": 0.20673, + "grad_norm": 0.7589968412493799, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 20673 + }, + { + "epoch": 0.20674, + "grad_norm": 0.7745651088422716, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 20674 + }, + { + "epoch": 0.20675, + "grad_norm": 1.045882241928847, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 20675 + }, + { + "epoch": 0.20676, + "grad_norm": 1.2303870064394389, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 20676 + }, + { + "epoch": 0.20677, + "grad_norm": 0.7953019631338488, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 20677 + }, + { + "epoch": 0.20678, + "grad_norm": 0.738991276268295, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 20678 + }, + { + "epoch": 0.20679, + "grad_norm": 0.845274045959646, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 20679 + }, + { + "epoch": 0.2068, + "grad_norm": 0.8769337009639483, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 20680 + }, + { + "epoch": 0.20681, + "grad_norm": 0.9376339161457683, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 20681 + }, + { + "epoch": 0.20682, + "grad_norm": 0.911274612007686, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 20682 + }, + { + "epoch": 0.20683, + "grad_norm": 0.9139237733288875, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 20683 + }, + { + "epoch": 0.20684, + "grad_norm": 0.9076066287361105, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 20684 + }, + { + "epoch": 0.20685, + "grad_norm": 1.1126390912737316, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 20685 + }, + { + "epoch": 0.20686, + "grad_norm": 1.020779907172978, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 20686 + }, + { + "epoch": 0.20687, + "grad_norm": 0.8989671962564679, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 20687 + }, + { + "epoch": 0.20688, + "grad_norm": 0.9025550399033526, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 20688 + }, + { + "epoch": 0.20689, + "grad_norm": 0.9019164380921044, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 20689 + }, + { + "epoch": 0.2069, + "grad_norm": 0.953989204410349, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 20690 + }, + { + "epoch": 0.20691, + "grad_norm": 0.9263957419699532, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 20691 + }, + { + "epoch": 0.20692, + "grad_norm": 1.024185703940695, + "learning_rate": 0.003, + "loss": 4.08, + "step": 20692 + }, + { + "epoch": 0.20693, + "grad_norm": 1.1113324609358632, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 20693 + }, + { + "epoch": 0.20694, + "grad_norm": 1.0667487243293525, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20694 + }, + { + "epoch": 0.20695, + "grad_norm": 1.0667691079896835, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 20695 + }, + { + "epoch": 0.20696, + "grad_norm": 0.9645752396874048, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 20696 + }, + { + "epoch": 0.20697, + "grad_norm": 0.8993716925969667, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 20697 + }, + { + "epoch": 0.20698, + "grad_norm": 0.7341038316597985, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 20698 + }, + { + "epoch": 0.20699, + "grad_norm": 0.7079950792544272, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 20699 + }, + { + "epoch": 0.207, + "grad_norm": 0.6393011363260942, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 20700 + }, + { + "epoch": 0.20701, + "grad_norm": 0.6529282234544391, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 20701 + }, + { + "epoch": 0.20702, + "grad_norm": 0.7591582713524963, + "learning_rate": 0.003, + "loss": 4.071, + "step": 20702 + }, + { + "epoch": 0.20703, + "grad_norm": 0.8399027560375545, + "learning_rate": 0.003, + "loss": 4.039, + "step": 20703 + }, + { + "epoch": 0.20704, + "grad_norm": 1.0229046265192852, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 20704 + }, + { + "epoch": 0.20705, + "grad_norm": 1.2528591896392958, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 20705 + }, + { + "epoch": 0.20706, + "grad_norm": 0.7339260783515749, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 20706 + }, + { + "epoch": 0.20707, + "grad_norm": 0.6933835542923777, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 20707 + }, + { + "epoch": 0.20708, + "grad_norm": 0.7133414719199157, + "learning_rate": 0.003, + "loss": 4.066, + "step": 20708 + }, + { + "epoch": 0.20709, + "grad_norm": 0.6871709880104687, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 20709 + }, + { + "epoch": 0.2071, + "grad_norm": 0.7342594176801578, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 20710 + }, + { + "epoch": 0.20711, + "grad_norm": 0.809280070894089, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 20711 + }, + { + "epoch": 0.20712, + "grad_norm": 0.9073706527628451, + "learning_rate": 0.003, + "loss": 4.071, + "step": 20712 + }, + { + "epoch": 0.20713, + "grad_norm": 0.843087423365037, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 20713 + }, + { + "epoch": 0.20714, + "grad_norm": 0.9121406100229121, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 20714 + }, + { + "epoch": 0.20715, + "grad_norm": 1.1185977770452542, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 20715 + }, + { + "epoch": 0.20716, + "grad_norm": 0.9665679098861336, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 20716 + }, + { + "epoch": 0.20717, + "grad_norm": 1.0138902289609497, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 20717 + }, + { + "epoch": 0.20718, + "grad_norm": 1.0341541210214493, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 20718 + }, + { + "epoch": 0.20719, + "grad_norm": 0.909857907822015, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 20719 + }, + { + "epoch": 0.2072, + "grad_norm": 0.883878385228263, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 20720 + }, + { + "epoch": 0.20721, + "grad_norm": 0.9663190029626658, + "learning_rate": 0.003, + "loss": 4.0919, + "step": 20721 + }, + { + "epoch": 0.20722, + "grad_norm": 0.9569429956011049, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 20722 + }, + { + "epoch": 0.20723, + "grad_norm": 0.7961455675081555, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 20723 + }, + { + "epoch": 0.20724, + "grad_norm": 0.7822988601781593, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 20724 + }, + { + "epoch": 0.20725, + "grad_norm": 1.0356323599931876, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 20725 + }, + { + "epoch": 0.20726, + "grad_norm": 1.0625122756479055, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 20726 + }, + { + "epoch": 0.20727, + "grad_norm": 0.9313918687891721, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 20727 + }, + { + "epoch": 0.20728, + "grad_norm": 1.0000886729416245, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20728 + }, + { + "epoch": 0.20729, + "grad_norm": 1.0193101696864726, + "learning_rate": 0.003, + "loss": 4.044, + "step": 20729 + }, + { + "epoch": 0.2073, + "grad_norm": 1.0133609674621558, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 20730 + }, + { + "epoch": 0.20731, + "grad_norm": 0.9585117508858823, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 20731 + }, + { + "epoch": 0.20732, + "grad_norm": 0.940756485622367, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 20732 + }, + { + "epoch": 0.20733, + "grad_norm": 0.9786370663315664, + "learning_rate": 0.003, + "loss": 4.044, + "step": 20733 + }, + { + "epoch": 0.20734, + "grad_norm": 0.8616271325318224, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 20734 + }, + { + "epoch": 0.20735, + "grad_norm": 0.7615164854628359, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 20735 + }, + { + "epoch": 0.20736, + "grad_norm": 0.7941263273306256, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 20736 + }, + { + "epoch": 0.20737, + "grad_norm": 0.9969172132139513, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 20737 + }, + { + "epoch": 0.20738, + "grad_norm": 1.5023736698642578, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 20738 + }, + { + "epoch": 0.20739, + "grad_norm": 0.9762028808361444, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 20739 + }, + { + "epoch": 0.2074, + "grad_norm": 1.1390590029131378, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 20740 + }, + { + "epoch": 0.20741, + "grad_norm": 1.0108577589696488, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 20741 + }, + { + "epoch": 0.20742, + "grad_norm": 0.8889441424104167, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 20742 + }, + { + "epoch": 0.20743, + "grad_norm": 0.9820342624142727, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 20743 + }, + { + "epoch": 0.20744, + "grad_norm": 1.2376835630873302, + "learning_rate": 0.003, + "loss": 4.1249, + "step": 20744 + }, + { + "epoch": 0.20745, + "grad_norm": 0.9350354190161171, + "learning_rate": 0.003, + "loss": 4.065, + "step": 20745 + }, + { + "epoch": 0.20746, + "grad_norm": 0.8964962553317527, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 20746 + }, + { + "epoch": 0.20747, + "grad_norm": 0.8783792688798422, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 20747 + }, + { + "epoch": 0.20748, + "grad_norm": 0.9841656727166727, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 20748 + }, + { + "epoch": 0.20749, + "grad_norm": 1.1104386965056001, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 20749 + }, + { + "epoch": 0.2075, + "grad_norm": 1.094278241959975, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 20750 + }, + { + "epoch": 0.20751, + "grad_norm": 1.1708732060101759, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 20751 + }, + { + "epoch": 0.20752, + "grad_norm": 0.9256835316651215, + "learning_rate": 0.003, + "loss": 4.1125, + "step": 20752 + }, + { + "epoch": 0.20753, + "grad_norm": 0.8984418312953798, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 20753 + }, + { + "epoch": 0.20754, + "grad_norm": 0.9859929187322075, + "learning_rate": 0.003, + "loss": 4.085, + "step": 20754 + }, + { + "epoch": 0.20755, + "grad_norm": 1.0246863446222312, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 20755 + }, + { + "epoch": 0.20756, + "grad_norm": 0.9704673786656147, + "learning_rate": 0.003, + "loss": 4.076, + "step": 20756 + }, + { + "epoch": 0.20757, + "grad_norm": 1.060174963756663, + "learning_rate": 0.003, + "loss": 4.085, + "step": 20757 + }, + { + "epoch": 0.20758, + "grad_norm": 1.1987580364879207, + "learning_rate": 0.003, + "loss": 4.083, + "step": 20758 + }, + { + "epoch": 0.20759, + "grad_norm": 1.008741287140148, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 20759 + }, + { + "epoch": 0.2076, + "grad_norm": 0.9424250600937093, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 20760 + }, + { + "epoch": 0.20761, + "grad_norm": 1.0289292288640246, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 20761 + }, + { + "epoch": 0.20762, + "grad_norm": 0.9522234811529363, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 20762 + }, + { + "epoch": 0.20763, + "grad_norm": 1.0821446055005235, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20763 + }, + { + "epoch": 0.20764, + "grad_norm": 1.2069188426787005, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 20764 + }, + { + "epoch": 0.20765, + "grad_norm": 1.0002832818009721, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 20765 + }, + { + "epoch": 0.20766, + "grad_norm": 1.0027780458531717, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 20766 + }, + { + "epoch": 0.20767, + "grad_norm": 1.0986168857699, + "learning_rate": 0.003, + "loss": 4.0996, + "step": 20767 + }, + { + "epoch": 0.20768, + "grad_norm": 0.9867147085182186, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 20768 + }, + { + "epoch": 0.20769, + "grad_norm": 0.9468765183760098, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 20769 + }, + { + "epoch": 0.2077, + "grad_norm": 0.8015486553042366, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 20770 + }, + { + "epoch": 0.20771, + "grad_norm": 0.8833278320256701, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 20771 + }, + { + "epoch": 0.20772, + "grad_norm": 0.9759831595758212, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 20772 + }, + { + "epoch": 0.20773, + "grad_norm": 0.9466826472540202, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 20773 + }, + { + "epoch": 0.20774, + "grad_norm": 0.9070859077420977, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 20774 + }, + { + "epoch": 0.20775, + "grad_norm": 0.9272357848830094, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 20775 + }, + { + "epoch": 0.20776, + "grad_norm": 0.8951049696125486, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 20776 + }, + { + "epoch": 0.20777, + "grad_norm": 0.8373072634772573, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 20777 + }, + { + "epoch": 0.20778, + "grad_norm": 0.8185398936510428, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 20778 + }, + { + "epoch": 0.20779, + "grad_norm": 0.8774021381997962, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20779 + }, + { + "epoch": 0.2078, + "grad_norm": 0.9529960509548837, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 20780 + }, + { + "epoch": 0.20781, + "grad_norm": 0.8872476092655752, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 20781 + }, + { + "epoch": 0.20782, + "grad_norm": 0.7204830834769962, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20782 + }, + { + "epoch": 0.20783, + "grad_norm": 0.6801215653087698, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 20783 + }, + { + "epoch": 0.20784, + "grad_norm": 0.7711327109396316, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 20784 + }, + { + "epoch": 0.20785, + "grad_norm": 0.961085338661194, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 20785 + }, + { + "epoch": 0.20786, + "grad_norm": 1.2514225253609057, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 20786 + }, + { + "epoch": 0.20787, + "grad_norm": 0.7618441745477507, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 20787 + }, + { + "epoch": 0.20788, + "grad_norm": 0.7210302510964424, + "learning_rate": 0.003, + "loss": 4.03, + "step": 20788 + }, + { + "epoch": 0.20789, + "grad_norm": 0.8015987605300772, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 20789 + }, + { + "epoch": 0.2079, + "grad_norm": 0.8358797712113142, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 20790 + }, + { + "epoch": 0.20791, + "grad_norm": 0.7711390563496814, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 20791 + }, + { + "epoch": 0.20792, + "grad_norm": 0.7373600734332294, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 20792 + }, + { + "epoch": 0.20793, + "grad_norm": 0.8648692180455652, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 20793 + }, + { + "epoch": 0.20794, + "grad_norm": 1.1045574010554873, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 20794 + }, + { + "epoch": 0.20795, + "grad_norm": 0.8706162549835064, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 20795 + }, + { + "epoch": 0.20796, + "grad_norm": 0.7991366020970061, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 20796 + }, + { + "epoch": 0.20797, + "grad_norm": 0.8266699540161978, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 20797 + }, + { + "epoch": 0.20798, + "grad_norm": 0.9295114312035573, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 20798 + }, + { + "epoch": 0.20799, + "grad_norm": 0.9722309746197256, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 20799 + }, + { + "epoch": 0.208, + "grad_norm": 1.0150228422407528, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 20800 + }, + { + "epoch": 0.20801, + "grad_norm": 0.9434807998329356, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 20801 + }, + { + "epoch": 0.20802, + "grad_norm": 0.8426632504188805, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 20802 + }, + { + "epoch": 0.20803, + "grad_norm": 0.7962692959280201, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 20803 + }, + { + "epoch": 0.20804, + "grad_norm": 0.8521264590982106, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 20804 + }, + { + "epoch": 0.20805, + "grad_norm": 0.9657034625579123, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 20805 + }, + { + "epoch": 0.20806, + "grad_norm": 1.1519085946825933, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 20806 + }, + { + "epoch": 0.20807, + "grad_norm": 0.8500282300755746, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 20807 + }, + { + "epoch": 0.20808, + "grad_norm": 0.8156829967817358, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 20808 + }, + { + "epoch": 0.20809, + "grad_norm": 0.7827086334718794, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 20809 + }, + { + "epoch": 0.2081, + "grad_norm": 0.6837126100099973, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 20810 + }, + { + "epoch": 0.20811, + "grad_norm": 0.6619164480276214, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 20811 + }, + { + "epoch": 0.20812, + "grad_norm": 0.6699766058255511, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 20812 + }, + { + "epoch": 0.20813, + "grad_norm": 0.8790057035161044, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 20813 + }, + { + "epoch": 0.20814, + "grad_norm": 1.0609515662283509, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 20814 + }, + { + "epoch": 0.20815, + "grad_norm": 0.9883373278839427, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20815 + }, + { + "epoch": 0.20816, + "grad_norm": 1.0900471224543256, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 20816 + }, + { + "epoch": 0.20817, + "grad_norm": 0.8779952327281877, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 20817 + }, + { + "epoch": 0.20818, + "grad_norm": 0.7728878959603992, + "learning_rate": 0.003, + "loss": 4.086, + "step": 20818 + }, + { + "epoch": 0.20819, + "grad_norm": 0.7201337922741882, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 20819 + }, + { + "epoch": 0.2082, + "grad_norm": 0.7395626855942592, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 20820 + }, + { + "epoch": 0.20821, + "grad_norm": 0.7683142880475228, + "learning_rate": 0.003, + "loss": 4.032, + "step": 20821 + }, + { + "epoch": 0.20822, + "grad_norm": 0.8218967880588436, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 20822 + }, + { + "epoch": 0.20823, + "grad_norm": 0.834345833872554, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 20823 + }, + { + "epoch": 0.20824, + "grad_norm": 0.7987532170222379, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 20824 + }, + { + "epoch": 0.20825, + "grad_norm": 0.8198056489592254, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 20825 + }, + { + "epoch": 0.20826, + "grad_norm": 0.885014579488387, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 20826 + }, + { + "epoch": 0.20827, + "grad_norm": 0.9228575617691458, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 20827 + }, + { + "epoch": 0.20828, + "grad_norm": 0.8477871640540304, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 20828 + }, + { + "epoch": 0.20829, + "grad_norm": 0.9524373852750858, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 20829 + }, + { + "epoch": 0.2083, + "grad_norm": 1.1650023972190342, + "learning_rate": 0.003, + "loss": 4.044, + "step": 20830 + }, + { + "epoch": 0.20831, + "grad_norm": 0.8958374467271202, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 20831 + }, + { + "epoch": 0.20832, + "grad_norm": 0.7579417270438731, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 20832 + }, + { + "epoch": 0.20833, + "grad_norm": 0.6654744014024893, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 20833 + }, + { + "epoch": 0.20834, + "grad_norm": 0.7169839141950568, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 20834 + }, + { + "epoch": 0.20835, + "grad_norm": 0.9159520675068701, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 20835 + }, + { + "epoch": 0.20836, + "grad_norm": 0.8988485140413005, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 20836 + }, + { + "epoch": 0.20837, + "grad_norm": 0.8700377997356233, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 20837 + }, + { + "epoch": 0.20838, + "grad_norm": 1.0247823704201304, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 20838 + }, + { + "epoch": 0.20839, + "grad_norm": 1.2176239640475501, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 20839 + }, + { + "epoch": 0.2084, + "grad_norm": 0.8474460656441073, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 20840 + }, + { + "epoch": 0.20841, + "grad_norm": 0.7247828273171255, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 20841 + }, + { + "epoch": 0.20842, + "grad_norm": 0.6973333704945514, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 20842 + }, + { + "epoch": 0.20843, + "grad_norm": 0.6930687854800611, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 20843 + }, + { + "epoch": 0.20844, + "grad_norm": 0.7565297191579194, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 20844 + }, + { + "epoch": 0.20845, + "grad_norm": 0.8720648821399303, + "learning_rate": 0.003, + "loss": 4.056, + "step": 20845 + }, + { + "epoch": 0.20846, + "grad_norm": 1.1494438524283832, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 20846 + }, + { + "epoch": 0.20847, + "grad_norm": 1.0470518786789225, + "learning_rate": 0.003, + "loss": 4.085, + "step": 20847 + }, + { + "epoch": 0.20848, + "grad_norm": 1.0835417349275442, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 20848 + }, + { + "epoch": 0.20849, + "grad_norm": 0.940728512493772, + "learning_rate": 0.003, + "loss": 4.052, + "step": 20849 + }, + { + "epoch": 0.2085, + "grad_norm": 0.7481177236998657, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 20850 + }, + { + "epoch": 0.20851, + "grad_norm": 0.6206226004147564, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 20851 + }, + { + "epoch": 0.20852, + "grad_norm": 0.5888691845170154, + "learning_rate": 0.003, + "loss": 4.052, + "step": 20852 + }, + { + "epoch": 0.20853, + "grad_norm": 0.6709893240010038, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 20853 + }, + { + "epoch": 0.20854, + "grad_norm": 0.9154878899593396, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 20854 + }, + { + "epoch": 0.20855, + "grad_norm": 1.348181078472298, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 20855 + }, + { + "epoch": 0.20856, + "grad_norm": 0.533570289649261, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 20856 + }, + { + "epoch": 0.20857, + "grad_norm": 0.8123699800950966, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 20857 + }, + { + "epoch": 0.20858, + "grad_norm": 1.2357741301996246, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 20858 + }, + { + "epoch": 0.20859, + "grad_norm": 0.5919763432136748, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 20859 + }, + { + "epoch": 0.2086, + "grad_norm": 0.6796295592904262, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 20860 + }, + { + "epoch": 0.20861, + "grad_norm": 0.8091425758358133, + "learning_rate": 0.003, + "loss": 4.044, + "step": 20861 + }, + { + "epoch": 0.20862, + "grad_norm": 0.8879189719290468, + "learning_rate": 0.003, + "loss": 4.04, + "step": 20862 + }, + { + "epoch": 0.20863, + "grad_norm": 0.9195269575657935, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 20863 + }, + { + "epoch": 0.20864, + "grad_norm": 0.9066008738921145, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 20864 + }, + { + "epoch": 0.20865, + "grad_norm": 0.8895437059425502, + "learning_rate": 0.003, + "loss": 4.057, + "step": 20865 + }, + { + "epoch": 0.20866, + "grad_norm": 0.9063383514726553, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 20866 + }, + { + "epoch": 0.20867, + "grad_norm": 1.0377430733789423, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 20867 + }, + { + "epoch": 0.20868, + "grad_norm": 0.8914793363756183, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 20868 + }, + { + "epoch": 0.20869, + "grad_norm": 0.8682923363415345, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 20869 + }, + { + "epoch": 0.2087, + "grad_norm": 0.9380749540622735, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 20870 + }, + { + "epoch": 0.20871, + "grad_norm": 0.8936449518543564, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 20871 + }, + { + "epoch": 0.20872, + "grad_norm": 0.9068738242848167, + "learning_rate": 0.003, + "loss": 4.0966, + "step": 20872 + }, + { + "epoch": 0.20873, + "grad_norm": 0.9227432132140321, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 20873 + }, + { + "epoch": 0.20874, + "grad_norm": 0.9128861571901247, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 20874 + }, + { + "epoch": 0.20875, + "grad_norm": 0.8761302133314882, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 20875 + }, + { + "epoch": 0.20876, + "grad_norm": 0.8366480631194192, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 20876 + }, + { + "epoch": 0.20877, + "grad_norm": 0.855363742188142, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 20877 + }, + { + "epoch": 0.20878, + "grad_norm": 0.7831765518582599, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 20878 + }, + { + "epoch": 0.20879, + "grad_norm": 0.8521269894108455, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 20879 + }, + { + "epoch": 0.2088, + "grad_norm": 0.9035866348006848, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 20880 + }, + { + "epoch": 0.20881, + "grad_norm": 0.9265370826142766, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 20881 + }, + { + "epoch": 0.20882, + "grad_norm": 1.0037908592074307, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 20882 + }, + { + "epoch": 0.20883, + "grad_norm": 0.968690414016621, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 20883 + }, + { + "epoch": 0.20884, + "grad_norm": 1.1660468868596874, + "learning_rate": 0.003, + "loss": 4.1004, + "step": 20884 + }, + { + "epoch": 0.20885, + "grad_norm": 1.058396478086012, + "learning_rate": 0.003, + "loss": 4.066, + "step": 20885 + }, + { + "epoch": 0.20886, + "grad_norm": 0.9555840750202294, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 20886 + }, + { + "epoch": 0.20887, + "grad_norm": 1.1188187594505572, + "learning_rate": 0.003, + "loss": 4.1098, + "step": 20887 + }, + { + "epoch": 0.20888, + "grad_norm": 0.9444405945730495, + "learning_rate": 0.003, + "loss": 4.086, + "step": 20888 + }, + { + "epoch": 0.20889, + "grad_norm": 0.9386917222804859, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 20889 + }, + { + "epoch": 0.2089, + "grad_norm": 0.8123897004406706, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 20890 + }, + { + "epoch": 0.20891, + "grad_norm": 0.7839225369108362, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 20891 + }, + { + "epoch": 0.20892, + "grad_norm": 0.8935177672635446, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 20892 + }, + { + "epoch": 0.20893, + "grad_norm": 0.9836856323206956, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 20893 + }, + { + "epoch": 0.20894, + "grad_norm": 1.1754153858999739, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 20894 + }, + { + "epoch": 0.20895, + "grad_norm": 0.7068288140116837, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 20895 + }, + { + "epoch": 0.20896, + "grad_norm": 0.6449442557927029, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 20896 + }, + { + "epoch": 0.20897, + "grad_norm": 0.6398501563580534, + "learning_rate": 0.003, + "loss": 4.045, + "step": 20897 + }, + { + "epoch": 0.20898, + "grad_norm": 0.5404930010042057, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 20898 + }, + { + "epoch": 0.20899, + "grad_norm": 0.5655871736814144, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 20899 + }, + { + "epoch": 0.209, + "grad_norm": 0.6020237435608026, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 20900 + }, + { + "epoch": 0.20901, + "grad_norm": 0.7601498429681562, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 20901 + }, + { + "epoch": 0.20902, + "grad_norm": 0.9586553021196883, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 20902 + }, + { + "epoch": 0.20903, + "grad_norm": 1.1292970133758855, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 20903 + }, + { + "epoch": 0.20904, + "grad_norm": 0.9060190545624713, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 20904 + }, + { + "epoch": 0.20905, + "grad_norm": 0.8036263350051671, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 20905 + }, + { + "epoch": 0.20906, + "grad_norm": 0.8514165614844775, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 20906 + }, + { + "epoch": 0.20907, + "grad_norm": 0.8891648021569436, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 20907 + }, + { + "epoch": 0.20908, + "grad_norm": 0.9041538277723808, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 20908 + }, + { + "epoch": 0.20909, + "grad_norm": 0.9657488087980987, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 20909 + }, + { + "epoch": 0.2091, + "grad_norm": 1.0294279686468277, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 20910 + }, + { + "epoch": 0.20911, + "grad_norm": 1.1272615296740602, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 20911 + }, + { + "epoch": 0.20912, + "grad_norm": 0.96115295435553, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 20912 + }, + { + "epoch": 0.20913, + "grad_norm": 0.867637661574733, + "learning_rate": 0.003, + "loss": 4.069, + "step": 20913 + }, + { + "epoch": 0.20914, + "grad_norm": 0.8872088685369496, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 20914 + }, + { + "epoch": 0.20915, + "grad_norm": 0.8862017167689719, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 20915 + }, + { + "epoch": 0.20916, + "grad_norm": 0.764827789701457, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 20916 + }, + { + "epoch": 0.20917, + "grad_norm": 0.6561755595633655, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 20917 + }, + { + "epoch": 0.20918, + "grad_norm": 0.6362220571951497, + "learning_rate": 0.003, + "loss": 4.051, + "step": 20918 + }, + { + "epoch": 0.20919, + "grad_norm": 0.760983641615835, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 20919 + }, + { + "epoch": 0.2092, + "grad_norm": 0.9662819849835595, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 20920 + }, + { + "epoch": 0.20921, + "grad_norm": 0.9497005033603845, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 20921 + }, + { + "epoch": 0.20922, + "grad_norm": 1.0256543509228493, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 20922 + }, + { + "epoch": 0.20923, + "grad_norm": 1.0288801321590189, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 20923 + }, + { + "epoch": 0.20924, + "grad_norm": 0.9575742206763543, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 20924 + }, + { + "epoch": 0.20925, + "grad_norm": 0.9518517727333672, + "learning_rate": 0.003, + "loss": 4.08, + "step": 20925 + }, + { + "epoch": 0.20926, + "grad_norm": 1.0101356656910154, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 20926 + }, + { + "epoch": 0.20927, + "grad_norm": 0.9208202329357542, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 20927 + }, + { + "epoch": 0.20928, + "grad_norm": 0.9321816400545891, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 20928 + }, + { + "epoch": 0.20929, + "grad_norm": 1.0111036566490585, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 20929 + }, + { + "epoch": 0.2093, + "grad_norm": 0.9735364164661735, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 20930 + }, + { + "epoch": 0.20931, + "grad_norm": 0.9459289793368022, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 20931 + }, + { + "epoch": 0.20932, + "grad_norm": 1.0179234302052216, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 20932 + }, + { + "epoch": 0.20933, + "grad_norm": 0.9970085209525912, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 20933 + }, + { + "epoch": 0.20934, + "grad_norm": 1.0353416538699811, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 20934 + }, + { + "epoch": 0.20935, + "grad_norm": 0.9682634285806023, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 20935 + }, + { + "epoch": 0.20936, + "grad_norm": 1.0529142887227032, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 20936 + }, + { + "epoch": 0.20937, + "grad_norm": 0.7584596088651663, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 20937 + }, + { + "epoch": 0.20938, + "grad_norm": 0.8028796370689205, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 20938 + }, + { + "epoch": 0.20939, + "grad_norm": 0.7215962925971722, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 20939 + }, + { + "epoch": 0.2094, + "grad_norm": 0.7726815801453961, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 20940 + }, + { + "epoch": 0.20941, + "grad_norm": 0.8234341681451163, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 20941 + }, + { + "epoch": 0.20942, + "grad_norm": 0.9294759080755696, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 20942 + }, + { + "epoch": 0.20943, + "grad_norm": 1.0540080989455443, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 20943 + }, + { + "epoch": 0.20944, + "grad_norm": 1.0771125126478065, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 20944 + }, + { + "epoch": 0.20945, + "grad_norm": 0.9606589362088807, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20945 + }, + { + "epoch": 0.20946, + "grad_norm": 0.9138274590289285, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 20946 + }, + { + "epoch": 0.20947, + "grad_norm": 0.8336271922049947, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 20947 + }, + { + "epoch": 0.20948, + "grad_norm": 0.7548154530579022, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 20948 + }, + { + "epoch": 0.20949, + "grad_norm": 0.6556948254700824, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 20949 + }, + { + "epoch": 0.2095, + "grad_norm": 0.7527190926661659, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 20950 + }, + { + "epoch": 0.20951, + "grad_norm": 0.7849746003955136, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 20951 + }, + { + "epoch": 0.20952, + "grad_norm": 0.8720462836880155, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 20952 + }, + { + "epoch": 0.20953, + "grad_norm": 1.012691571410559, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 20953 + }, + { + "epoch": 0.20954, + "grad_norm": 0.9642645746189286, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 20954 + }, + { + "epoch": 0.20955, + "grad_norm": 0.906695269261336, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 20955 + }, + { + "epoch": 0.20956, + "grad_norm": 0.8938285305084355, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 20956 + }, + { + "epoch": 0.20957, + "grad_norm": 0.8915711865436335, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 20957 + }, + { + "epoch": 0.20958, + "grad_norm": 0.87353229798638, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 20958 + }, + { + "epoch": 0.20959, + "grad_norm": 0.9904221888857875, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 20959 + }, + { + "epoch": 0.2096, + "grad_norm": 1.2355844583948616, + "learning_rate": 0.003, + "loss": 4.059, + "step": 20960 + }, + { + "epoch": 0.20961, + "grad_norm": 0.9292678513392834, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 20961 + }, + { + "epoch": 0.20962, + "grad_norm": 0.899412887479403, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 20962 + }, + { + "epoch": 0.20963, + "grad_norm": 0.8005233489070702, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 20963 + }, + { + "epoch": 0.20964, + "grad_norm": 0.8393734088076394, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 20964 + }, + { + "epoch": 0.20965, + "grad_norm": 0.9801653239512133, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 20965 + }, + { + "epoch": 0.20966, + "grad_norm": 1.1035705657174708, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 20966 + }, + { + "epoch": 0.20967, + "grad_norm": 0.8951714877176081, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 20967 + }, + { + "epoch": 0.20968, + "grad_norm": 0.9984300213184893, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 20968 + }, + { + "epoch": 0.20969, + "grad_norm": 1.1447322626526333, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 20969 + }, + { + "epoch": 0.2097, + "grad_norm": 0.981719037133992, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 20970 + }, + { + "epoch": 0.20971, + "grad_norm": 0.8274862463046855, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 20971 + }, + { + "epoch": 0.20972, + "grad_norm": 0.6057424137656625, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 20972 + }, + { + "epoch": 0.20973, + "grad_norm": 0.6194221064418588, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 20973 + }, + { + "epoch": 0.20974, + "grad_norm": 0.7136364923130752, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 20974 + }, + { + "epoch": 0.20975, + "grad_norm": 0.7920099308835159, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 20975 + }, + { + "epoch": 0.20976, + "grad_norm": 0.8529938394212799, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 20976 + }, + { + "epoch": 0.20977, + "grad_norm": 0.7476903419204045, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 20977 + }, + { + "epoch": 0.20978, + "grad_norm": 0.7445633455785335, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 20978 + }, + { + "epoch": 0.20979, + "grad_norm": 0.8800103611750361, + "learning_rate": 0.003, + "loss": 4.0988, + "step": 20979 + }, + { + "epoch": 0.2098, + "grad_norm": 0.7821239446012019, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 20980 + }, + { + "epoch": 0.20981, + "grad_norm": 0.7517569464820335, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 20981 + }, + { + "epoch": 0.20982, + "grad_norm": 0.9657130463328368, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 20982 + }, + { + "epoch": 0.20983, + "grad_norm": 1.0867425041569467, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 20983 + }, + { + "epoch": 0.20984, + "grad_norm": 1.0630272947915531, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 20984 + }, + { + "epoch": 0.20985, + "grad_norm": 1.0346921683585952, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 20985 + }, + { + "epoch": 0.20986, + "grad_norm": 1.0325122421335289, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 20986 + }, + { + "epoch": 0.20987, + "grad_norm": 1.033786469848335, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 20987 + }, + { + "epoch": 0.20988, + "grad_norm": 0.940109667780733, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 20988 + }, + { + "epoch": 0.20989, + "grad_norm": 1.0250377702460591, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 20989 + }, + { + "epoch": 0.2099, + "grad_norm": 0.9870851149142993, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 20990 + }, + { + "epoch": 0.20991, + "grad_norm": 1.0900184477933434, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 20991 + }, + { + "epoch": 0.20992, + "grad_norm": 0.9224192173796866, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 20992 + }, + { + "epoch": 0.20993, + "grad_norm": 0.7347907145935488, + "learning_rate": 0.003, + "loss": 4.1103, + "step": 20993 + }, + { + "epoch": 0.20994, + "grad_norm": 0.6494314159332513, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 20994 + }, + { + "epoch": 0.20995, + "grad_norm": 0.6223911937898218, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 20995 + }, + { + "epoch": 0.20996, + "grad_norm": 0.5943403133762641, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 20996 + }, + { + "epoch": 0.20997, + "grad_norm": 0.6413854579845952, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 20997 + }, + { + "epoch": 0.20998, + "grad_norm": 0.7006561359930732, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 20998 + }, + { + "epoch": 0.20999, + "grad_norm": 0.682415504085104, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 20999 + }, + { + "epoch": 0.21, + "grad_norm": 0.6998869133750484, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 21000 + }, + { + "epoch": 0.21001, + "grad_norm": 0.6435399026341386, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 21001 + }, + { + "epoch": 0.21002, + "grad_norm": 0.6273031068741116, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 21002 + }, + { + "epoch": 0.21003, + "grad_norm": 0.673517232813251, + "learning_rate": 0.003, + "loss": 4.042, + "step": 21003 + }, + { + "epoch": 0.21004, + "grad_norm": 0.7430702012339945, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 21004 + }, + { + "epoch": 0.21005, + "grad_norm": 0.9560227725435448, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 21005 + }, + { + "epoch": 0.21006, + "grad_norm": 1.2456053948463115, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 21006 + }, + { + "epoch": 0.21007, + "grad_norm": 1.0358346003790344, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 21007 + }, + { + "epoch": 0.21008, + "grad_norm": 0.9437335127085855, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 21008 + }, + { + "epoch": 0.21009, + "grad_norm": 0.9332984345554216, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 21009 + }, + { + "epoch": 0.2101, + "grad_norm": 0.9413725326339256, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 21010 + }, + { + "epoch": 0.21011, + "grad_norm": 1.1666496645699866, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 21011 + }, + { + "epoch": 0.21012, + "grad_norm": 1.0288992373381118, + "learning_rate": 0.003, + "loss": 4.076, + "step": 21012 + }, + { + "epoch": 0.21013, + "grad_norm": 0.8636091001155501, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 21013 + }, + { + "epoch": 0.21014, + "grad_norm": 0.8403243600059537, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 21014 + }, + { + "epoch": 0.21015, + "grad_norm": 0.8784863249597779, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 21015 + }, + { + "epoch": 0.21016, + "grad_norm": 0.8631408808007526, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21016 + }, + { + "epoch": 0.21017, + "grad_norm": 0.8644516790702244, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21017 + }, + { + "epoch": 0.21018, + "grad_norm": 0.9616570865165733, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 21018 + }, + { + "epoch": 0.21019, + "grad_norm": 0.9906264791766691, + "learning_rate": 0.003, + "loss": 4.046, + "step": 21019 + }, + { + "epoch": 0.2102, + "grad_norm": 1.1804328901765606, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 21020 + }, + { + "epoch": 0.21021, + "grad_norm": 1.0531486649930049, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 21021 + }, + { + "epoch": 0.21022, + "grad_norm": 1.084725270812928, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 21022 + }, + { + "epoch": 0.21023, + "grad_norm": 1.0324941560813041, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 21023 + }, + { + "epoch": 0.21024, + "grad_norm": 1.0161370054174468, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 21024 + }, + { + "epoch": 0.21025, + "grad_norm": 1.0137450360714135, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21025 + }, + { + "epoch": 0.21026, + "grad_norm": 1.1833859414069834, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 21026 + }, + { + "epoch": 0.21027, + "grad_norm": 0.8923078275673527, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 21027 + }, + { + "epoch": 0.21028, + "grad_norm": 0.8130443215360488, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 21028 + }, + { + "epoch": 0.21029, + "grad_norm": 0.7670036528086152, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 21029 + }, + { + "epoch": 0.2103, + "grad_norm": 0.6912671653028333, + "learning_rate": 0.003, + "loss": 4.068, + "step": 21030 + }, + { + "epoch": 0.21031, + "grad_norm": 0.6086060166202992, + "learning_rate": 0.003, + "loss": 4.048, + "step": 21031 + }, + { + "epoch": 0.21032, + "grad_norm": 0.7017603978899594, + "learning_rate": 0.003, + "loss": 4.049, + "step": 21032 + }, + { + "epoch": 0.21033, + "grad_norm": 0.7622072450090098, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 21033 + }, + { + "epoch": 0.21034, + "grad_norm": 0.8642013213533124, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 21034 + }, + { + "epoch": 0.21035, + "grad_norm": 1.0029696165418491, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 21035 + }, + { + "epoch": 0.21036, + "grad_norm": 1.2489051096803996, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21036 + }, + { + "epoch": 0.21037, + "grad_norm": 0.7941122063837073, + "learning_rate": 0.003, + "loss": 4.056, + "step": 21037 + }, + { + "epoch": 0.21038, + "grad_norm": 0.822774509536195, + "learning_rate": 0.003, + "loss": 4.072, + "step": 21038 + }, + { + "epoch": 0.21039, + "grad_norm": 0.9504645683506947, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 21039 + }, + { + "epoch": 0.2104, + "grad_norm": 1.2007252057555626, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 21040 + }, + { + "epoch": 0.21041, + "grad_norm": 0.8525491884398844, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 21041 + }, + { + "epoch": 0.21042, + "grad_norm": 0.7650256027036704, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21042 + }, + { + "epoch": 0.21043, + "grad_norm": 0.7397310280269651, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 21043 + }, + { + "epoch": 0.21044, + "grad_norm": 0.7557234288434997, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21044 + }, + { + "epoch": 0.21045, + "grad_norm": 0.8213142899806527, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 21045 + }, + { + "epoch": 0.21046, + "grad_norm": 0.7997509105781899, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 21046 + }, + { + "epoch": 0.21047, + "grad_norm": 0.7419456621563829, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 21047 + }, + { + "epoch": 0.21048, + "grad_norm": 0.8268305174669067, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 21048 + }, + { + "epoch": 0.21049, + "grad_norm": 1.0048050806880642, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 21049 + }, + { + "epoch": 0.2105, + "grad_norm": 1.0325392092503214, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 21050 + }, + { + "epoch": 0.21051, + "grad_norm": 0.8314267179215811, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 21051 + }, + { + "epoch": 0.21052, + "grad_norm": 0.786931387085538, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 21052 + }, + { + "epoch": 0.21053, + "grad_norm": 0.698446646816597, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 21053 + }, + { + "epoch": 0.21054, + "grad_norm": 0.7073659733355145, + "learning_rate": 0.003, + "loss": 4.071, + "step": 21054 + }, + { + "epoch": 0.21055, + "grad_norm": 0.7162249843379157, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 21055 + }, + { + "epoch": 0.21056, + "grad_norm": 0.7590214084202431, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 21056 + }, + { + "epoch": 0.21057, + "grad_norm": 0.7979945720370192, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 21057 + }, + { + "epoch": 0.21058, + "grad_norm": 0.8619043099997161, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 21058 + }, + { + "epoch": 0.21059, + "grad_norm": 1.101043783627659, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 21059 + }, + { + "epoch": 0.2106, + "grad_norm": 1.071976055417973, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 21060 + }, + { + "epoch": 0.21061, + "grad_norm": 0.9616464727901872, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 21061 + }, + { + "epoch": 0.21062, + "grad_norm": 1.016752772330011, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21062 + }, + { + "epoch": 0.21063, + "grad_norm": 1.0098029454164281, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 21063 + }, + { + "epoch": 0.21064, + "grad_norm": 0.9269076398128128, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 21064 + }, + { + "epoch": 0.21065, + "grad_norm": 0.8422468539490227, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 21065 + }, + { + "epoch": 0.21066, + "grad_norm": 0.8183083063942778, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 21066 + }, + { + "epoch": 0.21067, + "grad_norm": 0.7730981172085367, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 21067 + }, + { + "epoch": 0.21068, + "grad_norm": 0.8351167347595602, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 21068 + }, + { + "epoch": 0.21069, + "grad_norm": 0.7623075012114217, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 21069 + }, + { + "epoch": 0.2107, + "grad_norm": 0.811489738558358, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 21070 + }, + { + "epoch": 0.21071, + "grad_norm": 0.7183944988996523, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 21071 + }, + { + "epoch": 0.21072, + "grad_norm": 0.7226136180078927, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 21072 + }, + { + "epoch": 0.21073, + "grad_norm": 0.9121160681331745, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 21073 + }, + { + "epoch": 0.21074, + "grad_norm": 1.1824481576173265, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21074 + }, + { + "epoch": 0.21075, + "grad_norm": 1.0709399945555922, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 21075 + }, + { + "epoch": 0.21076, + "grad_norm": 1.1417092377978277, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 21076 + }, + { + "epoch": 0.21077, + "grad_norm": 0.9127570254238125, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 21077 + }, + { + "epoch": 0.21078, + "grad_norm": 0.7998226678057007, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 21078 + }, + { + "epoch": 0.21079, + "grad_norm": 0.735479029798332, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 21079 + }, + { + "epoch": 0.2108, + "grad_norm": 0.9112384260161761, + "learning_rate": 0.003, + "loss": 4.017, + "step": 21080 + }, + { + "epoch": 0.21081, + "grad_norm": 1.050632586819599, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 21081 + }, + { + "epoch": 0.21082, + "grad_norm": 1.0501659096074498, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21082 + }, + { + "epoch": 0.21083, + "grad_norm": 1.014205008967147, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 21083 + }, + { + "epoch": 0.21084, + "grad_norm": 0.9561378141575905, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 21084 + }, + { + "epoch": 0.21085, + "grad_norm": 0.9482915838054728, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21085 + }, + { + "epoch": 0.21086, + "grad_norm": 1.0836474842428978, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 21086 + }, + { + "epoch": 0.21087, + "grad_norm": 0.9506211186558229, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 21087 + }, + { + "epoch": 0.21088, + "grad_norm": 0.9675611930060828, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 21088 + }, + { + "epoch": 0.21089, + "grad_norm": 1.0861416219497653, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 21089 + }, + { + "epoch": 0.2109, + "grad_norm": 1.026866491003027, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 21090 + }, + { + "epoch": 0.21091, + "grad_norm": 1.1113761885995883, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 21091 + }, + { + "epoch": 0.21092, + "grad_norm": 0.782651591909814, + "learning_rate": 0.003, + "loss": 4.078, + "step": 21092 + }, + { + "epoch": 0.21093, + "grad_norm": 0.6976526250302678, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 21093 + }, + { + "epoch": 0.21094, + "grad_norm": 0.7023845356491255, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21094 + }, + { + "epoch": 0.21095, + "grad_norm": 0.6796931345316704, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 21095 + }, + { + "epoch": 0.21096, + "grad_norm": 0.6974224560446238, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 21096 + }, + { + "epoch": 0.21097, + "grad_norm": 0.8763911602676883, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 21097 + }, + { + "epoch": 0.21098, + "grad_norm": 1.1301164496685343, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21098 + }, + { + "epoch": 0.21099, + "grad_norm": 1.0560107088308537, + "learning_rate": 0.003, + "loss": 4.04, + "step": 21099 + }, + { + "epoch": 0.211, + "grad_norm": 0.914678192060454, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21100 + }, + { + "epoch": 0.21101, + "grad_norm": 0.728814810016415, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 21101 + }, + { + "epoch": 0.21102, + "grad_norm": 0.7043960115867001, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 21102 + }, + { + "epoch": 0.21103, + "grad_norm": 0.731339414118106, + "learning_rate": 0.003, + "loss": 4.058, + "step": 21103 + }, + { + "epoch": 0.21104, + "grad_norm": 0.6503725794176464, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 21104 + }, + { + "epoch": 0.21105, + "grad_norm": 0.5908475393905657, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 21105 + }, + { + "epoch": 0.21106, + "grad_norm": 0.6944105738869012, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21106 + }, + { + "epoch": 0.21107, + "grad_norm": 0.6728185090033827, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 21107 + }, + { + "epoch": 0.21108, + "grad_norm": 0.6387129313466028, + "learning_rate": 0.003, + "loss": 4.052, + "step": 21108 + }, + { + "epoch": 0.21109, + "grad_norm": 0.8395486307179711, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 21109 + }, + { + "epoch": 0.2111, + "grad_norm": 1.032022023581922, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 21110 + }, + { + "epoch": 0.21111, + "grad_norm": 1.2141059703309585, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 21111 + }, + { + "epoch": 0.21112, + "grad_norm": 1.003124856862386, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 21112 + }, + { + "epoch": 0.21113, + "grad_norm": 1.2313198953043571, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 21113 + }, + { + "epoch": 0.21114, + "grad_norm": 0.9446560106026629, + "learning_rate": 0.003, + "loss": 4.08, + "step": 21114 + }, + { + "epoch": 0.21115, + "grad_norm": 0.9690466848006403, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 21115 + }, + { + "epoch": 0.21116, + "grad_norm": 0.7765049882661763, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 21116 + }, + { + "epoch": 0.21117, + "grad_norm": 0.7674281129106878, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 21117 + }, + { + "epoch": 0.21118, + "grad_norm": 0.8050658312568353, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 21118 + }, + { + "epoch": 0.21119, + "grad_norm": 0.9527115024060966, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 21119 + }, + { + "epoch": 0.2112, + "grad_norm": 1.0664148839093905, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 21120 + }, + { + "epoch": 0.21121, + "grad_norm": 0.8989309264153537, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 21121 + }, + { + "epoch": 0.21122, + "grad_norm": 0.8553488932637456, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 21122 + }, + { + "epoch": 0.21123, + "grad_norm": 0.8530798206832544, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21123 + }, + { + "epoch": 0.21124, + "grad_norm": 0.8607838825788022, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 21124 + }, + { + "epoch": 0.21125, + "grad_norm": 0.8716327668518643, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 21125 + }, + { + "epoch": 0.21126, + "grad_norm": 0.9891438273656992, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 21126 + }, + { + "epoch": 0.21127, + "grad_norm": 1.1312964391301088, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 21127 + }, + { + "epoch": 0.21128, + "grad_norm": 0.8552845659556313, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21128 + }, + { + "epoch": 0.21129, + "grad_norm": 0.7829051331214251, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 21129 + }, + { + "epoch": 0.2113, + "grad_norm": 0.8032688655307006, + "learning_rate": 0.003, + "loss": 4.056, + "step": 21130 + }, + { + "epoch": 0.21131, + "grad_norm": 0.887240068118571, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 21131 + }, + { + "epoch": 0.21132, + "grad_norm": 1.0255753122391946, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 21132 + }, + { + "epoch": 0.21133, + "grad_norm": 0.9770052068304652, + "learning_rate": 0.003, + "loss": 4.027, + "step": 21133 + }, + { + "epoch": 0.21134, + "grad_norm": 1.0838121625918715, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 21134 + }, + { + "epoch": 0.21135, + "grad_norm": 1.0212083668102394, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 21135 + }, + { + "epoch": 0.21136, + "grad_norm": 0.9957979334382595, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 21136 + }, + { + "epoch": 0.21137, + "grad_norm": 0.9654371177807874, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 21137 + }, + { + "epoch": 0.21138, + "grad_norm": 0.9935918355132888, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 21138 + }, + { + "epoch": 0.21139, + "grad_norm": 0.9798029322391021, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 21139 + }, + { + "epoch": 0.2114, + "grad_norm": 1.0888142750803371, + "learning_rate": 0.003, + "loss": 4.079, + "step": 21140 + }, + { + "epoch": 0.21141, + "grad_norm": 0.8104995200408619, + "learning_rate": 0.003, + "loss": 4.05, + "step": 21141 + }, + { + "epoch": 0.21142, + "grad_norm": 0.7112571958199563, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 21142 + }, + { + "epoch": 0.21143, + "grad_norm": 0.7255826242780293, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21143 + }, + { + "epoch": 0.21144, + "grad_norm": 0.793102625204424, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 21144 + }, + { + "epoch": 0.21145, + "grad_norm": 0.8317932635144787, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 21145 + }, + { + "epoch": 0.21146, + "grad_norm": 0.8967437236032432, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21146 + }, + { + "epoch": 0.21147, + "grad_norm": 0.8759587248159304, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 21147 + }, + { + "epoch": 0.21148, + "grad_norm": 0.8234335022952023, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 21148 + }, + { + "epoch": 0.21149, + "grad_norm": 0.8347779727113863, + "learning_rate": 0.003, + "loss": 4.091, + "step": 21149 + }, + { + "epoch": 0.2115, + "grad_norm": 0.8771653489074704, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 21150 + }, + { + "epoch": 0.21151, + "grad_norm": 0.8868562373035072, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 21151 + }, + { + "epoch": 0.21152, + "grad_norm": 0.9260194636969789, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 21152 + }, + { + "epoch": 0.21153, + "grad_norm": 1.0278428326891138, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 21153 + }, + { + "epoch": 0.21154, + "grad_norm": 0.8406737049790429, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 21154 + }, + { + "epoch": 0.21155, + "grad_norm": 0.7115851023266901, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21155 + }, + { + "epoch": 0.21156, + "grad_norm": 0.6527447656067576, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 21156 + }, + { + "epoch": 0.21157, + "grad_norm": 0.8198752729573484, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21157 + }, + { + "epoch": 0.21158, + "grad_norm": 0.9566103901226111, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21158 + }, + { + "epoch": 0.21159, + "grad_norm": 1.0908284159853958, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 21159 + }, + { + "epoch": 0.2116, + "grad_norm": 0.9359106029478939, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 21160 + }, + { + "epoch": 0.21161, + "grad_norm": 0.864010193572614, + "learning_rate": 0.003, + "loss": 4.047, + "step": 21161 + }, + { + "epoch": 0.21162, + "grad_norm": 0.8075852395167353, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 21162 + }, + { + "epoch": 0.21163, + "grad_norm": 0.9827122186692935, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 21163 + }, + { + "epoch": 0.21164, + "grad_norm": 1.2260172194200103, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 21164 + }, + { + "epoch": 0.21165, + "grad_norm": 0.8469270663970344, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 21165 + }, + { + "epoch": 0.21166, + "grad_norm": 0.7684862743742701, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 21166 + }, + { + "epoch": 0.21167, + "grad_norm": 0.7819136830951748, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 21167 + }, + { + "epoch": 0.21168, + "grad_norm": 0.8139741309161088, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 21168 + }, + { + "epoch": 0.21169, + "grad_norm": 0.9138785383857093, + "learning_rate": 0.003, + "loss": 4.059, + "step": 21169 + }, + { + "epoch": 0.2117, + "grad_norm": 0.9330372982311411, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 21170 + }, + { + "epoch": 0.21171, + "grad_norm": 0.8708103617659637, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 21171 + }, + { + "epoch": 0.21172, + "grad_norm": 0.9418160284270444, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 21172 + }, + { + "epoch": 0.21173, + "grad_norm": 1.0376885824291926, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 21173 + }, + { + "epoch": 0.21174, + "grad_norm": 0.9404880236615147, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 21174 + }, + { + "epoch": 0.21175, + "grad_norm": 0.9934125181461049, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 21175 + }, + { + "epoch": 0.21176, + "grad_norm": 1.1353149684021964, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 21176 + }, + { + "epoch": 0.21177, + "grad_norm": 0.8201070903363143, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21177 + }, + { + "epoch": 0.21178, + "grad_norm": 0.8173249937295153, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 21178 + }, + { + "epoch": 0.21179, + "grad_norm": 0.8997426303253961, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 21179 + }, + { + "epoch": 0.2118, + "grad_norm": 0.9041938656269032, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 21180 + }, + { + "epoch": 0.21181, + "grad_norm": 0.8587673964737166, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 21181 + }, + { + "epoch": 0.21182, + "grad_norm": 1.0796240432190018, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 21182 + }, + { + "epoch": 0.21183, + "grad_norm": 1.0340901099432858, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 21183 + }, + { + "epoch": 0.21184, + "grad_norm": 1.0284079048161205, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 21184 + }, + { + "epoch": 0.21185, + "grad_norm": 0.9837865666573926, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 21185 + }, + { + "epoch": 0.21186, + "grad_norm": 0.999957853922835, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 21186 + }, + { + "epoch": 0.21187, + "grad_norm": 1.0867239755988904, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21187 + }, + { + "epoch": 0.21188, + "grad_norm": 0.8751720863136792, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 21188 + }, + { + "epoch": 0.21189, + "grad_norm": 0.7883284206571608, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21189 + }, + { + "epoch": 0.2119, + "grad_norm": 0.833843271179596, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 21190 + }, + { + "epoch": 0.21191, + "grad_norm": 0.7945471805342696, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 21191 + }, + { + "epoch": 0.21192, + "grad_norm": 0.859568000810372, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21192 + }, + { + "epoch": 0.21193, + "grad_norm": 0.9499322342318987, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 21193 + }, + { + "epoch": 0.21194, + "grad_norm": 1.1699675423333233, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 21194 + }, + { + "epoch": 0.21195, + "grad_norm": 0.8798374398677336, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 21195 + }, + { + "epoch": 0.21196, + "grad_norm": 0.8420315326759298, + "learning_rate": 0.003, + "loss": 4.075, + "step": 21196 + }, + { + "epoch": 0.21197, + "grad_norm": 0.7435566165737909, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 21197 + }, + { + "epoch": 0.21198, + "grad_norm": 0.790755597947907, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21198 + }, + { + "epoch": 0.21199, + "grad_norm": 0.7636089927077947, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 21199 + }, + { + "epoch": 0.212, + "grad_norm": 0.736372345884053, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 21200 + }, + { + "epoch": 0.21201, + "grad_norm": 0.7987946497201371, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 21201 + }, + { + "epoch": 0.21202, + "grad_norm": 1.0621123139084045, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 21202 + }, + { + "epoch": 0.21203, + "grad_norm": 1.0043085045508686, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 21203 + }, + { + "epoch": 0.21204, + "grad_norm": 0.9689772473591405, + "learning_rate": 0.003, + "loss": 4.064, + "step": 21204 + }, + { + "epoch": 0.21205, + "grad_norm": 0.8777533125102754, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 21205 + }, + { + "epoch": 0.21206, + "grad_norm": 0.9267300075194698, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 21206 + }, + { + "epoch": 0.21207, + "grad_norm": 0.9623997678273425, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 21207 + }, + { + "epoch": 0.21208, + "grad_norm": 1.0258782821472325, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 21208 + }, + { + "epoch": 0.21209, + "grad_norm": 0.9868977199698608, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 21209 + }, + { + "epoch": 0.2121, + "grad_norm": 0.8702205824102925, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 21210 + }, + { + "epoch": 0.21211, + "grad_norm": 0.9318447210972713, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 21211 + }, + { + "epoch": 0.21212, + "grad_norm": 1.080939522993914, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 21212 + }, + { + "epoch": 0.21213, + "grad_norm": 1.0355856763240312, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 21213 + }, + { + "epoch": 0.21214, + "grad_norm": 1.0599578015664297, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 21214 + }, + { + "epoch": 0.21215, + "grad_norm": 0.8878829110296694, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 21215 + }, + { + "epoch": 0.21216, + "grad_norm": 0.7177601093066023, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 21216 + }, + { + "epoch": 0.21217, + "grad_norm": 0.7111340447626696, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 21217 + }, + { + "epoch": 0.21218, + "grad_norm": 0.7273870077396913, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 21218 + }, + { + "epoch": 0.21219, + "grad_norm": 0.716870960657489, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 21219 + }, + { + "epoch": 0.2122, + "grad_norm": 0.7547271994958835, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 21220 + }, + { + "epoch": 0.21221, + "grad_norm": 0.9258241847127848, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 21221 + }, + { + "epoch": 0.21222, + "grad_norm": 1.084845220900056, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 21222 + }, + { + "epoch": 0.21223, + "grad_norm": 1.0505118625844345, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 21223 + }, + { + "epoch": 0.21224, + "grad_norm": 0.9754180762791759, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 21224 + }, + { + "epoch": 0.21225, + "grad_norm": 1.1567928169511146, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 21225 + }, + { + "epoch": 0.21226, + "grad_norm": 1.100104882338202, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 21226 + }, + { + "epoch": 0.21227, + "grad_norm": 0.7957564511547448, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21227 + }, + { + "epoch": 0.21228, + "grad_norm": 0.7075555210106396, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 21228 + }, + { + "epoch": 0.21229, + "grad_norm": 0.7652379480633924, + "learning_rate": 0.003, + "loss": 4.06, + "step": 21229 + }, + { + "epoch": 0.2123, + "grad_norm": 0.7981211742797875, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 21230 + }, + { + "epoch": 0.21231, + "grad_norm": 0.8404954939470825, + "learning_rate": 0.003, + "loss": 4.072, + "step": 21231 + }, + { + "epoch": 0.21232, + "grad_norm": 0.8486995279467043, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 21232 + }, + { + "epoch": 0.21233, + "grad_norm": 1.0226108619112986, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21233 + }, + { + "epoch": 0.21234, + "grad_norm": 1.0786944266582215, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21234 + }, + { + "epoch": 0.21235, + "grad_norm": 0.8269548201711181, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 21235 + }, + { + "epoch": 0.21236, + "grad_norm": 0.7566574607208635, + "learning_rate": 0.003, + "loss": 4.034, + "step": 21236 + }, + { + "epoch": 0.21237, + "grad_norm": 0.8561778377388048, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21237 + }, + { + "epoch": 0.21238, + "grad_norm": 0.9009112049680555, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 21238 + }, + { + "epoch": 0.21239, + "grad_norm": 0.7869095725484021, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 21239 + }, + { + "epoch": 0.2124, + "grad_norm": 0.8015076597896063, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 21240 + }, + { + "epoch": 0.21241, + "grad_norm": 0.8916400926794642, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 21241 + }, + { + "epoch": 0.21242, + "grad_norm": 0.8593105568320265, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 21242 + }, + { + "epoch": 0.21243, + "grad_norm": 0.8642499020925598, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 21243 + }, + { + "epoch": 0.21244, + "grad_norm": 0.9351452208459016, + "learning_rate": 0.003, + "loss": 4.05, + "step": 21244 + }, + { + "epoch": 0.21245, + "grad_norm": 1.051505354109061, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 21245 + }, + { + "epoch": 0.21246, + "grad_norm": 1.211082749679639, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 21246 + }, + { + "epoch": 0.21247, + "grad_norm": 1.0324620044571742, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 21247 + }, + { + "epoch": 0.21248, + "grad_norm": 1.0528376039558933, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21248 + }, + { + "epoch": 0.21249, + "grad_norm": 1.0172951092601181, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 21249 + }, + { + "epoch": 0.2125, + "grad_norm": 1.0455916547903843, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 21250 + }, + { + "epoch": 0.21251, + "grad_norm": 0.9721666841995168, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 21251 + }, + { + "epoch": 0.21252, + "grad_norm": 0.9354980752446461, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 21252 + }, + { + "epoch": 0.21253, + "grad_norm": 0.8235369177814564, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 21253 + }, + { + "epoch": 0.21254, + "grad_norm": 0.708241936578851, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 21254 + }, + { + "epoch": 0.21255, + "grad_norm": 0.6590801615768045, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 21255 + }, + { + "epoch": 0.21256, + "grad_norm": 0.7687869672904036, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 21256 + }, + { + "epoch": 0.21257, + "grad_norm": 0.8110294457074578, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 21257 + }, + { + "epoch": 0.21258, + "grad_norm": 0.8835741457620595, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 21258 + }, + { + "epoch": 0.21259, + "grad_norm": 1.031106931791354, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 21259 + }, + { + "epoch": 0.2126, + "grad_norm": 1.0822589123455826, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 21260 + }, + { + "epoch": 0.21261, + "grad_norm": 0.8988172268989614, + "learning_rate": 0.003, + "loss": 4.062, + "step": 21261 + }, + { + "epoch": 0.21262, + "grad_norm": 0.7603041652393627, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 21262 + }, + { + "epoch": 0.21263, + "grad_norm": 0.6376354168758694, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 21263 + }, + { + "epoch": 0.21264, + "grad_norm": 0.6667641683867043, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 21264 + }, + { + "epoch": 0.21265, + "grad_norm": 0.6712455135629539, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 21265 + }, + { + "epoch": 0.21266, + "grad_norm": 0.6568901608609755, + "learning_rate": 0.003, + "loss": 4.042, + "step": 21266 + }, + { + "epoch": 0.21267, + "grad_norm": 0.6299236679437372, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 21267 + }, + { + "epoch": 0.21268, + "grad_norm": 0.6619955714541537, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 21268 + }, + { + "epoch": 0.21269, + "grad_norm": 0.7720190038476618, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 21269 + }, + { + "epoch": 0.2127, + "grad_norm": 0.83960982461192, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 21270 + }, + { + "epoch": 0.21271, + "grad_norm": 0.816094079275652, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 21271 + }, + { + "epoch": 0.21272, + "grad_norm": 0.9558392509854234, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21272 + }, + { + "epoch": 0.21273, + "grad_norm": 1.192041026982684, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 21273 + }, + { + "epoch": 0.21274, + "grad_norm": 0.8334068208891056, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 21274 + }, + { + "epoch": 0.21275, + "grad_norm": 0.8147451132296317, + "learning_rate": 0.003, + "loss": 4.008, + "step": 21275 + }, + { + "epoch": 0.21276, + "grad_norm": 0.8069330768413078, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 21276 + }, + { + "epoch": 0.21277, + "grad_norm": 0.9188295233850001, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 21277 + }, + { + "epoch": 0.21278, + "grad_norm": 1.23921945828254, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 21278 + }, + { + "epoch": 0.21279, + "grad_norm": 1.0243127214157668, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21279 + }, + { + "epoch": 0.2128, + "grad_norm": 0.949600296006374, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 21280 + }, + { + "epoch": 0.21281, + "grad_norm": 0.8667360246932553, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 21281 + }, + { + "epoch": 0.21282, + "grad_norm": 0.8882339822228139, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21282 + }, + { + "epoch": 0.21283, + "grad_norm": 0.8440891879162166, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21283 + }, + { + "epoch": 0.21284, + "grad_norm": 0.8432786676373919, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 21284 + }, + { + "epoch": 0.21285, + "grad_norm": 0.9727308305242739, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 21285 + }, + { + "epoch": 0.21286, + "grad_norm": 1.1983776814240794, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 21286 + }, + { + "epoch": 0.21287, + "grad_norm": 0.8130550644223, + "learning_rate": 0.003, + "loss": 4.046, + "step": 21287 + }, + { + "epoch": 0.21288, + "grad_norm": 0.7421248628961178, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 21288 + }, + { + "epoch": 0.21289, + "grad_norm": 0.8242295095790686, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 21289 + }, + { + "epoch": 0.2129, + "grad_norm": 0.8853489494645336, + "learning_rate": 0.003, + "loss": 4.024, + "step": 21290 + }, + { + "epoch": 0.21291, + "grad_norm": 1.0966030917056349, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 21291 + }, + { + "epoch": 0.21292, + "grad_norm": 1.1923335315819226, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 21292 + }, + { + "epoch": 0.21293, + "grad_norm": 0.8737413966416677, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 21293 + }, + { + "epoch": 0.21294, + "grad_norm": 0.8072464878105456, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 21294 + }, + { + "epoch": 0.21295, + "grad_norm": 0.8197551822012741, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 21295 + }, + { + "epoch": 0.21296, + "grad_norm": 0.7666384684898897, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21296 + }, + { + "epoch": 0.21297, + "grad_norm": 0.8659945444239481, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21297 + }, + { + "epoch": 0.21298, + "grad_norm": 1.1660665394360092, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 21298 + }, + { + "epoch": 0.21299, + "grad_norm": 1.023780513710772, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 21299 + }, + { + "epoch": 0.213, + "grad_norm": 1.0678845657179334, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 21300 + }, + { + "epoch": 0.21301, + "grad_norm": 0.863857021155737, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 21301 + }, + { + "epoch": 0.21302, + "grad_norm": 0.775615516091285, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 21302 + }, + { + "epoch": 0.21303, + "grad_norm": 0.7506875613355735, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 21303 + }, + { + "epoch": 0.21304, + "grad_norm": 0.7135119295797271, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 21304 + }, + { + "epoch": 0.21305, + "grad_norm": 0.7035218754708156, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 21305 + }, + { + "epoch": 0.21306, + "grad_norm": 0.6711849400837023, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 21306 + }, + { + "epoch": 0.21307, + "grad_norm": 0.7781252451033537, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 21307 + }, + { + "epoch": 0.21308, + "grad_norm": 0.9939189260062887, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21308 + }, + { + "epoch": 0.21309, + "grad_norm": 1.156881142245501, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 21309 + }, + { + "epoch": 0.2131, + "grad_norm": 0.7792490413634692, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 21310 + }, + { + "epoch": 0.21311, + "grad_norm": 0.7319875650903732, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 21311 + }, + { + "epoch": 0.21312, + "grad_norm": 0.795710184351996, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 21312 + }, + { + "epoch": 0.21313, + "grad_norm": 0.7345238860166494, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 21313 + }, + { + "epoch": 0.21314, + "grad_norm": 0.7004808014406678, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 21314 + }, + { + "epoch": 0.21315, + "grad_norm": 0.7031313239306645, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21315 + }, + { + "epoch": 0.21316, + "grad_norm": 0.8075956495669555, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 21316 + }, + { + "epoch": 0.21317, + "grad_norm": 0.9973050178910247, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 21317 + }, + { + "epoch": 0.21318, + "grad_norm": 1.3681928645941812, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 21318 + }, + { + "epoch": 0.21319, + "grad_norm": 0.7330198948212318, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 21319 + }, + { + "epoch": 0.2132, + "grad_norm": 0.7739497247246485, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 21320 + }, + { + "epoch": 0.21321, + "grad_norm": 0.8460461446146745, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21321 + }, + { + "epoch": 0.21322, + "grad_norm": 0.9394888310669375, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21322 + }, + { + "epoch": 0.21323, + "grad_norm": 1.1570726057632688, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 21323 + }, + { + "epoch": 0.21324, + "grad_norm": 0.8497915856023048, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 21324 + }, + { + "epoch": 0.21325, + "grad_norm": 0.8038646541488577, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21325 + }, + { + "epoch": 0.21326, + "grad_norm": 0.8961067657186734, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 21326 + }, + { + "epoch": 0.21327, + "grad_norm": 1.0219810982118778, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 21327 + }, + { + "epoch": 0.21328, + "grad_norm": 1.1051294523482973, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 21328 + }, + { + "epoch": 0.21329, + "grad_norm": 0.9889025080245338, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21329 + }, + { + "epoch": 0.2133, + "grad_norm": 1.0100949131470505, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21330 + }, + { + "epoch": 0.21331, + "grad_norm": 1.126585164504275, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 21331 + }, + { + "epoch": 0.21332, + "grad_norm": 0.7962512890266764, + "learning_rate": 0.003, + "loss": 4.068, + "step": 21332 + }, + { + "epoch": 0.21333, + "grad_norm": 0.8522845771627242, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 21333 + }, + { + "epoch": 0.21334, + "grad_norm": 0.9526029103214415, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21334 + }, + { + "epoch": 0.21335, + "grad_norm": 1.1669842014286143, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 21335 + }, + { + "epoch": 0.21336, + "grad_norm": 0.8933792119256343, + "learning_rate": 0.003, + "loss": 4.056, + "step": 21336 + }, + { + "epoch": 0.21337, + "grad_norm": 0.877876816362361, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 21337 + }, + { + "epoch": 0.21338, + "grad_norm": 0.9273135760349166, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 21338 + }, + { + "epoch": 0.21339, + "grad_norm": 0.9318291343925488, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 21339 + }, + { + "epoch": 0.2134, + "grad_norm": 0.8403932154757081, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21340 + }, + { + "epoch": 0.21341, + "grad_norm": 0.8695387354890809, + "learning_rate": 0.003, + "loss": 4.076, + "step": 21341 + }, + { + "epoch": 0.21342, + "grad_norm": 0.9400148271112372, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 21342 + }, + { + "epoch": 0.21343, + "grad_norm": 1.113601714658504, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 21343 + }, + { + "epoch": 0.21344, + "grad_norm": 1.0866489269900732, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 21344 + }, + { + "epoch": 0.21345, + "grad_norm": 1.1229349825443453, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21345 + }, + { + "epoch": 0.21346, + "grad_norm": 0.9405376207001583, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21346 + }, + { + "epoch": 0.21347, + "grad_norm": 0.9500346961923676, + "learning_rate": 0.003, + "loss": 4.078, + "step": 21347 + }, + { + "epoch": 0.21348, + "grad_norm": 0.8218563266258336, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 21348 + }, + { + "epoch": 0.21349, + "grad_norm": 0.834416220996816, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 21349 + }, + { + "epoch": 0.2135, + "grad_norm": 0.9078981301095694, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 21350 + }, + { + "epoch": 0.21351, + "grad_norm": 1.1390345127660626, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 21351 + }, + { + "epoch": 0.21352, + "grad_norm": 0.9086358048852432, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 21352 + }, + { + "epoch": 0.21353, + "grad_norm": 0.8204596465265185, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 21353 + }, + { + "epoch": 0.21354, + "grad_norm": 0.9019404314427433, + "learning_rate": 0.003, + "loss": 4.069, + "step": 21354 + }, + { + "epoch": 0.21355, + "grad_norm": 0.920367402924881, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 21355 + }, + { + "epoch": 0.21356, + "grad_norm": 0.7973367884096413, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21356 + }, + { + "epoch": 0.21357, + "grad_norm": 0.6930357605376251, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21357 + }, + { + "epoch": 0.21358, + "grad_norm": 0.7407767863293523, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21358 + }, + { + "epoch": 0.21359, + "grad_norm": 0.7842652627237436, + "learning_rate": 0.003, + "loss": 4.044, + "step": 21359 + }, + { + "epoch": 0.2136, + "grad_norm": 0.8332080172008113, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 21360 + }, + { + "epoch": 0.21361, + "grad_norm": 0.885481222691971, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 21361 + }, + { + "epoch": 0.21362, + "grad_norm": 1.0102925759314678, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 21362 + }, + { + "epoch": 0.21363, + "grad_norm": 1.0928436872355154, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 21363 + }, + { + "epoch": 0.21364, + "grad_norm": 0.8767967390102701, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 21364 + }, + { + "epoch": 0.21365, + "grad_norm": 0.6746446662770155, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 21365 + }, + { + "epoch": 0.21366, + "grad_norm": 0.6755843158706554, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 21366 + }, + { + "epoch": 0.21367, + "grad_norm": 0.8169999232168187, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 21367 + }, + { + "epoch": 0.21368, + "grad_norm": 0.9405999873733871, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 21368 + }, + { + "epoch": 0.21369, + "grad_norm": 0.9740985947277164, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 21369 + }, + { + "epoch": 0.2137, + "grad_norm": 1.049346979694071, + "learning_rate": 0.003, + "loss": 4.0995, + "step": 21370 + }, + { + "epoch": 0.21371, + "grad_norm": 1.0640850585130066, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 21371 + }, + { + "epoch": 0.21372, + "grad_norm": 0.957192180995113, + "learning_rate": 0.003, + "loss": 4.0916, + "step": 21372 + }, + { + "epoch": 0.21373, + "grad_norm": 1.1119459871247441, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 21373 + }, + { + "epoch": 0.21374, + "grad_norm": 1.0553609199919132, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 21374 + }, + { + "epoch": 0.21375, + "grad_norm": 0.905381816938977, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 21375 + }, + { + "epoch": 0.21376, + "grad_norm": 0.9007522996477779, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21376 + }, + { + "epoch": 0.21377, + "grad_norm": 0.9671815182500294, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 21377 + }, + { + "epoch": 0.21378, + "grad_norm": 0.9794147638607366, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 21378 + }, + { + "epoch": 0.21379, + "grad_norm": 1.0823450455423465, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 21379 + }, + { + "epoch": 0.2138, + "grad_norm": 1.0776837298983957, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 21380 + }, + { + "epoch": 0.21381, + "grad_norm": 1.0550059105757812, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 21381 + }, + { + "epoch": 0.21382, + "grad_norm": 0.9060408240886062, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 21382 + }, + { + "epoch": 0.21383, + "grad_norm": 0.9105263545879362, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 21383 + }, + { + "epoch": 0.21384, + "grad_norm": 0.9325095562625706, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21384 + }, + { + "epoch": 0.21385, + "grad_norm": 0.9105796920224166, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 21385 + }, + { + "epoch": 0.21386, + "grad_norm": 1.0527195723375458, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21386 + }, + { + "epoch": 0.21387, + "grad_norm": 0.8948238423275834, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 21387 + }, + { + "epoch": 0.21388, + "grad_norm": 0.800240153649954, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 21388 + }, + { + "epoch": 0.21389, + "grad_norm": 0.8274793267588422, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 21389 + }, + { + "epoch": 0.2139, + "grad_norm": 0.6110452753450674, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 21390 + }, + { + "epoch": 0.21391, + "grad_norm": 0.5946604858150054, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 21391 + }, + { + "epoch": 0.21392, + "grad_norm": 0.6960747948357103, + "learning_rate": 0.003, + "loss": 4.05, + "step": 21392 + }, + { + "epoch": 0.21393, + "grad_norm": 0.7462711901382988, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 21393 + }, + { + "epoch": 0.21394, + "grad_norm": 0.733471496702728, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 21394 + }, + { + "epoch": 0.21395, + "grad_norm": 0.7577502191514479, + "learning_rate": 0.003, + "loss": 4.049, + "step": 21395 + }, + { + "epoch": 0.21396, + "grad_norm": 0.840421108050357, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 21396 + }, + { + "epoch": 0.21397, + "grad_norm": 0.818519133768956, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21397 + }, + { + "epoch": 0.21398, + "grad_norm": 0.8779364785315508, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21398 + }, + { + "epoch": 0.21399, + "grad_norm": 1.059892231131895, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 21399 + }, + { + "epoch": 0.214, + "grad_norm": 1.1741248691709623, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 21400 + }, + { + "epoch": 0.21401, + "grad_norm": 0.8686644036094342, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 21401 + }, + { + "epoch": 0.21402, + "grad_norm": 0.8992972114790656, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21402 + }, + { + "epoch": 0.21403, + "grad_norm": 0.8911434120168006, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 21403 + }, + { + "epoch": 0.21404, + "grad_norm": 0.7706015238868186, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 21404 + }, + { + "epoch": 0.21405, + "grad_norm": 0.7255141803490246, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 21405 + }, + { + "epoch": 0.21406, + "grad_norm": 0.7647138842330985, + "learning_rate": 0.003, + "loss": 4.032, + "step": 21406 + }, + { + "epoch": 0.21407, + "grad_norm": 0.7389923966534094, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21407 + }, + { + "epoch": 0.21408, + "grad_norm": 0.7087570950921596, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 21408 + }, + { + "epoch": 0.21409, + "grad_norm": 0.6780210754694448, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 21409 + }, + { + "epoch": 0.2141, + "grad_norm": 0.876655044563196, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 21410 + }, + { + "epoch": 0.21411, + "grad_norm": 0.9919748322359662, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 21411 + }, + { + "epoch": 0.21412, + "grad_norm": 1.2636542450740191, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 21412 + }, + { + "epoch": 0.21413, + "grad_norm": 0.7131346267936488, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 21413 + }, + { + "epoch": 0.21414, + "grad_norm": 0.6322143293499282, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 21414 + }, + { + "epoch": 0.21415, + "grad_norm": 0.7819501573375415, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 21415 + }, + { + "epoch": 0.21416, + "grad_norm": 0.9014886808134205, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 21416 + }, + { + "epoch": 0.21417, + "grad_norm": 1.2190233411001883, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 21417 + }, + { + "epoch": 0.21418, + "grad_norm": 0.8659790766710892, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 21418 + }, + { + "epoch": 0.21419, + "grad_norm": 0.8135526653004657, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21419 + }, + { + "epoch": 0.2142, + "grad_norm": 0.8568164621978299, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 21420 + }, + { + "epoch": 0.21421, + "grad_norm": 0.8490283973014067, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 21421 + }, + { + "epoch": 0.21422, + "grad_norm": 0.8670183618441724, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 21422 + }, + { + "epoch": 0.21423, + "grad_norm": 0.8008910809403382, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21423 + }, + { + "epoch": 0.21424, + "grad_norm": 0.731955970182008, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21424 + }, + { + "epoch": 0.21425, + "grad_norm": 0.6740513560649253, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 21425 + }, + { + "epoch": 0.21426, + "grad_norm": 0.7676704833964977, + "learning_rate": 0.003, + "loss": 4.072, + "step": 21426 + }, + { + "epoch": 0.21427, + "grad_norm": 0.97995262106898, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 21427 + }, + { + "epoch": 0.21428, + "grad_norm": 1.3215832951005382, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 21428 + }, + { + "epoch": 0.21429, + "grad_norm": 0.9560479202046296, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 21429 + }, + { + "epoch": 0.2143, + "grad_norm": 1.0841098992244047, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 21430 + }, + { + "epoch": 0.21431, + "grad_norm": 1.055062025591932, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 21431 + }, + { + "epoch": 0.21432, + "grad_norm": 1.024259057986223, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 21432 + }, + { + "epoch": 0.21433, + "grad_norm": 0.8386995142459611, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 21433 + }, + { + "epoch": 0.21434, + "grad_norm": 0.8696884076101957, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 21434 + }, + { + "epoch": 0.21435, + "grad_norm": 0.8539816541452752, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 21435 + }, + { + "epoch": 0.21436, + "grad_norm": 0.8098803658264432, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 21436 + }, + { + "epoch": 0.21437, + "grad_norm": 0.9616615687849573, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 21437 + }, + { + "epoch": 0.21438, + "grad_norm": 1.1090659610752636, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 21438 + }, + { + "epoch": 0.21439, + "grad_norm": 0.8833350970630555, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 21439 + }, + { + "epoch": 0.2144, + "grad_norm": 0.9382105328542503, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 21440 + }, + { + "epoch": 0.21441, + "grad_norm": 1.0334773875653411, + "learning_rate": 0.003, + "loss": 4.043, + "step": 21441 + }, + { + "epoch": 0.21442, + "grad_norm": 1.0881440523884154, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 21442 + }, + { + "epoch": 0.21443, + "grad_norm": 1.0229790824041554, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 21443 + }, + { + "epoch": 0.21444, + "grad_norm": 0.8141416583645512, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 21444 + }, + { + "epoch": 0.21445, + "grad_norm": 0.7861323514029146, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 21445 + }, + { + "epoch": 0.21446, + "grad_norm": 0.810458938640283, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 21446 + }, + { + "epoch": 0.21447, + "grad_norm": 1.0473292887194876, + "learning_rate": 0.003, + "loss": 4.035, + "step": 21447 + }, + { + "epoch": 0.21448, + "grad_norm": 1.2288446034500558, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 21448 + }, + { + "epoch": 0.21449, + "grad_norm": 0.8277477834621442, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 21449 + }, + { + "epoch": 0.2145, + "grad_norm": 0.7543736141945366, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 21450 + }, + { + "epoch": 0.21451, + "grad_norm": 0.7154744939626335, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 21451 + }, + { + "epoch": 0.21452, + "grad_norm": 0.7533594485150341, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21452 + }, + { + "epoch": 0.21453, + "grad_norm": 0.8395348351440081, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 21453 + }, + { + "epoch": 0.21454, + "grad_norm": 0.8361094780878154, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21454 + }, + { + "epoch": 0.21455, + "grad_norm": 0.8511347730897709, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21455 + }, + { + "epoch": 0.21456, + "grad_norm": 0.8775439946918151, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 21456 + }, + { + "epoch": 0.21457, + "grad_norm": 0.827383981504461, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 21457 + }, + { + "epoch": 0.21458, + "grad_norm": 1.112083865598023, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 21458 + }, + { + "epoch": 0.21459, + "grad_norm": 1.1616812671504075, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 21459 + }, + { + "epoch": 0.2146, + "grad_norm": 0.8078230457548922, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 21460 + }, + { + "epoch": 0.21461, + "grad_norm": 0.7022166266473627, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 21461 + }, + { + "epoch": 0.21462, + "grad_norm": 0.7230187039787708, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 21462 + }, + { + "epoch": 0.21463, + "grad_norm": 0.8674659440770295, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 21463 + }, + { + "epoch": 0.21464, + "grad_norm": 1.0916279614881734, + "learning_rate": 0.003, + "loss": 4.076, + "step": 21464 + }, + { + "epoch": 0.21465, + "grad_norm": 0.9465329070055324, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 21465 + }, + { + "epoch": 0.21466, + "grad_norm": 0.9964260038663965, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 21466 + }, + { + "epoch": 0.21467, + "grad_norm": 1.1221401265737272, + "learning_rate": 0.003, + "loss": 4.083, + "step": 21467 + }, + { + "epoch": 0.21468, + "grad_norm": 0.8291025254374909, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 21468 + }, + { + "epoch": 0.21469, + "grad_norm": 0.8477495422477623, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 21469 + }, + { + "epoch": 0.2147, + "grad_norm": 0.8807154171946066, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 21470 + }, + { + "epoch": 0.21471, + "grad_norm": 0.9242952518551273, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 21471 + }, + { + "epoch": 0.21472, + "grad_norm": 0.8877809205405255, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 21472 + }, + { + "epoch": 0.21473, + "grad_norm": 0.8806846354548536, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 21473 + }, + { + "epoch": 0.21474, + "grad_norm": 0.8608948945127014, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 21474 + }, + { + "epoch": 0.21475, + "grad_norm": 0.7386160940354448, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 21475 + }, + { + "epoch": 0.21476, + "grad_norm": 0.8520807385198372, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 21476 + }, + { + "epoch": 0.21477, + "grad_norm": 0.9312350048914875, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 21477 + }, + { + "epoch": 0.21478, + "grad_norm": 1.0285513080363073, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 21478 + }, + { + "epoch": 0.21479, + "grad_norm": 1.1856745324025493, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 21479 + }, + { + "epoch": 0.2148, + "grad_norm": 1.0755530879024613, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 21480 + }, + { + "epoch": 0.21481, + "grad_norm": 0.9067248080300517, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 21481 + }, + { + "epoch": 0.21482, + "grad_norm": 0.9503152864041258, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 21482 + }, + { + "epoch": 0.21483, + "grad_norm": 1.0586304737700027, + "learning_rate": 0.003, + "loss": 4.065, + "step": 21483 + }, + { + "epoch": 0.21484, + "grad_norm": 0.9660453492246902, + "learning_rate": 0.003, + "loss": 4.072, + "step": 21484 + }, + { + "epoch": 0.21485, + "grad_norm": 0.8018934200619793, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21485 + }, + { + "epoch": 0.21486, + "grad_norm": 0.6957883514573063, + "learning_rate": 0.003, + "loss": 4.081, + "step": 21486 + }, + { + "epoch": 0.21487, + "grad_norm": 0.6737104591167143, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 21487 + }, + { + "epoch": 0.21488, + "grad_norm": 0.6925581284879242, + "learning_rate": 0.003, + "loss": 4.0887, + "step": 21488 + }, + { + "epoch": 0.21489, + "grad_norm": 0.7557232532101538, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 21489 + }, + { + "epoch": 0.2149, + "grad_norm": 0.8963622020095259, + "learning_rate": 0.003, + "loss": 4.059, + "step": 21490 + }, + { + "epoch": 0.21491, + "grad_norm": 1.102121592063994, + "learning_rate": 0.003, + "loss": 4.1062, + "step": 21491 + }, + { + "epoch": 0.21492, + "grad_norm": 0.7762725138771334, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 21492 + }, + { + "epoch": 0.21493, + "grad_norm": 0.7680256194872983, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 21493 + }, + { + "epoch": 0.21494, + "grad_norm": 0.8702451922180889, + "learning_rate": 0.003, + "loss": 4.1005, + "step": 21494 + }, + { + "epoch": 0.21495, + "grad_norm": 0.848564649327761, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 21495 + }, + { + "epoch": 0.21496, + "grad_norm": 0.9102315469982839, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 21496 + }, + { + "epoch": 0.21497, + "grad_norm": 1.0822172076151446, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 21497 + }, + { + "epoch": 0.21498, + "grad_norm": 1.0227927189533406, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 21498 + }, + { + "epoch": 0.21499, + "grad_norm": 0.9203883246793417, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 21499 + }, + { + "epoch": 0.215, + "grad_norm": 0.9763742479663114, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 21500 + }, + { + "epoch": 0.21501, + "grad_norm": 0.9834431876388631, + "learning_rate": 0.003, + "loss": 4.085, + "step": 21501 + }, + { + "epoch": 0.21502, + "grad_norm": 1.1189385906350726, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 21502 + }, + { + "epoch": 0.21503, + "grad_norm": 0.9885298973184874, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21503 + }, + { + "epoch": 0.21504, + "grad_norm": 1.128632283203252, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21504 + }, + { + "epoch": 0.21505, + "grad_norm": 0.9284090737407498, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 21505 + }, + { + "epoch": 0.21506, + "grad_norm": 0.8118828230908535, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 21506 + }, + { + "epoch": 0.21507, + "grad_norm": 0.6797074208566537, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 21507 + }, + { + "epoch": 0.21508, + "grad_norm": 0.7031775790983548, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21508 + }, + { + "epoch": 0.21509, + "grad_norm": 0.7191577393702786, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 21509 + }, + { + "epoch": 0.2151, + "grad_norm": 0.700204199089381, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21510 + }, + { + "epoch": 0.21511, + "grad_norm": 0.6202389829836995, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 21511 + }, + { + "epoch": 0.21512, + "grad_norm": 0.5687590389877023, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 21512 + }, + { + "epoch": 0.21513, + "grad_norm": 0.5963252648663018, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 21513 + }, + { + "epoch": 0.21514, + "grad_norm": 0.586654520117425, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 21514 + }, + { + "epoch": 0.21515, + "grad_norm": 0.637903565746543, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 21515 + }, + { + "epoch": 0.21516, + "grad_norm": 0.701677471648405, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 21516 + }, + { + "epoch": 0.21517, + "grad_norm": 0.822810744954545, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 21517 + }, + { + "epoch": 0.21518, + "grad_norm": 1.1199938157365958, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 21518 + }, + { + "epoch": 0.21519, + "grad_norm": 1.3110957899877198, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 21519 + }, + { + "epoch": 0.2152, + "grad_norm": 0.6399605418274914, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 21520 + }, + { + "epoch": 0.21521, + "grad_norm": 0.701215838253175, + "learning_rate": 0.003, + "loss": 4.046, + "step": 21521 + }, + { + "epoch": 0.21522, + "grad_norm": 0.785436350636527, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 21522 + }, + { + "epoch": 0.21523, + "grad_norm": 0.8871016659784559, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 21523 + }, + { + "epoch": 0.21524, + "grad_norm": 0.8941368839002816, + "learning_rate": 0.003, + "loss": 4.064, + "step": 21524 + }, + { + "epoch": 0.21525, + "grad_norm": 0.9401662392758682, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 21525 + }, + { + "epoch": 0.21526, + "grad_norm": 0.9725199864135584, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 21526 + }, + { + "epoch": 0.21527, + "grad_norm": 1.0258967049026004, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 21527 + }, + { + "epoch": 0.21528, + "grad_norm": 0.9167330372871186, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 21528 + }, + { + "epoch": 0.21529, + "grad_norm": 0.9036623471506209, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21529 + }, + { + "epoch": 0.2153, + "grad_norm": 0.9834314313741844, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 21530 + }, + { + "epoch": 0.21531, + "grad_norm": 0.9625753642740854, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 21531 + }, + { + "epoch": 0.21532, + "grad_norm": 0.9023678816797029, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 21532 + }, + { + "epoch": 0.21533, + "grad_norm": 0.8407633518286873, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 21533 + }, + { + "epoch": 0.21534, + "grad_norm": 0.9129817559963305, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 21534 + }, + { + "epoch": 0.21535, + "grad_norm": 1.256952888320241, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 21535 + }, + { + "epoch": 0.21536, + "grad_norm": 0.9992239122319129, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 21536 + }, + { + "epoch": 0.21537, + "grad_norm": 0.8984541090414525, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 21537 + }, + { + "epoch": 0.21538, + "grad_norm": 0.7979672264163751, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 21538 + }, + { + "epoch": 0.21539, + "grad_norm": 0.7996716650769474, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 21539 + }, + { + "epoch": 0.2154, + "grad_norm": 0.9718535858845164, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 21540 + }, + { + "epoch": 0.21541, + "grad_norm": 1.1305992578210367, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 21541 + }, + { + "epoch": 0.21542, + "grad_norm": 1.2332343700140111, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 21542 + }, + { + "epoch": 0.21543, + "grad_norm": 0.7642761200892136, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 21543 + }, + { + "epoch": 0.21544, + "grad_norm": 0.6267605744638748, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 21544 + }, + { + "epoch": 0.21545, + "grad_norm": 0.6889176470588887, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21545 + }, + { + "epoch": 0.21546, + "grad_norm": 0.7668689788665622, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 21546 + }, + { + "epoch": 0.21547, + "grad_norm": 0.7800768554991903, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 21547 + }, + { + "epoch": 0.21548, + "grad_norm": 0.8047459837679152, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 21548 + }, + { + "epoch": 0.21549, + "grad_norm": 0.8408625066215695, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 21549 + }, + { + "epoch": 0.2155, + "grad_norm": 0.8056977882994107, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21550 + }, + { + "epoch": 0.21551, + "grad_norm": 0.7898326719010477, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 21551 + }, + { + "epoch": 0.21552, + "grad_norm": 0.7888800636508843, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 21552 + }, + { + "epoch": 0.21553, + "grad_norm": 0.863964246220263, + "learning_rate": 0.003, + "loss": 4.04, + "step": 21553 + }, + { + "epoch": 0.21554, + "grad_norm": 0.9936442170022771, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 21554 + }, + { + "epoch": 0.21555, + "grad_norm": 1.0209067339858764, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 21555 + }, + { + "epoch": 0.21556, + "grad_norm": 0.918394813071859, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 21556 + }, + { + "epoch": 0.21557, + "grad_norm": 0.841641707812552, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 21557 + }, + { + "epoch": 0.21558, + "grad_norm": 0.8963472011396417, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 21558 + }, + { + "epoch": 0.21559, + "grad_norm": 0.8973791510759853, + "learning_rate": 0.003, + "loss": 4.087, + "step": 21559 + }, + { + "epoch": 0.2156, + "grad_norm": 1.073379763863376, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 21560 + }, + { + "epoch": 0.21561, + "grad_norm": 1.1069851196346872, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 21561 + }, + { + "epoch": 0.21562, + "grad_norm": 1.1147197803258264, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 21562 + }, + { + "epoch": 0.21563, + "grad_norm": 0.8682319839410553, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 21563 + }, + { + "epoch": 0.21564, + "grad_norm": 0.7881534960003074, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 21564 + }, + { + "epoch": 0.21565, + "grad_norm": 0.7243852866001149, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 21565 + }, + { + "epoch": 0.21566, + "grad_norm": 0.8044460801035058, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 21566 + }, + { + "epoch": 0.21567, + "grad_norm": 0.9610971237661414, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 21567 + }, + { + "epoch": 0.21568, + "grad_norm": 1.0299650809931202, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 21568 + }, + { + "epoch": 0.21569, + "grad_norm": 1.1203253930132897, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21569 + }, + { + "epoch": 0.2157, + "grad_norm": 0.8847565450160156, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 21570 + }, + { + "epoch": 0.21571, + "grad_norm": 0.7899478323008584, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 21571 + }, + { + "epoch": 0.21572, + "grad_norm": 0.7697418053044778, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 21572 + }, + { + "epoch": 0.21573, + "grad_norm": 0.6961944058455329, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 21573 + }, + { + "epoch": 0.21574, + "grad_norm": 0.7714756346724448, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 21574 + }, + { + "epoch": 0.21575, + "grad_norm": 0.808352357277223, + "learning_rate": 0.003, + "loss": 4.063, + "step": 21575 + }, + { + "epoch": 0.21576, + "grad_norm": 0.8327116352852698, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 21576 + }, + { + "epoch": 0.21577, + "grad_norm": 0.9962676451452657, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 21577 + }, + { + "epoch": 0.21578, + "grad_norm": 1.2953923746605633, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 21578 + }, + { + "epoch": 0.21579, + "grad_norm": 0.69211197211348, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 21579 + }, + { + "epoch": 0.2158, + "grad_norm": 0.7359911437887672, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 21580 + }, + { + "epoch": 0.21581, + "grad_norm": 0.9178731069503201, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 21581 + }, + { + "epoch": 0.21582, + "grad_norm": 1.0667355843827313, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 21582 + }, + { + "epoch": 0.21583, + "grad_norm": 1.1037629429987468, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 21583 + }, + { + "epoch": 0.21584, + "grad_norm": 1.0563403491958434, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 21584 + }, + { + "epoch": 0.21585, + "grad_norm": 1.066679010727024, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 21585 + }, + { + "epoch": 0.21586, + "grad_norm": 1.04442020629008, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 21586 + }, + { + "epoch": 0.21587, + "grad_norm": 0.8937336884621242, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 21587 + }, + { + "epoch": 0.21588, + "grad_norm": 0.881480218029145, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21588 + }, + { + "epoch": 0.21589, + "grad_norm": 0.747716770035972, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 21589 + }, + { + "epoch": 0.2159, + "grad_norm": 0.7978472509505494, + "learning_rate": 0.003, + "loss": 4.043, + "step": 21590 + }, + { + "epoch": 0.21591, + "grad_norm": 0.7609638617708171, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 21591 + }, + { + "epoch": 0.21592, + "grad_norm": 0.8658713023705344, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21592 + }, + { + "epoch": 0.21593, + "grad_norm": 0.9922656287287943, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 21593 + }, + { + "epoch": 0.21594, + "grad_norm": 1.0059345342528614, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 21594 + }, + { + "epoch": 0.21595, + "grad_norm": 1.1250739057614159, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 21595 + }, + { + "epoch": 0.21596, + "grad_norm": 0.8976772724944533, + "learning_rate": 0.003, + "loss": 4.042, + "step": 21596 + }, + { + "epoch": 0.21597, + "grad_norm": 0.9098047129674696, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 21597 + }, + { + "epoch": 0.21598, + "grad_norm": 1.0585225519338008, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 21598 + }, + { + "epoch": 0.21599, + "grad_norm": 1.1396511428165137, + "learning_rate": 0.003, + "loss": 4.057, + "step": 21599 + }, + { + "epoch": 0.216, + "grad_norm": 0.8774084028092378, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 21600 + }, + { + "epoch": 0.21601, + "grad_norm": 0.8939184951426141, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 21601 + }, + { + "epoch": 0.21602, + "grad_norm": 0.811550140556642, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 21602 + }, + { + "epoch": 0.21603, + "grad_norm": 0.6851775951282392, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 21603 + }, + { + "epoch": 0.21604, + "grad_norm": 0.7080152332646765, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 21604 + }, + { + "epoch": 0.21605, + "grad_norm": 0.7462907969939713, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 21605 + }, + { + "epoch": 0.21606, + "grad_norm": 0.7348892141682789, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 21606 + }, + { + "epoch": 0.21607, + "grad_norm": 0.9285085750475869, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 21607 + }, + { + "epoch": 0.21608, + "grad_norm": 1.2740961615973008, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 21608 + }, + { + "epoch": 0.21609, + "grad_norm": 0.8588938489802174, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 21609 + }, + { + "epoch": 0.2161, + "grad_norm": 0.8117988589784464, + "learning_rate": 0.003, + "loss": 4.037, + "step": 21610 + }, + { + "epoch": 0.21611, + "grad_norm": 0.8068452726889168, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21611 + }, + { + "epoch": 0.21612, + "grad_norm": 0.7470045278346455, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21612 + }, + { + "epoch": 0.21613, + "grad_norm": 0.7622859900911136, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 21613 + }, + { + "epoch": 0.21614, + "grad_norm": 0.887045992715797, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 21614 + }, + { + "epoch": 0.21615, + "grad_norm": 0.9104412327953356, + "learning_rate": 0.003, + "loss": 4.061, + "step": 21615 + }, + { + "epoch": 0.21616, + "grad_norm": 0.8559730237146324, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 21616 + }, + { + "epoch": 0.21617, + "grad_norm": 1.0582391420907316, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 21617 + }, + { + "epoch": 0.21618, + "grad_norm": 1.0484971344165914, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 21618 + }, + { + "epoch": 0.21619, + "grad_norm": 1.1274502375560465, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 21619 + }, + { + "epoch": 0.2162, + "grad_norm": 1.1224225085747928, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 21620 + }, + { + "epoch": 0.21621, + "grad_norm": 0.9141013533787247, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 21621 + }, + { + "epoch": 0.21622, + "grad_norm": 0.8814419925223598, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 21622 + }, + { + "epoch": 0.21623, + "grad_norm": 0.8747538403440693, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 21623 + }, + { + "epoch": 0.21624, + "grad_norm": 0.9642456331289045, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 21624 + }, + { + "epoch": 0.21625, + "grad_norm": 1.0237967212268972, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 21625 + }, + { + "epoch": 0.21626, + "grad_norm": 0.8914926550164441, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 21626 + }, + { + "epoch": 0.21627, + "grad_norm": 0.8911546461704158, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 21627 + }, + { + "epoch": 0.21628, + "grad_norm": 0.8773831700484963, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 21628 + }, + { + "epoch": 0.21629, + "grad_norm": 0.7283344669390761, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 21629 + }, + { + "epoch": 0.2163, + "grad_norm": 0.6710599648361194, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 21630 + }, + { + "epoch": 0.21631, + "grad_norm": 0.6924432102416787, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 21631 + }, + { + "epoch": 0.21632, + "grad_norm": 0.7931314253636245, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 21632 + }, + { + "epoch": 0.21633, + "grad_norm": 1.0012851697138478, + "learning_rate": 0.003, + "loss": 4.076, + "step": 21633 + }, + { + "epoch": 0.21634, + "grad_norm": 1.1844095005818749, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 21634 + }, + { + "epoch": 0.21635, + "grad_norm": 0.8939170965391628, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 21635 + }, + { + "epoch": 0.21636, + "grad_norm": 0.8386339167110952, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 21636 + }, + { + "epoch": 0.21637, + "grad_norm": 0.9406366770347894, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 21637 + }, + { + "epoch": 0.21638, + "grad_norm": 1.0056696953763926, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 21638 + }, + { + "epoch": 0.21639, + "grad_norm": 1.1459665315558933, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 21639 + }, + { + "epoch": 0.2164, + "grad_norm": 1.2879038591592724, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 21640 + }, + { + "epoch": 0.21641, + "grad_norm": 0.7754210785300859, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 21641 + }, + { + "epoch": 0.21642, + "grad_norm": 0.7576668442682193, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21642 + }, + { + "epoch": 0.21643, + "grad_norm": 0.8291307354555252, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 21643 + }, + { + "epoch": 0.21644, + "grad_norm": 0.9440496609742046, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21644 + }, + { + "epoch": 0.21645, + "grad_norm": 1.057683259501635, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 21645 + }, + { + "epoch": 0.21646, + "grad_norm": 0.965074045398963, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 21646 + }, + { + "epoch": 0.21647, + "grad_norm": 1.0388419001739948, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 21647 + }, + { + "epoch": 0.21648, + "grad_norm": 0.827878802746287, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 21648 + }, + { + "epoch": 0.21649, + "grad_norm": 0.7772531968996987, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 21649 + }, + { + "epoch": 0.2165, + "grad_norm": 0.9006617238650141, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 21650 + }, + { + "epoch": 0.21651, + "grad_norm": 1.1326992391189854, + "learning_rate": 0.003, + "loss": 4.068, + "step": 21651 + }, + { + "epoch": 0.21652, + "grad_norm": 0.927373925624323, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 21652 + }, + { + "epoch": 0.21653, + "grad_norm": 0.8504422081880315, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21653 + }, + { + "epoch": 0.21654, + "grad_norm": 0.9677334091580314, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 21654 + }, + { + "epoch": 0.21655, + "grad_norm": 1.0793538904197635, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 21655 + }, + { + "epoch": 0.21656, + "grad_norm": 0.9933384589835113, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 21656 + }, + { + "epoch": 0.21657, + "grad_norm": 1.0372905967029282, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 21657 + }, + { + "epoch": 0.21658, + "grad_norm": 0.9473393589102788, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 21658 + }, + { + "epoch": 0.21659, + "grad_norm": 0.890156533040317, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 21659 + }, + { + "epoch": 0.2166, + "grad_norm": 0.8387443019839843, + "learning_rate": 0.003, + "loss": 4.052, + "step": 21660 + }, + { + "epoch": 0.21661, + "grad_norm": 0.9150766634331974, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 21661 + }, + { + "epoch": 0.21662, + "grad_norm": 1.0447368883019383, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21662 + }, + { + "epoch": 0.21663, + "grad_norm": 0.8275767379179872, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 21663 + }, + { + "epoch": 0.21664, + "grad_norm": 0.9026190988602113, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 21664 + }, + { + "epoch": 0.21665, + "grad_norm": 0.9404827236433632, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 21665 + }, + { + "epoch": 0.21666, + "grad_norm": 1.0903495019012468, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 21666 + }, + { + "epoch": 0.21667, + "grad_norm": 1.0306961696004193, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 21667 + }, + { + "epoch": 0.21668, + "grad_norm": 0.9359899726193733, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 21668 + }, + { + "epoch": 0.21669, + "grad_norm": 0.9787413435144297, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 21669 + }, + { + "epoch": 0.2167, + "grad_norm": 0.9075609465237993, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 21670 + }, + { + "epoch": 0.21671, + "grad_norm": 0.910603955161684, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 21671 + }, + { + "epoch": 0.21672, + "grad_norm": 0.9974863409441619, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 21672 + }, + { + "epoch": 0.21673, + "grad_norm": 1.02372915997606, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 21673 + }, + { + "epoch": 0.21674, + "grad_norm": 0.9040700085483786, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 21674 + }, + { + "epoch": 0.21675, + "grad_norm": 0.839284094566396, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 21675 + }, + { + "epoch": 0.21676, + "grad_norm": 0.9456537268387836, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 21676 + }, + { + "epoch": 0.21677, + "grad_norm": 1.081206362503851, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 21677 + }, + { + "epoch": 0.21678, + "grad_norm": 1.120256209501071, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 21678 + }, + { + "epoch": 0.21679, + "grad_norm": 0.8484967980627793, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 21679 + }, + { + "epoch": 0.2168, + "grad_norm": 0.6274580468827639, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 21680 + }, + { + "epoch": 0.21681, + "grad_norm": 0.7160161315521397, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 21681 + }, + { + "epoch": 0.21682, + "grad_norm": 0.796191190119524, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 21682 + }, + { + "epoch": 0.21683, + "grad_norm": 0.906860065669769, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 21683 + }, + { + "epoch": 0.21684, + "grad_norm": 1.2389275850507053, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 21684 + }, + { + "epoch": 0.21685, + "grad_norm": 0.8667301294930585, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 21685 + }, + { + "epoch": 0.21686, + "grad_norm": 0.6864862165747931, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 21686 + }, + { + "epoch": 0.21687, + "grad_norm": 0.6752751222202794, + "learning_rate": 0.003, + "loss": 4.026, + "step": 21687 + }, + { + "epoch": 0.21688, + "grad_norm": 0.6393432157461432, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 21688 + }, + { + "epoch": 0.21689, + "grad_norm": 0.7213172548442143, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 21689 + }, + { + "epoch": 0.2169, + "grad_norm": 0.8068614310843161, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 21690 + }, + { + "epoch": 0.21691, + "grad_norm": 0.9735663056605472, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 21691 + }, + { + "epoch": 0.21692, + "grad_norm": 1.1510242492447276, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 21692 + }, + { + "epoch": 0.21693, + "grad_norm": 0.9088102727713627, + "learning_rate": 0.003, + "loss": 4.045, + "step": 21693 + }, + { + "epoch": 0.21694, + "grad_norm": 0.8790352573993063, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 21694 + }, + { + "epoch": 0.21695, + "grad_norm": 0.7258250167528519, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 21695 + }, + { + "epoch": 0.21696, + "grad_norm": 0.7566657780636684, + "learning_rate": 0.003, + "loss": 4.054, + "step": 21696 + }, + { + "epoch": 0.21697, + "grad_norm": 0.7481652075474815, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 21697 + }, + { + "epoch": 0.21698, + "grad_norm": 0.7995079503595919, + "learning_rate": 0.003, + "loss": 4.0001, + "step": 21698 + }, + { + "epoch": 0.21699, + "grad_norm": 0.9221970398932061, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 21699 + }, + { + "epoch": 0.217, + "grad_norm": 1.089530519639171, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 21700 + }, + { + "epoch": 0.21701, + "grad_norm": 0.9849748865019732, + "learning_rate": 0.003, + "loss": 4.07, + "step": 21701 + }, + { + "epoch": 0.21702, + "grad_norm": 1.0916368914461505, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 21702 + }, + { + "epoch": 0.21703, + "grad_norm": 0.9190329057858709, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 21703 + }, + { + "epoch": 0.21704, + "grad_norm": 0.9359031951464947, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 21704 + }, + { + "epoch": 0.21705, + "grad_norm": 0.8552540974942002, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 21705 + }, + { + "epoch": 0.21706, + "grad_norm": 0.8625463549116862, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 21706 + }, + { + "epoch": 0.21707, + "grad_norm": 0.7850073924414639, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 21707 + }, + { + "epoch": 0.21708, + "grad_norm": 0.7053690556535402, + "learning_rate": 0.003, + "loss": 4.042, + "step": 21708 + }, + { + "epoch": 0.21709, + "grad_norm": 0.679455169939329, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 21709 + }, + { + "epoch": 0.2171, + "grad_norm": 0.6063083427327239, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 21710 + }, + { + "epoch": 0.21711, + "grad_norm": 0.6042870558918229, + "learning_rate": 0.003, + "loss": 4.043, + "step": 21711 + }, + { + "epoch": 0.21712, + "grad_norm": 0.5876597030766894, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 21712 + }, + { + "epoch": 0.21713, + "grad_norm": 0.7139788375727066, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 21713 + }, + { + "epoch": 0.21714, + "grad_norm": 0.8774489981343453, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 21714 + }, + { + "epoch": 0.21715, + "grad_norm": 1.0956251394978795, + "learning_rate": 0.003, + "loss": 4.051, + "step": 21715 + }, + { + "epoch": 0.21716, + "grad_norm": 1.1323142548228589, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 21716 + }, + { + "epoch": 0.21717, + "grad_norm": 0.855765529664438, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 21717 + }, + { + "epoch": 0.21718, + "grad_norm": 0.7862372621629845, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 21718 + }, + { + "epoch": 0.21719, + "grad_norm": 0.8765073174328232, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 21719 + }, + { + "epoch": 0.2172, + "grad_norm": 1.0701874298564344, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 21720 + }, + { + "epoch": 0.21721, + "grad_norm": 1.003207493277841, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21721 + }, + { + "epoch": 0.21722, + "grad_norm": 1.0098106141529626, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 21722 + }, + { + "epoch": 0.21723, + "grad_norm": 1.2630803083497524, + "learning_rate": 0.003, + "loss": 4.062, + "step": 21723 + }, + { + "epoch": 0.21724, + "grad_norm": 0.8200706429950618, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 21724 + }, + { + "epoch": 0.21725, + "grad_norm": 0.7808013269229824, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 21725 + }, + { + "epoch": 0.21726, + "grad_norm": 0.8076121987238968, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 21726 + }, + { + "epoch": 0.21727, + "grad_norm": 0.9340666616098632, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 21727 + }, + { + "epoch": 0.21728, + "grad_norm": 0.9423194360069064, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 21728 + }, + { + "epoch": 0.21729, + "grad_norm": 1.0503006535625348, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 21729 + }, + { + "epoch": 0.2173, + "grad_norm": 0.8724642195578092, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 21730 + }, + { + "epoch": 0.21731, + "grad_norm": 0.968236584958818, + "learning_rate": 0.003, + "loss": 4.1054, + "step": 21731 + }, + { + "epoch": 0.21732, + "grad_norm": 0.8207492422821328, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 21732 + }, + { + "epoch": 0.21733, + "grad_norm": 0.8437198040993463, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 21733 + }, + { + "epoch": 0.21734, + "grad_norm": 0.9601208917283135, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 21734 + }, + { + "epoch": 0.21735, + "grad_norm": 1.1337011536669632, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 21735 + }, + { + "epoch": 0.21736, + "grad_norm": 1.124063963359658, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 21736 + }, + { + "epoch": 0.21737, + "grad_norm": 1.0449466232202593, + "learning_rate": 0.003, + "loss": 4.039, + "step": 21737 + }, + { + "epoch": 0.21738, + "grad_norm": 1.0890919556090963, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 21738 + }, + { + "epoch": 0.21739, + "grad_norm": 0.9816031771809035, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 21739 + }, + { + "epoch": 0.2174, + "grad_norm": 0.9426841605446407, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 21740 + }, + { + "epoch": 0.21741, + "grad_norm": 0.9543811669691511, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 21741 + }, + { + "epoch": 0.21742, + "grad_norm": 0.906755483117176, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 21742 + }, + { + "epoch": 0.21743, + "grad_norm": 0.7299678634687914, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 21743 + }, + { + "epoch": 0.21744, + "grad_norm": 0.7287868992837896, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 21744 + }, + { + "epoch": 0.21745, + "grad_norm": 0.8056391306239338, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 21745 + }, + { + "epoch": 0.21746, + "grad_norm": 0.8901162736036207, + "learning_rate": 0.003, + "loss": 4.044, + "step": 21746 + }, + { + "epoch": 0.21747, + "grad_norm": 0.8438633989529243, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 21747 + }, + { + "epoch": 0.21748, + "grad_norm": 0.7606066494856811, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 21748 + }, + { + "epoch": 0.21749, + "grad_norm": 0.6944044836174365, + "learning_rate": 0.003, + "loss": 4.024, + "step": 21749 + }, + { + "epoch": 0.2175, + "grad_norm": 0.6619833367728168, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 21750 + }, + { + "epoch": 0.21751, + "grad_norm": 0.7103281809588515, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 21751 + }, + { + "epoch": 0.21752, + "grad_norm": 0.7354971169745597, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 21752 + }, + { + "epoch": 0.21753, + "grad_norm": 0.833585795845446, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21753 + }, + { + "epoch": 0.21754, + "grad_norm": 0.8894245683733248, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 21754 + }, + { + "epoch": 0.21755, + "grad_norm": 0.8612553490408327, + "learning_rate": 0.003, + "loss": 4.069, + "step": 21755 + }, + { + "epoch": 0.21756, + "grad_norm": 0.8247254338639682, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 21756 + }, + { + "epoch": 0.21757, + "grad_norm": 0.7995534097460469, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 21757 + }, + { + "epoch": 0.21758, + "grad_norm": 0.8442317205485379, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 21758 + }, + { + "epoch": 0.21759, + "grad_norm": 0.8615741170302611, + "learning_rate": 0.003, + "loss": 4.064, + "step": 21759 + }, + { + "epoch": 0.2176, + "grad_norm": 0.8879986320591802, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 21760 + }, + { + "epoch": 0.21761, + "grad_norm": 0.9912384350380697, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 21761 + }, + { + "epoch": 0.21762, + "grad_norm": 1.1614877312854015, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 21762 + }, + { + "epoch": 0.21763, + "grad_norm": 0.9920690760233691, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 21763 + }, + { + "epoch": 0.21764, + "grad_norm": 1.2023127995063825, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 21764 + }, + { + "epoch": 0.21765, + "grad_norm": 1.1743562689636649, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 21765 + }, + { + "epoch": 0.21766, + "grad_norm": 0.8184194833657082, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 21766 + }, + { + "epoch": 0.21767, + "grad_norm": 0.6096334570290339, + "learning_rate": 0.003, + "loss": 4.06, + "step": 21767 + }, + { + "epoch": 0.21768, + "grad_norm": 0.6160540298539695, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 21768 + }, + { + "epoch": 0.21769, + "grad_norm": 0.6500015352502315, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 21769 + }, + { + "epoch": 0.2177, + "grad_norm": 0.7285631809258732, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 21770 + }, + { + "epoch": 0.21771, + "grad_norm": 0.7609430266966889, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21771 + }, + { + "epoch": 0.21772, + "grad_norm": 0.8070217227685988, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 21772 + }, + { + "epoch": 0.21773, + "grad_norm": 1.0632512826420057, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 21773 + }, + { + "epoch": 0.21774, + "grad_norm": 1.2353220250868409, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 21774 + }, + { + "epoch": 0.21775, + "grad_norm": 0.9261179251115704, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 21775 + }, + { + "epoch": 0.21776, + "grad_norm": 0.8639712711510896, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 21776 + }, + { + "epoch": 0.21777, + "grad_norm": 0.8128299898805978, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 21777 + }, + { + "epoch": 0.21778, + "grad_norm": 0.855685022439737, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 21778 + }, + { + "epoch": 0.21779, + "grad_norm": 0.8063805750924459, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 21779 + }, + { + "epoch": 0.2178, + "grad_norm": 0.8480521484072908, + "learning_rate": 0.003, + "loss": 4.06, + "step": 21780 + }, + { + "epoch": 0.21781, + "grad_norm": 0.9660245900318699, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 21781 + }, + { + "epoch": 0.21782, + "grad_norm": 1.0618385269148125, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 21782 + }, + { + "epoch": 0.21783, + "grad_norm": 1.0031395180549605, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 21783 + }, + { + "epoch": 0.21784, + "grad_norm": 0.9996017023202192, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 21784 + }, + { + "epoch": 0.21785, + "grad_norm": 1.0830149435163006, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21785 + }, + { + "epoch": 0.21786, + "grad_norm": 0.8483612642835753, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 21786 + }, + { + "epoch": 0.21787, + "grad_norm": 0.7176720433588569, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 21787 + }, + { + "epoch": 0.21788, + "grad_norm": 0.76656946742843, + "learning_rate": 0.003, + "loss": 4.067, + "step": 21788 + }, + { + "epoch": 0.21789, + "grad_norm": 0.7348930326566943, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 21789 + }, + { + "epoch": 0.2179, + "grad_norm": 0.8178763061238735, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 21790 + }, + { + "epoch": 0.21791, + "grad_norm": 0.8432369152314106, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 21791 + }, + { + "epoch": 0.21792, + "grad_norm": 0.9160346440476382, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 21792 + }, + { + "epoch": 0.21793, + "grad_norm": 1.0035829820589213, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 21793 + }, + { + "epoch": 0.21794, + "grad_norm": 0.9941020274645616, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 21794 + }, + { + "epoch": 0.21795, + "grad_norm": 0.923841291635766, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 21795 + }, + { + "epoch": 0.21796, + "grad_norm": 1.0241090625911111, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 21796 + }, + { + "epoch": 0.21797, + "grad_norm": 1.1407388137491221, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 21797 + }, + { + "epoch": 0.21798, + "grad_norm": 0.8610989870612316, + "learning_rate": 0.003, + "loss": 4.058, + "step": 21798 + }, + { + "epoch": 0.21799, + "grad_norm": 0.8336347491408379, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 21799 + }, + { + "epoch": 0.218, + "grad_norm": 1.0339584357931486, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 21800 + }, + { + "epoch": 0.21801, + "grad_norm": 1.2544756120362408, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 21801 + }, + { + "epoch": 0.21802, + "grad_norm": 0.8172775587518837, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 21802 + }, + { + "epoch": 0.21803, + "grad_norm": 0.8331448364257094, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 21803 + }, + { + "epoch": 0.21804, + "grad_norm": 0.9073667972995151, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 21804 + }, + { + "epoch": 0.21805, + "grad_norm": 1.010912740947555, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 21805 + }, + { + "epoch": 0.21806, + "grad_norm": 1.0703870318792599, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 21806 + }, + { + "epoch": 0.21807, + "grad_norm": 0.9747586919888545, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 21807 + }, + { + "epoch": 0.21808, + "grad_norm": 0.8793745077518524, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 21808 + }, + { + "epoch": 0.21809, + "grad_norm": 0.8539302737976954, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 21809 + }, + { + "epoch": 0.2181, + "grad_norm": 0.8646277883648703, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 21810 + }, + { + "epoch": 0.21811, + "grad_norm": 0.8623922088132618, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 21811 + }, + { + "epoch": 0.21812, + "grad_norm": 0.9474532195929734, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 21812 + }, + { + "epoch": 0.21813, + "grad_norm": 0.9639535168663033, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21813 + }, + { + "epoch": 0.21814, + "grad_norm": 0.9138906912409014, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 21814 + }, + { + "epoch": 0.21815, + "grad_norm": 1.119352753631545, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 21815 + }, + { + "epoch": 0.21816, + "grad_norm": 1.0997231232692022, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 21816 + }, + { + "epoch": 0.21817, + "grad_norm": 1.0971161967582554, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 21817 + }, + { + "epoch": 0.21818, + "grad_norm": 1.11049528237168, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 21818 + }, + { + "epoch": 0.21819, + "grad_norm": 0.8038519007683086, + "learning_rate": 0.003, + "loss": 4.057, + "step": 21819 + }, + { + "epoch": 0.2182, + "grad_norm": 0.8106520088920963, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 21820 + }, + { + "epoch": 0.21821, + "grad_norm": 0.8506136642300416, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 21821 + }, + { + "epoch": 0.21822, + "grad_norm": 0.9479386857753651, + "learning_rate": 0.003, + "loss": 4.026, + "step": 21822 + }, + { + "epoch": 0.21823, + "grad_norm": 0.9578053700772413, + "learning_rate": 0.003, + "loss": 4.0859, + "step": 21823 + }, + { + "epoch": 0.21824, + "grad_norm": 0.868717582079508, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 21824 + }, + { + "epoch": 0.21825, + "grad_norm": 0.7912575729476037, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21825 + }, + { + "epoch": 0.21826, + "grad_norm": 0.7311329522364901, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 21826 + }, + { + "epoch": 0.21827, + "grad_norm": 0.7262920795960275, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 21827 + }, + { + "epoch": 0.21828, + "grad_norm": 0.7514414802070677, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 21828 + }, + { + "epoch": 0.21829, + "grad_norm": 0.8752341715451432, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 21829 + }, + { + "epoch": 0.2183, + "grad_norm": 1.0182278948983072, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 21830 + }, + { + "epoch": 0.21831, + "grad_norm": 1.047232178453799, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 21831 + }, + { + "epoch": 0.21832, + "grad_norm": 1.142938001776131, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 21832 + }, + { + "epoch": 0.21833, + "grad_norm": 0.8663250237993643, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 21833 + }, + { + "epoch": 0.21834, + "grad_norm": 0.7484786077650789, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 21834 + }, + { + "epoch": 0.21835, + "grad_norm": 0.7674261991974324, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 21835 + }, + { + "epoch": 0.21836, + "grad_norm": 0.8879037233906544, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21836 + }, + { + "epoch": 0.21837, + "grad_norm": 0.9446985187136123, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 21837 + }, + { + "epoch": 0.21838, + "grad_norm": 0.9384609787373933, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 21838 + }, + { + "epoch": 0.21839, + "grad_norm": 0.9770163781229995, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 21839 + }, + { + "epoch": 0.2184, + "grad_norm": 0.9148885808763234, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 21840 + }, + { + "epoch": 0.21841, + "grad_norm": 0.6592954238743538, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 21841 + }, + { + "epoch": 0.21842, + "grad_norm": 0.6663298622239738, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 21842 + }, + { + "epoch": 0.21843, + "grad_norm": 0.8771462388033658, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 21843 + }, + { + "epoch": 0.21844, + "grad_norm": 1.0706011980772319, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21844 + }, + { + "epoch": 0.21845, + "grad_norm": 0.9354602263488214, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 21845 + }, + { + "epoch": 0.21846, + "grad_norm": 0.9344656532962532, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 21846 + }, + { + "epoch": 0.21847, + "grad_norm": 1.0298846182157835, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 21847 + }, + { + "epoch": 0.21848, + "grad_norm": 0.9446983450757662, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 21848 + }, + { + "epoch": 0.21849, + "grad_norm": 1.0034713206357542, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 21849 + }, + { + "epoch": 0.2185, + "grad_norm": 0.8707010208825356, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 21850 + }, + { + "epoch": 0.21851, + "grad_norm": 0.8716262999612676, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 21851 + }, + { + "epoch": 0.21852, + "grad_norm": 0.8397497111919408, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 21852 + }, + { + "epoch": 0.21853, + "grad_norm": 0.9372735724190617, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 21853 + }, + { + "epoch": 0.21854, + "grad_norm": 0.9295463004752186, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 21854 + }, + { + "epoch": 0.21855, + "grad_norm": 0.961755397950133, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 21855 + }, + { + "epoch": 0.21856, + "grad_norm": 1.015196190236595, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 21856 + }, + { + "epoch": 0.21857, + "grad_norm": 0.9869083536078628, + "learning_rate": 0.003, + "loss": 4.045, + "step": 21857 + }, + { + "epoch": 0.21858, + "grad_norm": 0.8937443116883966, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 21858 + }, + { + "epoch": 0.21859, + "grad_norm": 0.7909144463504884, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 21859 + }, + { + "epoch": 0.2186, + "grad_norm": 0.856811950215714, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 21860 + }, + { + "epoch": 0.21861, + "grad_norm": 0.9530697922161849, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 21861 + }, + { + "epoch": 0.21862, + "grad_norm": 1.042586908502439, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21862 + }, + { + "epoch": 0.21863, + "grad_norm": 0.8531969681684066, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 21863 + }, + { + "epoch": 0.21864, + "grad_norm": 0.9731209413345449, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 21864 + }, + { + "epoch": 0.21865, + "grad_norm": 1.1988167292908487, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 21865 + }, + { + "epoch": 0.21866, + "grad_norm": 0.8389270945775307, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 21866 + }, + { + "epoch": 0.21867, + "grad_norm": 0.7455358541275876, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 21867 + }, + { + "epoch": 0.21868, + "grad_norm": 0.8170531807435585, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 21868 + }, + { + "epoch": 0.21869, + "grad_norm": 0.8094350063446712, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 21869 + }, + { + "epoch": 0.2187, + "grad_norm": 0.8664323643634094, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 21870 + }, + { + "epoch": 0.21871, + "grad_norm": 0.9823236756909407, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 21871 + }, + { + "epoch": 0.21872, + "grad_norm": 1.2424030025414625, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 21872 + }, + { + "epoch": 0.21873, + "grad_norm": 1.0055262272789798, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 21873 + }, + { + "epoch": 0.21874, + "grad_norm": 1.1366836626416323, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 21874 + }, + { + "epoch": 0.21875, + "grad_norm": 0.9011876189089401, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 21875 + }, + { + "epoch": 0.21876, + "grad_norm": 0.812194119712535, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 21876 + }, + { + "epoch": 0.21877, + "grad_norm": 0.8382600713152306, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 21877 + }, + { + "epoch": 0.21878, + "grad_norm": 0.7563550271046633, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 21878 + }, + { + "epoch": 0.21879, + "grad_norm": 0.6657831446827873, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 21879 + }, + { + "epoch": 0.2188, + "grad_norm": 0.7303940167997048, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 21880 + }, + { + "epoch": 0.21881, + "grad_norm": 0.7996671931555955, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21881 + }, + { + "epoch": 0.21882, + "grad_norm": 0.9485744360751281, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 21882 + }, + { + "epoch": 0.21883, + "grad_norm": 1.0743253420193146, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 21883 + }, + { + "epoch": 0.21884, + "grad_norm": 1.121471550017392, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 21884 + }, + { + "epoch": 0.21885, + "grad_norm": 0.7206118261295444, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 21885 + }, + { + "epoch": 0.21886, + "grad_norm": 0.6813490256850663, + "learning_rate": 0.003, + "loss": 4.1092, + "step": 21886 + }, + { + "epoch": 0.21887, + "grad_norm": 0.778450787755007, + "learning_rate": 0.003, + "loss": 4.03, + "step": 21887 + }, + { + "epoch": 0.21888, + "grad_norm": 0.8373205004683666, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 21888 + }, + { + "epoch": 0.21889, + "grad_norm": 0.8920749430057973, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 21889 + }, + { + "epoch": 0.2189, + "grad_norm": 0.8724784267186376, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 21890 + }, + { + "epoch": 0.21891, + "grad_norm": 0.7083332709884621, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 21891 + }, + { + "epoch": 0.21892, + "grad_norm": 0.647214062500226, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 21892 + }, + { + "epoch": 0.21893, + "grad_norm": 0.7844954984467796, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 21893 + }, + { + "epoch": 0.21894, + "grad_norm": 0.8793651469910239, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 21894 + }, + { + "epoch": 0.21895, + "grad_norm": 1.0256973846512143, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 21895 + }, + { + "epoch": 0.21896, + "grad_norm": 1.0649812626595685, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 21896 + }, + { + "epoch": 0.21897, + "grad_norm": 1.012867883109725, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21897 + }, + { + "epoch": 0.21898, + "grad_norm": 0.954053886164319, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 21898 + }, + { + "epoch": 0.21899, + "grad_norm": 0.9521962465100751, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 21899 + }, + { + "epoch": 0.219, + "grad_norm": 0.9700752620442195, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 21900 + }, + { + "epoch": 0.21901, + "grad_norm": 0.8369189325197408, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 21901 + }, + { + "epoch": 0.21902, + "grad_norm": 0.6738877534835154, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 21902 + }, + { + "epoch": 0.21903, + "grad_norm": 0.6705390718182693, + "learning_rate": 0.003, + "loss": 4.049, + "step": 21903 + }, + { + "epoch": 0.21904, + "grad_norm": 0.7907706610014479, + "learning_rate": 0.003, + "loss": 4.036, + "step": 21904 + }, + { + "epoch": 0.21905, + "grad_norm": 0.9056948905677695, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 21905 + }, + { + "epoch": 0.21906, + "grad_norm": 1.075751867066345, + "learning_rate": 0.003, + "loss": 4.05, + "step": 21906 + }, + { + "epoch": 0.21907, + "grad_norm": 1.1494999490389362, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 21907 + }, + { + "epoch": 0.21908, + "grad_norm": 0.777028480084331, + "learning_rate": 0.003, + "loss": 4.054, + "step": 21908 + }, + { + "epoch": 0.21909, + "grad_norm": 0.6772436702913768, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 21909 + }, + { + "epoch": 0.2191, + "grad_norm": 0.6159040864506407, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 21910 + }, + { + "epoch": 0.21911, + "grad_norm": 0.5689982736070225, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 21911 + }, + { + "epoch": 0.21912, + "grad_norm": 0.6051979480078344, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 21912 + }, + { + "epoch": 0.21913, + "grad_norm": 0.6291469820395099, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 21913 + }, + { + "epoch": 0.21914, + "grad_norm": 0.7460128617886833, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 21914 + }, + { + "epoch": 0.21915, + "grad_norm": 0.9132170400530574, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 21915 + }, + { + "epoch": 0.21916, + "grad_norm": 1.0683635178805837, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 21916 + }, + { + "epoch": 0.21917, + "grad_norm": 0.8503349283094361, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 21917 + }, + { + "epoch": 0.21918, + "grad_norm": 0.7741921512919092, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 21918 + }, + { + "epoch": 0.21919, + "grad_norm": 0.9007887805894577, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 21919 + }, + { + "epoch": 0.2192, + "grad_norm": 1.1315815647411827, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 21920 + }, + { + "epoch": 0.21921, + "grad_norm": 0.900731775488954, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 21921 + }, + { + "epoch": 0.21922, + "grad_norm": 0.8402525578283325, + "learning_rate": 0.003, + "loss": 4.034, + "step": 21922 + }, + { + "epoch": 0.21923, + "grad_norm": 0.8020071535486587, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 21923 + }, + { + "epoch": 0.21924, + "grad_norm": 0.7662828007501628, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 21924 + }, + { + "epoch": 0.21925, + "grad_norm": 0.7963601304778837, + "learning_rate": 0.003, + "loss": 4.068, + "step": 21925 + }, + { + "epoch": 0.21926, + "grad_norm": 0.8035038686789826, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 21926 + }, + { + "epoch": 0.21927, + "grad_norm": 0.8361149920712347, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 21927 + }, + { + "epoch": 0.21928, + "grad_norm": 0.9546495384962267, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 21928 + }, + { + "epoch": 0.21929, + "grad_norm": 1.042465837267675, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 21929 + }, + { + "epoch": 0.2193, + "grad_norm": 0.9732944679204636, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 21930 + }, + { + "epoch": 0.21931, + "grad_norm": 1.2138280279745988, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 21931 + }, + { + "epoch": 0.21932, + "grad_norm": 0.8908117721549045, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 21932 + }, + { + "epoch": 0.21933, + "grad_norm": 0.8535564550124914, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 21933 + }, + { + "epoch": 0.21934, + "grad_norm": 0.9114577820581846, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 21934 + }, + { + "epoch": 0.21935, + "grad_norm": 0.9080055072066445, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 21935 + }, + { + "epoch": 0.21936, + "grad_norm": 1.0330105377112426, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 21936 + }, + { + "epoch": 0.21937, + "grad_norm": 0.9198481958564433, + "learning_rate": 0.003, + "loss": 4.038, + "step": 21937 + }, + { + "epoch": 0.21938, + "grad_norm": 1.0156252206314884, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 21938 + }, + { + "epoch": 0.21939, + "grad_norm": 1.2279596447696417, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 21939 + }, + { + "epoch": 0.2194, + "grad_norm": 0.9494450338245576, + "learning_rate": 0.003, + "loss": 4.064, + "step": 21940 + }, + { + "epoch": 0.21941, + "grad_norm": 0.8397723643997518, + "learning_rate": 0.003, + "loss": 4.0922, + "step": 21941 + }, + { + "epoch": 0.21942, + "grad_norm": 0.8197689727260375, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 21942 + }, + { + "epoch": 0.21943, + "grad_norm": 0.8822972084098939, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 21943 + }, + { + "epoch": 0.21944, + "grad_norm": 0.913576643297579, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 21944 + }, + { + "epoch": 0.21945, + "grad_norm": 0.9681929262240375, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 21945 + }, + { + "epoch": 0.21946, + "grad_norm": 1.1276531714558127, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21946 + }, + { + "epoch": 0.21947, + "grad_norm": 0.9563917469483959, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 21947 + }, + { + "epoch": 0.21948, + "grad_norm": 0.9297387134990617, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 21948 + }, + { + "epoch": 0.21949, + "grad_norm": 1.0030089167553753, + "learning_rate": 0.003, + "loss": 4.067, + "step": 21949 + }, + { + "epoch": 0.2195, + "grad_norm": 1.0260459078655029, + "learning_rate": 0.003, + "loss": 4.035, + "step": 21950 + }, + { + "epoch": 0.21951, + "grad_norm": 1.0261079793891301, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 21951 + }, + { + "epoch": 0.21952, + "grad_norm": 0.8150090489494513, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 21952 + }, + { + "epoch": 0.21953, + "grad_norm": 0.8330262705530509, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 21953 + }, + { + "epoch": 0.21954, + "grad_norm": 0.8877797175894346, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 21954 + }, + { + "epoch": 0.21955, + "grad_norm": 0.9163156116981159, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 21955 + }, + { + "epoch": 0.21956, + "grad_norm": 0.9117591523794207, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 21956 + }, + { + "epoch": 0.21957, + "grad_norm": 1.0061720500626825, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 21957 + }, + { + "epoch": 0.21958, + "grad_norm": 0.9336551832191011, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 21958 + }, + { + "epoch": 0.21959, + "grad_norm": 0.8638015974782018, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 21959 + }, + { + "epoch": 0.2196, + "grad_norm": 0.9258178951315998, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 21960 + }, + { + "epoch": 0.21961, + "grad_norm": 1.0095917318366774, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 21961 + }, + { + "epoch": 0.21962, + "grad_norm": 1.130325315875703, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 21962 + }, + { + "epoch": 0.21963, + "grad_norm": 0.9432677619169314, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 21963 + }, + { + "epoch": 0.21964, + "grad_norm": 0.9876202238780849, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 21964 + }, + { + "epoch": 0.21965, + "grad_norm": 0.9092715519423531, + "learning_rate": 0.003, + "loss": 4.039, + "step": 21965 + }, + { + "epoch": 0.21966, + "grad_norm": 0.7823542691538469, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 21966 + }, + { + "epoch": 0.21967, + "grad_norm": 0.7548664243000878, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21967 + }, + { + "epoch": 0.21968, + "grad_norm": 0.8114571919490169, + "learning_rate": 0.003, + "loss": 4.04, + "step": 21968 + }, + { + "epoch": 0.21969, + "grad_norm": 0.8696570578924755, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 21969 + }, + { + "epoch": 0.2197, + "grad_norm": 1.0713546304019836, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 21970 + }, + { + "epoch": 0.21971, + "grad_norm": 1.0442277070403008, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 21971 + }, + { + "epoch": 0.21972, + "grad_norm": 1.0606164479372318, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 21972 + }, + { + "epoch": 0.21973, + "grad_norm": 0.9968035429856121, + "learning_rate": 0.003, + "loss": 4.07, + "step": 21973 + }, + { + "epoch": 0.21974, + "grad_norm": 0.9395825453993736, + "learning_rate": 0.003, + "loss": 4.093, + "step": 21974 + }, + { + "epoch": 0.21975, + "grad_norm": 0.8339717029443773, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 21975 + }, + { + "epoch": 0.21976, + "grad_norm": 0.7656181648361848, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 21976 + }, + { + "epoch": 0.21977, + "grad_norm": 0.7258709980315018, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 21977 + }, + { + "epoch": 0.21978, + "grad_norm": 0.682290123584367, + "learning_rate": 0.003, + "loss": 4.083, + "step": 21978 + }, + { + "epoch": 0.21979, + "grad_norm": 0.8346360893070389, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 21979 + }, + { + "epoch": 0.2198, + "grad_norm": 0.9460559704034633, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 21980 + }, + { + "epoch": 0.21981, + "grad_norm": 0.8432767121223144, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 21981 + }, + { + "epoch": 0.21982, + "grad_norm": 0.8901654019857779, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 21982 + }, + { + "epoch": 0.21983, + "grad_norm": 1.0870122988568107, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 21983 + }, + { + "epoch": 0.21984, + "grad_norm": 0.9472534506491445, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 21984 + }, + { + "epoch": 0.21985, + "grad_norm": 0.8072303785690024, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 21985 + }, + { + "epoch": 0.21986, + "grad_norm": 0.7499471652723284, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 21986 + }, + { + "epoch": 0.21987, + "grad_norm": 0.6842165837969477, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 21987 + }, + { + "epoch": 0.21988, + "grad_norm": 0.6925023766928035, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 21988 + }, + { + "epoch": 0.21989, + "grad_norm": 0.8514035080316976, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 21989 + }, + { + "epoch": 0.2199, + "grad_norm": 0.9955773677075328, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 21990 + }, + { + "epoch": 0.21991, + "grad_norm": 1.0341450259368192, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 21991 + }, + { + "epoch": 0.21992, + "grad_norm": 0.9267237665318736, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 21992 + }, + { + "epoch": 0.21993, + "grad_norm": 0.9514563001539492, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 21993 + }, + { + "epoch": 0.21994, + "grad_norm": 1.0475667411585288, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 21994 + }, + { + "epoch": 0.21995, + "grad_norm": 0.9340612817076597, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 21995 + }, + { + "epoch": 0.21996, + "grad_norm": 0.9053888369308316, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 21996 + }, + { + "epoch": 0.21997, + "grad_norm": 0.9854427296908433, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 21997 + }, + { + "epoch": 0.21998, + "grad_norm": 1.1047818368556792, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 21998 + }, + { + "epoch": 0.21999, + "grad_norm": 0.8928894110392157, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 21999 + }, + { + "epoch": 0.22, + "grad_norm": 0.7950769883951311, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 22000 + }, + { + "epoch": 0.22001, + "grad_norm": 0.734494180822481, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 22001 + }, + { + "epoch": 0.22002, + "grad_norm": 0.6876677544192328, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 22002 + }, + { + "epoch": 0.22003, + "grad_norm": 0.6834251727318637, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 22003 + }, + { + "epoch": 0.22004, + "grad_norm": 0.621448935106141, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 22004 + }, + { + "epoch": 0.22005, + "grad_norm": 0.620067830057128, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 22005 + }, + { + "epoch": 0.22006, + "grad_norm": 0.6880286765366899, + "learning_rate": 0.003, + "loss": 4.047, + "step": 22006 + }, + { + "epoch": 0.22007, + "grad_norm": 0.6762293899215118, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 22007 + }, + { + "epoch": 0.22008, + "grad_norm": 0.6906504675381372, + "learning_rate": 0.003, + "loss": 4.027, + "step": 22008 + }, + { + "epoch": 0.22009, + "grad_norm": 0.8750565593766731, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 22009 + }, + { + "epoch": 0.2201, + "grad_norm": 1.1111229562303055, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 22010 + }, + { + "epoch": 0.22011, + "grad_norm": 0.8122356668206744, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 22011 + }, + { + "epoch": 0.22012, + "grad_norm": 0.6421844897216124, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 22012 + }, + { + "epoch": 0.22013, + "grad_norm": 0.6393545101726368, + "learning_rate": 0.003, + "loss": 4.021, + "step": 22013 + }, + { + "epoch": 0.22014, + "grad_norm": 0.698672663466228, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 22014 + }, + { + "epoch": 0.22015, + "grad_norm": 0.8117114166218956, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 22015 + }, + { + "epoch": 0.22016, + "grad_norm": 1.0789685871338646, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 22016 + }, + { + "epoch": 0.22017, + "grad_norm": 1.1635471219847546, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22017 + }, + { + "epoch": 0.22018, + "grad_norm": 0.8894181980682901, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 22018 + }, + { + "epoch": 0.22019, + "grad_norm": 0.7599329229872507, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 22019 + }, + { + "epoch": 0.2202, + "grad_norm": 0.8138520555366555, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 22020 + }, + { + "epoch": 0.22021, + "grad_norm": 0.9413706647623215, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 22021 + }, + { + "epoch": 0.22022, + "grad_norm": 0.9956293965437711, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 22022 + }, + { + "epoch": 0.22023, + "grad_norm": 0.872226871472913, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 22023 + }, + { + "epoch": 0.22024, + "grad_norm": 0.8392197383485834, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 22024 + }, + { + "epoch": 0.22025, + "grad_norm": 0.821763293132257, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 22025 + }, + { + "epoch": 0.22026, + "grad_norm": 0.9739707963064859, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 22026 + }, + { + "epoch": 0.22027, + "grad_norm": 1.3005476925099593, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 22027 + }, + { + "epoch": 0.22028, + "grad_norm": 0.8544054111562639, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 22028 + }, + { + "epoch": 0.22029, + "grad_norm": 0.8524055747795404, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22029 + }, + { + "epoch": 0.2203, + "grad_norm": 0.8159263780645369, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 22030 + }, + { + "epoch": 0.22031, + "grad_norm": 0.8885496352845809, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 22031 + }, + { + "epoch": 0.22032, + "grad_norm": 1.019125991831821, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 22032 + }, + { + "epoch": 0.22033, + "grad_norm": 1.0433179950163909, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 22033 + }, + { + "epoch": 0.22034, + "grad_norm": 1.1013315508093695, + "learning_rate": 0.003, + "loss": 4.066, + "step": 22034 + }, + { + "epoch": 0.22035, + "grad_norm": 1.217801173080061, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 22035 + }, + { + "epoch": 0.22036, + "grad_norm": 0.9719025062139389, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 22036 + }, + { + "epoch": 0.22037, + "grad_norm": 0.9460145715715301, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 22037 + }, + { + "epoch": 0.22038, + "grad_norm": 0.9553322724925778, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 22038 + }, + { + "epoch": 0.22039, + "grad_norm": 0.8793063614064192, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 22039 + }, + { + "epoch": 0.2204, + "grad_norm": 0.7581261730545026, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22040 + }, + { + "epoch": 0.22041, + "grad_norm": 0.7483301073810726, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 22041 + }, + { + "epoch": 0.22042, + "grad_norm": 0.8856112184457362, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 22042 + }, + { + "epoch": 0.22043, + "grad_norm": 1.2593560711629388, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 22043 + }, + { + "epoch": 0.22044, + "grad_norm": 0.8008771904157289, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 22044 + }, + { + "epoch": 0.22045, + "grad_norm": 0.7384000284456744, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 22045 + }, + { + "epoch": 0.22046, + "grad_norm": 0.7890036037153679, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 22046 + }, + { + "epoch": 0.22047, + "grad_norm": 0.9421431918982789, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 22047 + }, + { + "epoch": 0.22048, + "grad_norm": 1.025099617930658, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 22048 + }, + { + "epoch": 0.22049, + "grad_norm": 1.1616626707805855, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 22049 + }, + { + "epoch": 0.2205, + "grad_norm": 0.7228352275682283, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 22050 + }, + { + "epoch": 0.22051, + "grad_norm": 0.711001669386256, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 22051 + }, + { + "epoch": 0.22052, + "grad_norm": 0.7176328414632288, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 22052 + }, + { + "epoch": 0.22053, + "grad_norm": 0.59507105959698, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 22053 + }, + { + "epoch": 0.22054, + "grad_norm": 0.6946647283928828, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 22054 + }, + { + "epoch": 0.22055, + "grad_norm": 0.7301014932964457, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 22055 + }, + { + "epoch": 0.22056, + "grad_norm": 0.8459781177901133, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 22056 + }, + { + "epoch": 0.22057, + "grad_norm": 1.0307913515441087, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 22057 + }, + { + "epoch": 0.22058, + "grad_norm": 1.1661211277669492, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 22058 + }, + { + "epoch": 0.22059, + "grad_norm": 0.8209930959787649, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 22059 + }, + { + "epoch": 0.2206, + "grad_norm": 1.0499763903556976, + "learning_rate": 0.003, + "loss": 4.073, + "step": 22060 + }, + { + "epoch": 0.22061, + "grad_norm": 1.323809242615842, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 22061 + }, + { + "epoch": 0.22062, + "grad_norm": 0.7677855366310005, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 22062 + }, + { + "epoch": 0.22063, + "grad_norm": 0.6958455279405271, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 22063 + }, + { + "epoch": 0.22064, + "grad_norm": 0.7783873526395044, + "learning_rate": 0.003, + "loss": 4.067, + "step": 22064 + }, + { + "epoch": 0.22065, + "grad_norm": 0.7922877051007191, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 22065 + }, + { + "epoch": 0.22066, + "grad_norm": 0.8265533332452641, + "learning_rate": 0.003, + "loss": 4.024, + "step": 22066 + }, + { + "epoch": 0.22067, + "grad_norm": 0.8544521811434357, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 22067 + }, + { + "epoch": 0.22068, + "grad_norm": 0.7763784129242721, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 22068 + }, + { + "epoch": 0.22069, + "grad_norm": 0.7567372744531591, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22069 + }, + { + "epoch": 0.2207, + "grad_norm": 0.941578178646652, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 22070 + }, + { + "epoch": 0.22071, + "grad_norm": 1.3653916734084433, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 22071 + }, + { + "epoch": 0.22072, + "grad_norm": 0.6764121067076573, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 22072 + }, + { + "epoch": 0.22073, + "grad_norm": 0.6822005871998648, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 22073 + }, + { + "epoch": 0.22074, + "grad_norm": 0.7734727632237398, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 22074 + }, + { + "epoch": 0.22075, + "grad_norm": 0.8263022034619842, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 22075 + }, + { + "epoch": 0.22076, + "grad_norm": 0.8555431548382049, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 22076 + }, + { + "epoch": 0.22077, + "grad_norm": 0.9943960835641048, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 22077 + }, + { + "epoch": 0.22078, + "grad_norm": 1.3209523536699566, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 22078 + }, + { + "epoch": 0.22079, + "grad_norm": 0.8628817290893656, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 22079 + }, + { + "epoch": 0.2208, + "grad_norm": 0.8026017998921515, + "learning_rate": 0.003, + "loss": 4.058, + "step": 22080 + }, + { + "epoch": 0.22081, + "grad_norm": 0.8013839472692773, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 22081 + }, + { + "epoch": 0.22082, + "grad_norm": 0.8016469163570976, + "learning_rate": 0.003, + "loss": 4.062, + "step": 22082 + }, + { + "epoch": 0.22083, + "grad_norm": 0.7982184060808496, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22083 + }, + { + "epoch": 0.22084, + "grad_norm": 0.9097833295102461, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 22084 + }, + { + "epoch": 0.22085, + "grad_norm": 0.9517237446547862, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 22085 + }, + { + "epoch": 0.22086, + "grad_norm": 1.0815212225852908, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 22086 + }, + { + "epoch": 0.22087, + "grad_norm": 1.0950994509268546, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 22087 + }, + { + "epoch": 0.22088, + "grad_norm": 1.152165618701013, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 22088 + }, + { + "epoch": 0.22089, + "grad_norm": 0.9979512055191654, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 22089 + }, + { + "epoch": 0.2209, + "grad_norm": 1.0725087493276717, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 22090 + }, + { + "epoch": 0.22091, + "grad_norm": 0.9584336254451938, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 22091 + }, + { + "epoch": 0.22092, + "grad_norm": 0.8307520421776294, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 22092 + }, + { + "epoch": 0.22093, + "grad_norm": 0.7027488547501028, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 22093 + }, + { + "epoch": 0.22094, + "grad_norm": 0.6915087178386995, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 22094 + }, + { + "epoch": 0.22095, + "grad_norm": 0.6957966279279953, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 22095 + }, + { + "epoch": 0.22096, + "grad_norm": 0.6528400653651122, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 22096 + }, + { + "epoch": 0.22097, + "grad_norm": 0.6201580723620347, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 22097 + }, + { + "epoch": 0.22098, + "grad_norm": 0.648348209377725, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 22098 + }, + { + "epoch": 0.22099, + "grad_norm": 0.6883313940083414, + "learning_rate": 0.003, + "loss": 4.023, + "step": 22099 + }, + { + "epoch": 0.221, + "grad_norm": 0.7528538106866004, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 22100 + }, + { + "epoch": 0.22101, + "grad_norm": 1.0352249303340857, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 22101 + }, + { + "epoch": 0.22102, + "grad_norm": 1.2794084290109426, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 22102 + }, + { + "epoch": 0.22103, + "grad_norm": 0.8448328340651118, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 22103 + }, + { + "epoch": 0.22104, + "grad_norm": 0.8392210845201308, + "learning_rate": 0.003, + "loss": 4.074, + "step": 22104 + }, + { + "epoch": 0.22105, + "grad_norm": 0.9451307645110609, + "learning_rate": 0.003, + "loss": 4.048, + "step": 22105 + }, + { + "epoch": 0.22106, + "grad_norm": 1.2340804605629108, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 22106 + }, + { + "epoch": 0.22107, + "grad_norm": 0.9491584976803158, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 22107 + }, + { + "epoch": 0.22108, + "grad_norm": 0.8646855903049168, + "learning_rate": 0.003, + "loss": 4.032, + "step": 22108 + }, + { + "epoch": 0.22109, + "grad_norm": 0.875144157854956, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 22109 + }, + { + "epoch": 0.2211, + "grad_norm": 1.0337424841785552, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 22110 + }, + { + "epoch": 0.22111, + "grad_norm": 1.1641434240189423, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 22111 + }, + { + "epoch": 0.22112, + "grad_norm": 0.7755006738748029, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 22112 + }, + { + "epoch": 0.22113, + "grad_norm": 0.8580274284258331, + "learning_rate": 0.003, + "loss": 4.067, + "step": 22113 + }, + { + "epoch": 0.22114, + "grad_norm": 0.815040159252193, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 22114 + }, + { + "epoch": 0.22115, + "grad_norm": 0.8608390652691527, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 22115 + }, + { + "epoch": 0.22116, + "grad_norm": 0.8741338655947701, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 22116 + }, + { + "epoch": 0.22117, + "grad_norm": 1.0332256061894953, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 22117 + }, + { + "epoch": 0.22118, + "grad_norm": 0.9913553858784968, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 22118 + }, + { + "epoch": 0.22119, + "grad_norm": 0.8952093848739033, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 22119 + }, + { + "epoch": 0.2212, + "grad_norm": 0.8323849163935308, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 22120 + }, + { + "epoch": 0.22121, + "grad_norm": 0.8896918263188238, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 22121 + }, + { + "epoch": 0.22122, + "grad_norm": 0.8380957319175341, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 22122 + }, + { + "epoch": 0.22123, + "grad_norm": 0.8977118166031717, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 22123 + }, + { + "epoch": 0.22124, + "grad_norm": 0.929525155158042, + "learning_rate": 0.003, + "loss": 4.049, + "step": 22124 + }, + { + "epoch": 0.22125, + "grad_norm": 0.858834806480359, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 22125 + }, + { + "epoch": 0.22126, + "grad_norm": 1.126327112589891, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 22126 + }, + { + "epoch": 0.22127, + "grad_norm": 1.053762375541193, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 22127 + }, + { + "epoch": 0.22128, + "grad_norm": 0.9320491937708714, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 22128 + }, + { + "epoch": 0.22129, + "grad_norm": 0.9231865776373712, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 22129 + }, + { + "epoch": 0.2213, + "grad_norm": 0.9667146918097497, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 22130 + }, + { + "epoch": 0.22131, + "grad_norm": 0.9729518931456379, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 22131 + }, + { + "epoch": 0.22132, + "grad_norm": 0.9121518921525817, + "learning_rate": 0.003, + "loss": 4.057, + "step": 22132 + }, + { + "epoch": 0.22133, + "grad_norm": 0.8370225880789848, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 22133 + }, + { + "epoch": 0.22134, + "grad_norm": 0.8354183493663939, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 22134 + }, + { + "epoch": 0.22135, + "grad_norm": 1.0130541711460181, + "learning_rate": 0.003, + "loss": 4.037, + "step": 22135 + }, + { + "epoch": 0.22136, + "grad_norm": 1.040596110292542, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 22136 + }, + { + "epoch": 0.22137, + "grad_norm": 1.0605796720384841, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 22137 + }, + { + "epoch": 0.22138, + "grad_norm": 0.9628032304889746, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 22138 + }, + { + "epoch": 0.22139, + "grad_norm": 0.9665042275463452, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 22139 + }, + { + "epoch": 0.2214, + "grad_norm": 1.1181353071397961, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 22140 + }, + { + "epoch": 0.22141, + "grad_norm": 1.0201701284349813, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 22141 + }, + { + "epoch": 0.22142, + "grad_norm": 1.042802377375285, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 22142 + }, + { + "epoch": 0.22143, + "grad_norm": 0.9934459441553016, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 22143 + }, + { + "epoch": 0.22144, + "grad_norm": 0.9959806335311557, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 22144 + }, + { + "epoch": 0.22145, + "grad_norm": 1.031315476389047, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 22145 + }, + { + "epoch": 0.22146, + "grad_norm": 0.8705023893606555, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 22146 + }, + { + "epoch": 0.22147, + "grad_norm": 0.9043040416893877, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 22147 + }, + { + "epoch": 0.22148, + "grad_norm": 1.0455612389309925, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22148 + }, + { + "epoch": 0.22149, + "grad_norm": 0.9991875187307905, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 22149 + }, + { + "epoch": 0.2215, + "grad_norm": 1.1248142813176287, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 22150 + }, + { + "epoch": 0.22151, + "grad_norm": 0.997354313488082, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 22151 + }, + { + "epoch": 0.22152, + "grad_norm": 0.8840634081359512, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 22152 + }, + { + "epoch": 0.22153, + "grad_norm": 0.8126178788523708, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 22153 + }, + { + "epoch": 0.22154, + "grad_norm": 0.9061281939783663, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 22154 + }, + { + "epoch": 0.22155, + "grad_norm": 0.8304275294748221, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22155 + }, + { + "epoch": 0.22156, + "grad_norm": 0.7216573910767039, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 22156 + }, + { + "epoch": 0.22157, + "grad_norm": 0.7816462253050105, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 22157 + }, + { + "epoch": 0.22158, + "grad_norm": 0.7675083120895178, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 22158 + }, + { + "epoch": 0.22159, + "grad_norm": 0.7596831584191603, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 22159 + }, + { + "epoch": 0.2216, + "grad_norm": 0.7975989080395691, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 22160 + }, + { + "epoch": 0.22161, + "grad_norm": 0.8317596997184951, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 22161 + }, + { + "epoch": 0.22162, + "grad_norm": 0.7711297528376383, + "learning_rate": 0.003, + "loss": 4.061, + "step": 22162 + }, + { + "epoch": 0.22163, + "grad_norm": 0.740565479950469, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 22163 + }, + { + "epoch": 0.22164, + "grad_norm": 0.7113503872288879, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 22164 + }, + { + "epoch": 0.22165, + "grad_norm": 0.7925254195358435, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 22165 + }, + { + "epoch": 0.22166, + "grad_norm": 1.0010967797337047, + "learning_rate": 0.003, + "loss": 4.039, + "step": 22166 + }, + { + "epoch": 0.22167, + "grad_norm": 1.3451264683620023, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 22167 + }, + { + "epoch": 0.22168, + "grad_norm": 0.6507696929249808, + "learning_rate": 0.003, + "loss": 4.032, + "step": 22168 + }, + { + "epoch": 0.22169, + "grad_norm": 0.5968651095924886, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 22169 + }, + { + "epoch": 0.2217, + "grad_norm": 0.6557308511079163, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 22170 + }, + { + "epoch": 0.22171, + "grad_norm": 0.7078889207658147, + "learning_rate": 0.003, + "loss": 4.068, + "step": 22171 + }, + { + "epoch": 0.22172, + "grad_norm": 0.7554749082054675, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 22172 + }, + { + "epoch": 0.22173, + "grad_norm": 0.9010140617183018, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 22173 + }, + { + "epoch": 0.22174, + "grad_norm": 1.0533805732761516, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 22174 + }, + { + "epoch": 0.22175, + "grad_norm": 1.0787792443321154, + "learning_rate": 0.003, + "loss": 4.042, + "step": 22175 + }, + { + "epoch": 0.22176, + "grad_norm": 0.8603706689604045, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 22176 + }, + { + "epoch": 0.22177, + "grad_norm": 0.8365128173918001, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 22177 + }, + { + "epoch": 0.22178, + "grad_norm": 0.9061165104715343, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 22178 + }, + { + "epoch": 0.22179, + "grad_norm": 1.0330629903860584, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22179 + }, + { + "epoch": 0.2218, + "grad_norm": 1.0623081407534747, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 22180 + }, + { + "epoch": 0.22181, + "grad_norm": 0.9444235902192694, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22181 + }, + { + "epoch": 0.22182, + "grad_norm": 0.9949618419516716, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 22182 + }, + { + "epoch": 0.22183, + "grad_norm": 0.9022154973254922, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 22183 + }, + { + "epoch": 0.22184, + "grad_norm": 0.8552683888003327, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 22184 + }, + { + "epoch": 0.22185, + "grad_norm": 0.7865121440982391, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 22185 + }, + { + "epoch": 0.22186, + "grad_norm": 0.6725445992678838, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 22186 + }, + { + "epoch": 0.22187, + "grad_norm": 0.6246129312732067, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 22187 + }, + { + "epoch": 0.22188, + "grad_norm": 0.5910546718489529, + "learning_rate": 0.003, + "loss": 4.079, + "step": 22188 + }, + { + "epoch": 0.22189, + "grad_norm": 0.7346404514035609, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 22189 + }, + { + "epoch": 0.2219, + "grad_norm": 0.8032220144636948, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 22190 + }, + { + "epoch": 0.22191, + "grad_norm": 0.9614117698813585, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 22191 + }, + { + "epoch": 0.22192, + "grad_norm": 1.225521655975889, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 22192 + }, + { + "epoch": 0.22193, + "grad_norm": 0.9097348342293494, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 22193 + }, + { + "epoch": 0.22194, + "grad_norm": 0.8712610714269252, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 22194 + }, + { + "epoch": 0.22195, + "grad_norm": 0.8691822225852284, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 22195 + }, + { + "epoch": 0.22196, + "grad_norm": 0.8836588709202576, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 22196 + }, + { + "epoch": 0.22197, + "grad_norm": 0.8829506610471086, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 22197 + }, + { + "epoch": 0.22198, + "grad_norm": 0.7561080502813564, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 22198 + }, + { + "epoch": 0.22199, + "grad_norm": 0.7797588713555024, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 22199 + }, + { + "epoch": 0.222, + "grad_norm": 0.8651115826878089, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 22200 + }, + { + "epoch": 0.22201, + "grad_norm": 1.126046976718936, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 22201 + }, + { + "epoch": 0.22202, + "grad_norm": 0.8855702016736842, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22202 + }, + { + "epoch": 0.22203, + "grad_norm": 0.7567553761426342, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 22203 + }, + { + "epoch": 0.22204, + "grad_norm": 0.8060115105780602, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 22204 + }, + { + "epoch": 0.22205, + "grad_norm": 0.8958884789482368, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 22205 + }, + { + "epoch": 0.22206, + "grad_norm": 1.2045798161386938, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 22206 + }, + { + "epoch": 0.22207, + "grad_norm": 1.007792019711113, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 22207 + }, + { + "epoch": 0.22208, + "grad_norm": 1.040523550968073, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 22208 + }, + { + "epoch": 0.22209, + "grad_norm": 1.2164659225319387, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 22209 + }, + { + "epoch": 0.2221, + "grad_norm": 0.8150735851848918, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 22210 + }, + { + "epoch": 0.22211, + "grad_norm": 0.8732099062632754, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 22211 + }, + { + "epoch": 0.22212, + "grad_norm": 1.0572751149780963, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 22212 + }, + { + "epoch": 0.22213, + "grad_norm": 0.9445644027522035, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 22213 + }, + { + "epoch": 0.22214, + "grad_norm": 0.9461109439482839, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 22214 + }, + { + "epoch": 0.22215, + "grad_norm": 0.9757432408925696, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 22215 + }, + { + "epoch": 0.22216, + "grad_norm": 1.0913221516178517, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 22216 + }, + { + "epoch": 0.22217, + "grad_norm": 0.9322208231367511, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 22217 + }, + { + "epoch": 0.22218, + "grad_norm": 0.960267014589131, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 22218 + }, + { + "epoch": 0.22219, + "grad_norm": 0.9851270684249974, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 22219 + }, + { + "epoch": 0.2222, + "grad_norm": 0.8968376529284131, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22220 + }, + { + "epoch": 0.22221, + "grad_norm": 0.8541058489244915, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 22221 + }, + { + "epoch": 0.22222, + "grad_norm": 0.7404623465811235, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 22222 + }, + { + "epoch": 0.22223, + "grad_norm": 0.6966837123317066, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 22223 + }, + { + "epoch": 0.22224, + "grad_norm": 0.774488663185445, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 22224 + }, + { + "epoch": 0.22225, + "grad_norm": 0.811676375788709, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 22225 + }, + { + "epoch": 0.22226, + "grad_norm": 0.8266184812123376, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 22226 + }, + { + "epoch": 0.22227, + "grad_norm": 0.7848063018162497, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 22227 + }, + { + "epoch": 0.22228, + "grad_norm": 0.7417912641251252, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22228 + }, + { + "epoch": 0.22229, + "grad_norm": 0.7621330356709436, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22229 + }, + { + "epoch": 0.2223, + "grad_norm": 0.7374006294003671, + "learning_rate": 0.003, + "loss": 4.031, + "step": 22230 + }, + { + "epoch": 0.22231, + "grad_norm": 0.7016098707997227, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 22231 + }, + { + "epoch": 0.22232, + "grad_norm": 0.7360385235382604, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 22232 + }, + { + "epoch": 0.22233, + "grad_norm": 0.8548974955039184, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 22233 + }, + { + "epoch": 0.22234, + "grad_norm": 1.0789988701948223, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 22234 + }, + { + "epoch": 0.22235, + "grad_norm": 1.1183440907226048, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 22235 + }, + { + "epoch": 0.22236, + "grad_norm": 0.9207834979644175, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 22236 + }, + { + "epoch": 0.22237, + "grad_norm": 0.970248701968993, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 22237 + }, + { + "epoch": 0.22238, + "grad_norm": 0.9743513122486656, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 22238 + }, + { + "epoch": 0.22239, + "grad_norm": 0.9505675812880745, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 22239 + }, + { + "epoch": 0.2224, + "grad_norm": 0.9109406895094778, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 22240 + }, + { + "epoch": 0.22241, + "grad_norm": 0.7591292533552716, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 22241 + }, + { + "epoch": 0.22242, + "grad_norm": 0.7535812438133108, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 22242 + }, + { + "epoch": 0.22243, + "grad_norm": 0.7913443728375273, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 22243 + }, + { + "epoch": 0.22244, + "grad_norm": 0.816978680641882, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 22244 + }, + { + "epoch": 0.22245, + "grad_norm": 0.8707553278244405, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 22245 + }, + { + "epoch": 0.22246, + "grad_norm": 0.9363695816754737, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 22246 + }, + { + "epoch": 0.22247, + "grad_norm": 1.106319931353203, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 22247 + }, + { + "epoch": 0.22248, + "grad_norm": 0.979389331760488, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 22248 + }, + { + "epoch": 0.22249, + "grad_norm": 1.0567714569546038, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 22249 + }, + { + "epoch": 0.2225, + "grad_norm": 1.025156301507456, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 22250 + }, + { + "epoch": 0.22251, + "grad_norm": 0.7958547830790944, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 22251 + }, + { + "epoch": 0.22252, + "grad_norm": 0.7616686415494771, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 22252 + }, + { + "epoch": 0.22253, + "grad_norm": 0.7836887868195285, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 22253 + }, + { + "epoch": 0.22254, + "grad_norm": 0.9174334860499777, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 22254 + }, + { + "epoch": 0.22255, + "grad_norm": 1.080705316356119, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 22255 + }, + { + "epoch": 0.22256, + "grad_norm": 1.02214815117429, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22256 + }, + { + "epoch": 0.22257, + "grad_norm": 1.075300529619242, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 22257 + }, + { + "epoch": 0.22258, + "grad_norm": 0.9924569549639446, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 22258 + }, + { + "epoch": 0.22259, + "grad_norm": 0.9005378170256678, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 22259 + }, + { + "epoch": 0.2226, + "grad_norm": 0.7691204569824857, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22260 + }, + { + "epoch": 0.22261, + "grad_norm": 0.7933742659065883, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 22261 + }, + { + "epoch": 0.22262, + "grad_norm": 1.0122217057773604, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 22262 + }, + { + "epoch": 0.22263, + "grad_norm": 0.980441500461147, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 22263 + }, + { + "epoch": 0.22264, + "grad_norm": 1.166041235904597, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 22264 + }, + { + "epoch": 0.22265, + "grad_norm": 0.9965704825137346, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 22265 + }, + { + "epoch": 0.22266, + "grad_norm": 1.092098565087222, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22266 + }, + { + "epoch": 0.22267, + "grad_norm": 0.9158891393607803, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 22267 + }, + { + "epoch": 0.22268, + "grad_norm": 0.9339800562943872, + "learning_rate": 0.003, + "loss": 4.043, + "step": 22268 + }, + { + "epoch": 0.22269, + "grad_norm": 0.9070806760180086, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22269 + }, + { + "epoch": 0.2227, + "grad_norm": 0.8135202578134365, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 22270 + }, + { + "epoch": 0.22271, + "grad_norm": 0.9029984110274643, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 22271 + }, + { + "epoch": 0.22272, + "grad_norm": 0.9702777636583725, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 22272 + }, + { + "epoch": 0.22273, + "grad_norm": 1.0855614221788181, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 22273 + }, + { + "epoch": 0.22274, + "grad_norm": 1.0976454744023618, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 22274 + }, + { + "epoch": 0.22275, + "grad_norm": 0.9330981263535711, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 22275 + }, + { + "epoch": 0.22276, + "grad_norm": 0.9970874853493966, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 22276 + }, + { + "epoch": 0.22277, + "grad_norm": 1.0498423082472537, + "learning_rate": 0.003, + "loss": 4.07, + "step": 22277 + }, + { + "epoch": 0.22278, + "grad_norm": 0.8679956379317704, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 22278 + }, + { + "epoch": 0.22279, + "grad_norm": 0.7125520883229424, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 22279 + }, + { + "epoch": 0.2228, + "grad_norm": 0.5870127924238849, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 22280 + }, + { + "epoch": 0.22281, + "grad_norm": 0.6582477572645474, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22281 + }, + { + "epoch": 0.22282, + "grad_norm": 0.7250964131010654, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 22282 + }, + { + "epoch": 0.22283, + "grad_norm": 0.7702424253612006, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 22283 + }, + { + "epoch": 0.22284, + "grad_norm": 0.7128621458385906, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 22284 + }, + { + "epoch": 0.22285, + "grad_norm": 0.6641360527932991, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 22285 + }, + { + "epoch": 0.22286, + "grad_norm": 0.707612114099158, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 22286 + }, + { + "epoch": 0.22287, + "grad_norm": 0.7913411948093053, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 22287 + }, + { + "epoch": 0.22288, + "grad_norm": 0.8407537381088891, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 22288 + }, + { + "epoch": 0.22289, + "grad_norm": 0.800169299113007, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 22289 + }, + { + "epoch": 0.2229, + "grad_norm": 0.7528566573198635, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 22290 + }, + { + "epoch": 0.22291, + "grad_norm": 0.7436393659146529, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 22291 + }, + { + "epoch": 0.22292, + "grad_norm": 0.7014270350756187, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 22292 + }, + { + "epoch": 0.22293, + "grad_norm": 0.926045097420717, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 22293 + }, + { + "epoch": 0.22294, + "grad_norm": 1.252627835692378, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 22294 + }, + { + "epoch": 0.22295, + "grad_norm": 0.8996018601282918, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 22295 + }, + { + "epoch": 0.22296, + "grad_norm": 0.8787044778209848, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 22296 + }, + { + "epoch": 0.22297, + "grad_norm": 0.9833706736631922, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 22297 + }, + { + "epoch": 0.22298, + "grad_norm": 1.165494918084471, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 22298 + }, + { + "epoch": 0.22299, + "grad_norm": 0.9329827524970048, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 22299 + }, + { + "epoch": 0.223, + "grad_norm": 0.8801949822446797, + "learning_rate": 0.003, + "loss": 4.067, + "step": 22300 + }, + { + "epoch": 0.22301, + "grad_norm": 0.9553959724873491, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 22301 + }, + { + "epoch": 0.22302, + "grad_norm": 0.9064785776113735, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 22302 + }, + { + "epoch": 0.22303, + "grad_norm": 1.0508090619487611, + "learning_rate": 0.003, + "loss": 3.9996, + "step": 22303 + }, + { + "epoch": 0.22304, + "grad_norm": 1.120638968298277, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 22304 + }, + { + "epoch": 0.22305, + "grad_norm": 0.9500244501742219, + "learning_rate": 0.003, + "loss": 4.048, + "step": 22305 + }, + { + "epoch": 0.22306, + "grad_norm": 0.9535745268901713, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 22306 + }, + { + "epoch": 0.22307, + "grad_norm": 1.0857787682830338, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 22307 + }, + { + "epoch": 0.22308, + "grad_norm": 0.9679996787972609, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 22308 + }, + { + "epoch": 0.22309, + "grad_norm": 1.186965179024597, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 22309 + }, + { + "epoch": 0.2231, + "grad_norm": 0.8625048381922663, + "learning_rate": 0.003, + "loss": 4.06, + "step": 22310 + }, + { + "epoch": 0.22311, + "grad_norm": 1.0154775627960273, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 22311 + }, + { + "epoch": 0.22312, + "grad_norm": 0.960274294556906, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 22312 + }, + { + "epoch": 0.22313, + "grad_norm": 0.7665702901480782, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 22313 + }, + { + "epoch": 0.22314, + "grad_norm": 0.6959927651351643, + "learning_rate": 0.003, + "loss": 4.045, + "step": 22314 + }, + { + "epoch": 0.22315, + "grad_norm": 0.6492558182809137, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 22315 + }, + { + "epoch": 0.22316, + "grad_norm": 0.6555061392730082, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 22316 + }, + { + "epoch": 0.22317, + "grad_norm": 0.7549157799869123, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 22317 + }, + { + "epoch": 0.22318, + "grad_norm": 1.0218088420765945, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 22318 + }, + { + "epoch": 0.22319, + "grad_norm": 1.2023068800123458, + "learning_rate": 0.003, + "loss": 4.074, + "step": 22319 + }, + { + "epoch": 0.2232, + "grad_norm": 0.8956242357097777, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22320 + }, + { + "epoch": 0.22321, + "grad_norm": 0.7921918487447402, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 22321 + }, + { + "epoch": 0.22322, + "grad_norm": 0.6628578418509272, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 22322 + }, + { + "epoch": 0.22323, + "grad_norm": 0.7319420594175866, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 22323 + }, + { + "epoch": 0.22324, + "grad_norm": 0.7298659933019586, + "learning_rate": 0.003, + "loss": 4.043, + "step": 22324 + }, + { + "epoch": 0.22325, + "grad_norm": 0.7176008055459516, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 22325 + }, + { + "epoch": 0.22326, + "grad_norm": 0.6626282350332566, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 22326 + }, + { + "epoch": 0.22327, + "grad_norm": 0.7111288543135935, + "learning_rate": 0.003, + "loss": 4.029, + "step": 22327 + }, + { + "epoch": 0.22328, + "grad_norm": 0.9463900575615173, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 22328 + }, + { + "epoch": 0.22329, + "grad_norm": 1.2562194997330671, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 22329 + }, + { + "epoch": 0.2233, + "grad_norm": 0.8636073504667185, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 22330 + }, + { + "epoch": 0.22331, + "grad_norm": 0.9206882770660257, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 22331 + }, + { + "epoch": 0.22332, + "grad_norm": 1.046592091410422, + "learning_rate": 0.003, + "loss": 4.047, + "step": 22332 + }, + { + "epoch": 0.22333, + "grad_norm": 1.0462322944588742, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 22333 + }, + { + "epoch": 0.22334, + "grad_norm": 0.9020284414609357, + "learning_rate": 0.003, + "loss": 4.0949, + "step": 22334 + }, + { + "epoch": 0.22335, + "grad_norm": 0.9947951804893863, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 22335 + }, + { + "epoch": 0.22336, + "grad_norm": 1.1289436311660437, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 22336 + }, + { + "epoch": 0.22337, + "grad_norm": 0.8530710652814224, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 22337 + }, + { + "epoch": 0.22338, + "grad_norm": 0.7814639217105689, + "learning_rate": 0.003, + "loss": 4.057, + "step": 22338 + }, + { + "epoch": 0.22339, + "grad_norm": 0.7705744082309481, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 22339 + }, + { + "epoch": 0.2234, + "grad_norm": 0.8193633326782319, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 22340 + }, + { + "epoch": 0.22341, + "grad_norm": 0.7376377219136407, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 22341 + }, + { + "epoch": 0.22342, + "grad_norm": 0.667837025005811, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 22342 + }, + { + "epoch": 0.22343, + "grad_norm": 0.8372182190133747, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22343 + }, + { + "epoch": 0.22344, + "grad_norm": 1.0620723750953345, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22344 + }, + { + "epoch": 0.22345, + "grad_norm": 1.1000061697785937, + "learning_rate": 0.003, + "loss": 4.054, + "step": 22345 + }, + { + "epoch": 0.22346, + "grad_norm": 0.803681570051572, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 22346 + }, + { + "epoch": 0.22347, + "grad_norm": 0.7678607546404127, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 22347 + }, + { + "epoch": 0.22348, + "grad_norm": 0.8277801165826847, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22348 + }, + { + "epoch": 0.22349, + "grad_norm": 0.826734993600233, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 22349 + }, + { + "epoch": 0.2235, + "grad_norm": 0.8141172163433259, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 22350 + }, + { + "epoch": 0.22351, + "grad_norm": 0.8259237527287515, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 22351 + }, + { + "epoch": 0.22352, + "grad_norm": 0.9044324309234641, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 22352 + }, + { + "epoch": 0.22353, + "grad_norm": 1.1403936295084518, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22353 + }, + { + "epoch": 0.22354, + "grad_norm": 1.1350006475764947, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 22354 + }, + { + "epoch": 0.22355, + "grad_norm": 0.9425527216357381, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 22355 + }, + { + "epoch": 0.22356, + "grad_norm": 1.1378720385781806, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 22356 + }, + { + "epoch": 0.22357, + "grad_norm": 1.2487405374216476, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 22357 + }, + { + "epoch": 0.22358, + "grad_norm": 0.9463478737352174, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 22358 + }, + { + "epoch": 0.22359, + "grad_norm": 1.0592010575135142, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 22359 + }, + { + "epoch": 0.2236, + "grad_norm": 1.035350133893251, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 22360 + }, + { + "epoch": 0.22361, + "grad_norm": 0.8917000145572558, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 22361 + }, + { + "epoch": 0.22362, + "grad_norm": 0.8603779596426748, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 22362 + }, + { + "epoch": 0.22363, + "grad_norm": 0.8644951247027882, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 22363 + }, + { + "epoch": 0.22364, + "grad_norm": 1.0618178088122887, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 22364 + }, + { + "epoch": 0.22365, + "grad_norm": 0.9393762913581301, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 22365 + }, + { + "epoch": 0.22366, + "grad_norm": 0.804479893219825, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 22366 + }, + { + "epoch": 0.22367, + "grad_norm": 0.771663889860158, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 22367 + }, + { + "epoch": 0.22368, + "grad_norm": 0.7346449402990187, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 22368 + }, + { + "epoch": 0.22369, + "grad_norm": 0.8186387414665576, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 22369 + }, + { + "epoch": 0.2237, + "grad_norm": 0.952177223972269, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 22370 + }, + { + "epoch": 0.22371, + "grad_norm": 1.359945589997042, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 22371 + }, + { + "epoch": 0.22372, + "grad_norm": 0.8012654112858634, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 22372 + }, + { + "epoch": 0.22373, + "grad_norm": 0.7356740837704214, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 22373 + }, + { + "epoch": 0.22374, + "grad_norm": 0.6949995479371482, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 22374 + }, + { + "epoch": 0.22375, + "grad_norm": 0.7597166674084919, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 22375 + }, + { + "epoch": 0.22376, + "grad_norm": 0.7710603796693849, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 22376 + }, + { + "epoch": 0.22377, + "grad_norm": 0.8474986295447465, + "learning_rate": 0.003, + "loss": 4.006, + "step": 22377 + }, + { + "epoch": 0.22378, + "grad_norm": 0.9429507591970833, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 22378 + }, + { + "epoch": 0.22379, + "grad_norm": 1.1603650634506242, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 22379 + }, + { + "epoch": 0.2238, + "grad_norm": 1.1322971081235735, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 22380 + }, + { + "epoch": 0.22381, + "grad_norm": 0.8954707528543132, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 22381 + }, + { + "epoch": 0.22382, + "grad_norm": 0.7590314448402322, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 22382 + }, + { + "epoch": 0.22383, + "grad_norm": 0.7082814535073892, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 22383 + }, + { + "epoch": 0.22384, + "grad_norm": 0.6869516842089984, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 22384 + }, + { + "epoch": 0.22385, + "grad_norm": 0.6634108021481424, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 22385 + }, + { + "epoch": 0.22386, + "grad_norm": 0.6235367990405143, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 22386 + }, + { + "epoch": 0.22387, + "grad_norm": 0.596753933968273, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 22387 + }, + { + "epoch": 0.22388, + "grad_norm": 0.6086483230944381, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 22388 + }, + { + "epoch": 0.22389, + "grad_norm": 0.6597716993030494, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 22389 + }, + { + "epoch": 0.2239, + "grad_norm": 0.9004788820746377, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 22390 + }, + { + "epoch": 0.22391, + "grad_norm": 1.199098777885385, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 22391 + }, + { + "epoch": 0.22392, + "grad_norm": 0.8767882068154063, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 22392 + }, + { + "epoch": 0.22393, + "grad_norm": 0.8482362624263489, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 22393 + }, + { + "epoch": 0.22394, + "grad_norm": 0.8405518808929641, + "learning_rate": 0.003, + "loss": 4.045, + "step": 22394 + }, + { + "epoch": 0.22395, + "grad_norm": 0.8317343879495549, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 22395 + }, + { + "epoch": 0.22396, + "grad_norm": 0.7761655127230493, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 22396 + }, + { + "epoch": 0.22397, + "grad_norm": 0.8056297916851944, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 22397 + }, + { + "epoch": 0.22398, + "grad_norm": 0.8890216468949055, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 22398 + }, + { + "epoch": 0.22399, + "grad_norm": 1.0340271312224014, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 22399 + }, + { + "epoch": 0.224, + "grad_norm": 1.1423192226735999, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 22400 + }, + { + "epoch": 0.22401, + "grad_norm": 0.9970181720749213, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 22401 + }, + { + "epoch": 0.22402, + "grad_norm": 0.9941213546991586, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 22402 + }, + { + "epoch": 0.22403, + "grad_norm": 1.0494083599481703, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 22403 + }, + { + "epoch": 0.22404, + "grad_norm": 0.9947243299248854, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22404 + }, + { + "epoch": 0.22405, + "grad_norm": 1.048932806997926, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 22405 + }, + { + "epoch": 0.22406, + "grad_norm": 0.9829655524326324, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 22406 + }, + { + "epoch": 0.22407, + "grad_norm": 0.983543152382307, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 22407 + }, + { + "epoch": 0.22408, + "grad_norm": 1.0295198385116437, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 22408 + }, + { + "epoch": 0.22409, + "grad_norm": 0.7879879786122067, + "learning_rate": 0.003, + "loss": 4.074, + "step": 22409 + }, + { + "epoch": 0.2241, + "grad_norm": 0.6920471352836073, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 22410 + }, + { + "epoch": 0.22411, + "grad_norm": 0.7335191785731368, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 22411 + }, + { + "epoch": 0.22412, + "grad_norm": 0.8874712535614557, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 22412 + }, + { + "epoch": 0.22413, + "grad_norm": 1.375623960449763, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 22413 + }, + { + "epoch": 0.22414, + "grad_norm": 0.7614122022734574, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 22414 + }, + { + "epoch": 0.22415, + "grad_norm": 0.6763293359616619, + "learning_rate": 0.003, + "loss": 4.024, + "step": 22415 + }, + { + "epoch": 0.22416, + "grad_norm": 0.7665652273534039, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 22416 + }, + { + "epoch": 0.22417, + "grad_norm": 0.8542468855922213, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 22417 + }, + { + "epoch": 0.22418, + "grad_norm": 1.0406223562079902, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 22418 + }, + { + "epoch": 0.22419, + "grad_norm": 0.9816528431228511, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22419 + }, + { + "epoch": 0.2242, + "grad_norm": 0.9367438140124987, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 22420 + }, + { + "epoch": 0.22421, + "grad_norm": 0.9813251260574493, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 22421 + }, + { + "epoch": 0.22422, + "grad_norm": 1.037303855142213, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 22422 + }, + { + "epoch": 0.22423, + "grad_norm": 1.2849847322169115, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 22423 + }, + { + "epoch": 0.22424, + "grad_norm": 0.9342386198093378, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 22424 + }, + { + "epoch": 0.22425, + "grad_norm": 0.885714215238561, + "learning_rate": 0.003, + "loss": 4.05, + "step": 22425 + }, + { + "epoch": 0.22426, + "grad_norm": 0.8882791322146617, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 22426 + }, + { + "epoch": 0.22427, + "grad_norm": 0.8551045797536737, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 22427 + }, + { + "epoch": 0.22428, + "grad_norm": 0.9051532030889349, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 22428 + }, + { + "epoch": 0.22429, + "grad_norm": 0.911895252037181, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 22429 + }, + { + "epoch": 0.2243, + "grad_norm": 0.9374602615606485, + "learning_rate": 0.003, + "loss": 4.05, + "step": 22430 + }, + { + "epoch": 0.22431, + "grad_norm": 1.0820237191025959, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 22431 + }, + { + "epoch": 0.22432, + "grad_norm": 1.0842745461727958, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 22432 + }, + { + "epoch": 0.22433, + "grad_norm": 1.0726745386657761, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 22433 + }, + { + "epoch": 0.22434, + "grad_norm": 1.2787279423021063, + "learning_rate": 0.003, + "loss": 4.064, + "step": 22434 + }, + { + "epoch": 0.22435, + "grad_norm": 0.9446789932747638, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 22435 + }, + { + "epoch": 0.22436, + "grad_norm": 0.8495633647217364, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22436 + }, + { + "epoch": 0.22437, + "grad_norm": 0.8257725090767902, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 22437 + }, + { + "epoch": 0.22438, + "grad_norm": 0.8330179977379383, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 22438 + }, + { + "epoch": 0.22439, + "grad_norm": 0.7369863711056917, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 22439 + }, + { + "epoch": 0.2244, + "grad_norm": 0.760668533826531, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 22440 + }, + { + "epoch": 0.22441, + "grad_norm": 0.7756009370562948, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 22441 + }, + { + "epoch": 0.22442, + "grad_norm": 0.8159686665261338, + "learning_rate": 0.003, + "loss": 4.077, + "step": 22442 + }, + { + "epoch": 0.22443, + "grad_norm": 0.7695657106284184, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 22443 + }, + { + "epoch": 0.22444, + "grad_norm": 0.767361157660122, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 22444 + }, + { + "epoch": 0.22445, + "grad_norm": 0.6751116306296152, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 22445 + }, + { + "epoch": 0.22446, + "grad_norm": 0.7111547860169535, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 22446 + }, + { + "epoch": 0.22447, + "grad_norm": 0.8027817595233157, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 22447 + }, + { + "epoch": 0.22448, + "grad_norm": 0.9619935572938082, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 22448 + }, + { + "epoch": 0.22449, + "grad_norm": 1.4128354503813663, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 22449 + }, + { + "epoch": 0.2245, + "grad_norm": 0.8199746721448757, + "learning_rate": 0.003, + "loss": 4.068, + "step": 22450 + }, + { + "epoch": 0.22451, + "grad_norm": 0.7376818926237061, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 22451 + }, + { + "epoch": 0.22452, + "grad_norm": 0.7225765552580602, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 22452 + }, + { + "epoch": 0.22453, + "grad_norm": 0.674240454017934, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 22453 + }, + { + "epoch": 0.22454, + "grad_norm": 0.597986516205863, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 22454 + }, + { + "epoch": 0.22455, + "grad_norm": 0.6657223122003431, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 22455 + }, + { + "epoch": 0.22456, + "grad_norm": 0.6998662127518855, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 22456 + }, + { + "epoch": 0.22457, + "grad_norm": 0.6879094052363666, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 22457 + }, + { + "epoch": 0.22458, + "grad_norm": 0.7387644489815635, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 22458 + }, + { + "epoch": 0.22459, + "grad_norm": 0.9012618934900345, + "learning_rate": 0.003, + "loss": 4.03, + "step": 22459 + }, + { + "epoch": 0.2246, + "grad_norm": 1.064742311103599, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 22460 + }, + { + "epoch": 0.22461, + "grad_norm": 1.0128085301764587, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 22461 + }, + { + "epoch": 0.22462, + "grad_norm": 0.9776856701081688, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 22462 + }, + { + "epoch": 0.22463, + "grad_norm": 1.0983449629603956, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 22463 + }, + { + "epoch": 0.22464, + "grad_norm": 1.050475174352711, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 22464 + }, + { + "epoch": 0.22465, + "grad_norm": 1.0195367633204806, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 22465 + }, + { + "epoch": 0.22466, + "grad_norm": 1.110525210294342, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 22466 + }, + { + "epoch": 0.22467, + "grad_norm": 0.9653978000317589, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 22467 + }, + { + "epoch": 0.22468, + "grad_norm": 0.8729096779729661, + "learning_rate": 0.003, + "loss": 4.053, + "step": 22468 + }, + { + "epoch": 0.22469, + "grad_norm": 0.8533826002763989, + "learning_rate": 0.003, + "loss": 4.084, + "step": 22469 + }, + { + "epoch": 0.2247, + "grad_norm": 0.8661519660377732, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 22470 + }, + { + "epoch": 0.22471, + "grad_norm": 0.9454466183737082, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 22471 + }, + { + "epoch": 0.22472, + "grad_norm": 0.9044980396172726, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 22472 + }, + { + "epoch": 0.22473, + "grad_norm": 0.7838565573244917, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 22473 + }, + { + "epoch": 0.22474, + "grad_norm": 0.7681634831293778, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 22474 + }, + { + "epoch": 0.22475, + "grad_norm": 0.8265672278491937, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 22475 + }, + { + "epoch": 0.22476, + "grad_norm": 0.9408747466032825, + "learning_rate": 0.003, + "loss": 4.071, + "step": 22476 + }, + { + "epoch": 0.22477, + "grad_norm": 1.147863715871065, + "learning_rate": 0.003, + "loss": 4.037, + "step": 22477 + }, + { + "epoch": 0.22478, + "grad_norm": 0.9002003248387239, + "learning_rate": 0.003, + "loss": 4.066, + "step": 22478 + }, + { + "epoch": 0.22479, + "grad_norm": 0.8873843965061498, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 22479 + }, + { + "epoch": 0.2248, + "grad_norm": 0.8883603408810261, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 22480 + }, + { + "epoch": 0.22481, + "grad_norm": 1.0005760126302268, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 22481 + }, + { + "epoch": 0.22482, + "grad_norm": 1.0165572050469156, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 22482 + }, + { + "epoch": 0.22483, + "grad_norm": 1.0239835058448845, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 22483 + }, + { + "epoch": 0.22484, + "grad_norm": 0.8460874881824254, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 22484 + }, + { + "epoch": 0.22485, + "grad_norm": 0.7975947538485714, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 22485 + }, + { + "epoch": 0.22486, + "grad_norm": 0.8657508696457858, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 22486 + }, + { + "epoch": 0.22487, + "grad_norm": 0.8371720375375046, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 22487 + }, + { + "epoch": 0.22488, + "grad_norm": 0.8910251004878182, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 22488 + }, + { + "epoch": 0.22489, + "grad_norm": 0.8426074338206573, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 22489 + }, + { + "epoch": 0.2249, + "grad_norm": 1.115577403561262, + "learning_rate": 0.003, + "loss": 4.07, + "step": 22490 + }, + { + "epoch": 0.22491, + "grad_norm": 1.0797138636103445, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 22491 + }, + { + "epoch": 0.22492, + "grad_norm": 1.0302809791529903, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 22492 + }, + { + "epoch": 0.22493, + "grad_norm": 1.100784740632589, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 22493 + }, + { + "epoch": 0.22494, + "grad_norm": 1.024358751639694, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 22494 + }, + { + "epoch": 0.22495, + "grad_norm": 1.10120163146534, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 22495 + }, + { + "epoch": 0.22496, + "grad_norm": 0.8278327030218051, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 22496 + }, + { + "epoch": 0.22497, + "grad_norm": 0.7788613330822212, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 22497 + }, + { + "epoch": 0.22498, + "grad_norm": 0.8247215002733669, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 22498 + }, + { + "epoch": 0.22499, + "grad_norm": 0.8253248770858604, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 22499 + }, + { + "epoch": 0.225, + "grad_norm": 0.6720663708044156, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 22500 + }, + { + "epoch": 0.22501, + "grad_norm": 0.6513400309322824, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 22501 + }, + { + "epoch": 0.22502, + "grad_norm": 0.6121646358825221, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 22502 + }, + { + "epoch": 0.22503, + "grad_norm": 0.574340405763607, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 22503 + }, + { + "epoch": 0.22504, + "grad_norm": 0.6170797344207057, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 22504 + }, + { + "epoch": 0.22505, + "grad_norm": 0.6806065791666323, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 22505 + }, + { + "epoch": 0.22506, + "grad_norm": 0.7932940420826946, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 22506 + }, + { + "epoch": 0.22507, + "grad_norm": 0.990946882618376, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 22507 + }, + { + "epoch": 0.22508, + "grad_norm": 1.0732940369211845, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 22508 + }, + { + "epoch": 0.22509, + "grad_norm": 0.9824475680713082, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 22509 + }, + { + "epoch": 0.2251, + "grad_norm": 0.8524620174114014, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 22510 + }, + { + "epoch": 0.22511, + "grad_norm": 0.8294394869232353, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 22511 + }, + { + "epoch": 0.22512, + "grad_norm": 0.8132779241261927, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 22512 + }, + { + "epoch": 0.22513, + "grad_norm": 0.8915216223813818, + "learning_rate": 0.003, + "loss": 4.04, + "step": 22513 + }, + { + "epoch": 0.22514, + "grad_norm": 0.9747971362160897, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 22514 + }, + { + "epoch": 0.22515, + "grad_norm": 1.2165438467058163, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 22515 + }, + { + "epoch": 0.22516, + "grad_norm": 0.8619242201440362, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 22516 + }, + { + "epoch": 0.22517, + "grad_norm": 0.8013797388295194, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 22517 + }, + { + "epoch": 0.22518, + "grad_norm": 0.745392620483749, + "learning_rate": 0.003, + "loss": 4.041, + "step": 22518 + }, + { + "epoch": 0.22519, + "grad_norm": 0.7247632398876357, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 22519 + }, + { + "epoch": 0.2252, + "grad_norm": 0.7190065003349567, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 22520 + }, + { + "epoch": 0.22521, + "grad_norm": 0.7683930864661954, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 22521 + }, + { + "epoch": 0.22522, + "grad_norm": 0.7574259775700574, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 22522 + }, + { + "epoch": 0.22523, + "grad_norm": 0.8059692805691367, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 22523 + }, + { + "epoch": 0.22524, + "grad_norm": 1.0352279462374505, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 22524 + }, + { + "epoch": 0.22525, + "grad_norm": 1.0199182312852217, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 22525 + }, + { + "epoch": 0.22526, + "grad_norm": 0.9635404966026307, + "learning_rate": 0.003, + "loss": 4.075, + "step": 22526 + }, + { + "epoch": 0.22527, + "grad_norm": 1.073452804157034, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 22527 + }, + { + "epoch": 0.22528, + "grad_norm": 1.057365274750992, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 22528 + }, + { + "epoch": 0.22529, + "grad_norm": 0.9664730460758946, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 22529 + }, + { + "epoch": 0.2253, + "grad_norm": 1.0122980328690805, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 22530 + }, + { + "epoch": 0.22531, + "grad_norm": 1.1736123434171162, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 22531 + }, + { + "epoch": 0.22532, + "grad_norm": 0.9629095351500622, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 22532 + }, + { + "epoch": 0.22533, + "grad_norm": 0.945575275575981, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 22533 + }, + { + "epoch": 0.22534, + "grad_norm": 0.9560840396590854, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 22534 + }, + { + "epoch": 0.22535, + "grad_norm": 0.957441548761198, + "learning_rate": 0.003, + "loss": 4.071, + "step": 22535 + }, + { + "epoch": 0.22536, + "grad_norm": 1.0119666166341508, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 22536 + }, + { + "epoch": 0.22537, + "grad_norm": 0.8912505217717467, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22537 + }, + { + "epoch": 0.22538, + "grad_norm": 0.9783019584446301, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 22538 + }, + { + "epoch": 0.22539, + "grad_norm": 0.9776628271835963, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 22539 + }, + { + "epoch": 0.2254, + "grad_norm": 0.9673462477124672, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 22540 + }, + { + "epoch": 0.22541, + "grad_norm": 0.8602254145094855, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 22541 + }, + { + "epoch": 0.22542, + "grad_norm": 0.8628526849546995, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 22542 + }, + { + "epoch": 0.22543, + "grad_norm": 0.7917171637841883, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 22543 + }, + { + "epoch": 0.22544, + "grad_norm": 0.7889094961787414, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 22544 + }, + { + "epoch": 0.22545, + "grad_norm": 0.8723715399170936, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 22545 + }, + { + "epoch": 0.22546, + "grad_norm": 1.1398832572415534, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 22546 + }, + { + "epoch": 0.22547, + "grad_norm": 0.9842249532223255, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 22547 + }, + { + "epoch": 0.22548, + "grad_norm": 0.9505917507656427, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22548 + }, + { + "epoch": 0.22549, + "grad_norm": 0.9464247013152843, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 22549 + }, + { + "epoch": 0.2255, + "grad_norm": 0.8695449853537233, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 22550 + }, + { + "epoch": 0.22551, + "grad_norm": 0.931958727355571, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 22551 + }, + { + "epoch": 0.22552, + "grad_norm": 1.0083673884591295, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 22552 + }, + { + "epoch": 0.22553, + "grad_norm": 1.0028389181120434, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 22553 + }, + { + "epoch": 0.22554, + "grad_norm": 1.1772338924028771, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 22554 + }, + { + "epoch": 0.22555, + "grad_norm": 0.965199777169094, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 22555 + }, + { + "epoch": 0.22556, + "grad_norm": 0.9389978371710452, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22556 + }, + { + "epoch": 0.22557, + "grad_norm": 0.9581433870462375, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 22557 + }, + { + "epoch": 0.22558, + "grad_norm": 1.0140599141129432, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 22558 + }, + { + "epoch": 0.22559, + "grad_norm": 0.8604015323152806, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 22559 + }, + { + "epoch": 0.2256, + "grad_norm": 0.8166746744255023, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 22560 + }, + { + "epoch": 0.22561, + "grad_norm": 0.8587403428794941, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 22561 + }, + { + "epoch": 0.22562, + "grad_norm": 0.8776156589332852, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 22562 + }, + { + "epoch": 0.22563, + "grad_norm": 0.9450824447850328, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 22563 + }, + { + "epoch": 0.22564, + "grad_norm": 0.9561883635352979, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 22564 + }, + { + "epoch": 0.22565, + "grad_norm": 0.9882176140159471, + "learning_rate": 0.003, + "loss": 4.0948, + "step": 22565 + }, + { + "epoch": 0.22566, + "grad_norm": 0.8135825613386174, + "learning_rate": 0.003, + "loss": 4.06, + "step": 22566 + }, + { + "epoch": 0.22567, + "grad_norm": 0.7347851553409692, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 22567 + }, + { + "epoch": 0.22568, + "grad_norm": 0.690799113498159, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 22568 + }, + { + "epoch": 0.22569, + "grad_norm": 0.7162221793077578, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 22569 + }, + { + "epoch": 0.2257, + "grad_norm": 0.6806179485820844, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 22570 + }, + { + "epoch": 0.22571, + "grad_norm": 0.7216893418779671, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 22571 + }, + { + "epoch": 0.22572, + "grad_norm": 0.9024696562772438, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 22572 + }, + { + "epoch": 0.22573, + "grad_norm": 1.2127673296201573, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 22573 + }, + { + "epoch": 0.22574, + "grad_norm": 0.8479077283209163, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 22574 + }, + { + "epoch": 0.22575, + "grad_norm": 0.9083264485051574, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 22575 + }, + { + "epoch": 0.22576, + "grad_norm": 0.9839362927703458, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 22576 + }, + { + "epoch": 0.22577, + "grad_norm": 0.9682398027742857, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 22577 + }, + { + "epoch": 0.22578, + "grad_norm": 0.785383788338004, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 22578 + }, + { + "epoch": 0.22579, + "grad_norm": 0.773265432136049, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 22579 + }, + { + "epoch": 0.2258, + "grad_norm": 0.8374900736427304, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 22580 + }, + { + "epoch": 0.22581, + "grad_norm": 1.1235541656684023, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 22581 + }, + { + "epoch": 0.22582, + "grad_norm": 0.9878613728351212, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22582 + }, + { + "epoch": 0.22583, + "grad_norm": 0.9097155948238201, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22583 + }, + { + "epoch": 0.22584, + "grad_norm": 0.7342963290879935, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 22584 + }, + { + "epoch": 0.22585, + "grad_norm": 0.7310469446253919, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 22585 + }, + { + "epoch": 0.22586, + "grad_norm": 0.6564935680284889, + "learning_rate": 0.003, + "loss": 4.064, + "step": 22586 + }, + { + "epoch": 0.22587, + "grad_norm": 0.5752614337517873, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 22587 + }, + { + "epoch": 0.22588, + "grad_norm": 0.6277696751087115, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 22588 + }, + { + "epoch": 0.22589, + "grad_norm": 0.647207836677466, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 22589 + }, + { + "epoch": 0.2259, + "grad_norm": 0.6195436317102999, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 22590 + }, + { + "epoch": 0.22591, + "grad_norm": 0.6230322544870034, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 22591 + }, + { + "epoch": 0.22592, + "grad_norm": 0.8071873172682745, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 22592 + }, + { + "epoch": 0.22593, + "grad_norm": 1.1438259845801872, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 22593 + }, + { + "epoch": 0.22594, + "grad_norm": 1.1416447741204918, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 22594 + }, + { + "epoch": 0.22595, + "grad_norm": 0.6809705937637491, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 22595 + }, + { + "epoch": 0.22596, + "grad_norm": 0.6218388583904539, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 22596 + }, + { + "epoch": 0.22597, + "grad_norm": 0.632011795915333, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 22597 + }, + { + "epoch": 0.22598, + "grad_norm": 0.6833971399491734, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 22598 + }, + { + "epoch": 0.22599, + "grad_norm": 1.047985364137886, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 22599 + }, + { + "epoch": 0.226, + "grad_norm": 1.2289694623537284, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 22600 + }, + { + "epoch": 0.22601, + "grad_norm": 0.7802878888261349, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 22601 + }, + { + "epoch": 0.22602, + "grad_norm": 0.7654255100307006, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 22602 + }, + { + "epoch": 0.22603, + "grad_norm": 0.7742124186013298, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 22603 + }, + { + "epoch": 0.22604, + "grad_norm": 1.0025761483727609, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 22604 + }, + { + "epoch": 0.22605, + "grad_norm": 1.4886545270918778, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 22605 + }, + { + "epoch": 0.22606, + "grad_norm": 0.6736633516094532, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 22606 + }, + { + "epoch": 0.22607, + "grad_norm": 0.8297948904732105, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 22607 + }, + { + "epoch": 0.22608, + "grad_norm": 0.9367644543220012, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 22608 + }, + { + "epoch": 0.22609, + "grad_norm": 1.1171275809050112, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 22609 + }, + { + "epoch": 0.2261, + "grad_norm": 0.9522670237318388, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 22610 + }, + { + "epoch": 0.22611, + "grad_norm": 0.9296105377518948, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 22611 + }, + { + "epoch": 0.22612, + "grad_norm": 1.0450105775301994, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 22612 + }, + { + "epoch": 0.22613, + "grad_norm": 0.9261372014122379, + "learning_rate": 0.003, + "loss": 4.0933, + "step": 22613 + }, + { + "epoch": 0.22614, + "grad_norm": 0.9902630454626118, + "learning_rate": 0.003, + "loss": 4.069, + "step": 22614 + }, + { + "epoch": 0.22615, + "grad_norm": 0.977385652722921, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 22615 + }, + { + "epoch": 0.22616, + "grad_norm": 0.9091130785402171, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 22616 + }, + { + "epoch": 0.22617, + "grad_norm": 1.0163270259028543, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 22617 + }, + { + "epoch": 0.22618, + "grad_norm": 1.2031937925341383, + "learning_rate": 0.003, + "loss": 4.0962, + "step": 22618 + }, + { + "epoch": 0.22619, + "grad_norm": 1.0041494756507696, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 22619 + }, + { + "epoch": 0.2262, + "grad_norm": 0.9619042395614545, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 22620 + }, + { + "epoch": 0.22621, + "grad_norm": 0.9129350339396065, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 22621 + }, + { + "epoch": 0.22622, + "grad_norm": 1.0095613330379023, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 22622 + }, + { + "epoch": 0.22623, + "grad_norm": 1.1949991306262393, + "learning_rate": 0.003, + "loss": 4.051, + "step": 22623 + }, + { + "epoch": 0.22624, + "grad_norm": 0.8963117514310439, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 22624 + }, + { + "epoch": 0.22625, + "grad_norm": 1.0232834739506285, + "learning_rate": 0.003, + "loss": 4.066, + "step": 22625 + }, + { + "epoch": 0.22626, + "grad_norm": 0.911278980625608, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 22626 + }, + { + "epoch": 0.22627, + "grad_norm": 0.9805460639855323, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 22627 + }, + { + "epoch": 0.22628, + "grad_norm": 0.9053700926917654, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 22628 + }, + { + "epoch": 0.22629, + "grad_norm": 0.8838639267749948, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 22629 + }, + { + "epoch": 0.2263, + "grad_norm": 0.9692285526666109, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 22630 + }, + { + "epoch": 0.22631, + "grad_norm": 1.0028287599238581, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 22631 + }, + { + "epoch": 0.22632, + "grad_norm": 0.9011479773239284, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 22632 + }, + { + "epoch": 0.22633, + "grad_norm": 0.8612664806624377, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 22633 + }, + { + "epoch": 0.22634, + "grad_norm": 0.750262113745714, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 22634 + }, + { + "epoch": 0.22635, + "grad_norm": 0.743952291464596, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 22635 + }, + { + "epoch": 0.22636, + "grad_norm": 0.808419510931088, + "learning_rate": 0.003, + "loss": 4.054, + "step": 22636 + }, + { + "epoch": 0.22637, + "grad_norm": 0.7620130853712486, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 22637 + }, + { + "epoch": 0.22638, + "grad_norm": 0.694338324513765, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 22638 + }, + { + "epoch": 0.22639, + "grad_norm": 0.7550333084737811, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 22639 + }, + { + "epoch": 0.2264, + "grad_norm": 0.8525863573766842, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 22640 + }, + { + "epoch": 0.22641, + "grad_norm": 1.0062752351595612, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 22641 + }, + { + "epoch": 0.22642, + "grad_norm": 1.0537390949528482, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 22642 + }, + { + "epoch": 0.22643, + "grad_norm": 1.0025652451117242, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 22643 + }, + { + "epoch": 0.22644, + "grad_norm": 0.96830712391237, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 22644 + }, + { + "epoch": 0.22645, + "grad_norm": 1.034363807063051, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 22645 + }, + { + "epoch": 0.22646, + "grad_norm": 1.1990352588546782, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 22646 + }, + { + "epoch": 0.22647, + "grad_norm": 1.0459371360253151, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 22647 + }, + { + "epoch": 0.22648, + "grad_norm": 1.0276827455692468, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 22648 + }, + { + "epoch": 0.22649, + "grad_norm": 0.9796795019469517, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 22649 + }, + { + "epoch": 0.2265, + "grad_norm": 0.8818432232629168, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 22650 + }, + { + "epoch": 0.22651, + "grad_norm": 0.628120121846171, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 22651 + }, + { + "epoch": 0.22652, + "grad_norm": 0.6145968841593442, + "learning_rate": 0.003, + "loss": 4.065, + "step": 22652 + }, + { + "epoch": 0.22653, + "grad_norm": 0.720373343094132, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 22653 + }, + { + "epoch": 0.22654, + "grad_norm": 0.7499659342626266, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 22654 + }, + { + "epoch": 0.22655, + "grad_norm": 0.7234087522757889, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 22655 + }, + { + "epoch": 0.22656, + "grad_norm": 0.7545664706458002, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 22656 + }, + { + "epoch": 0.22657, + "grad_norm": 0.900128869164829, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 22657 + }, + { + "epoch": 0.22658, + "grad_norm": 0.9276287783009832, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 22658 + }, + { + "epoch": 0.22659, + "grad_norm": 0.8283135664338659, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 22659 + }, + { + "epoch": 0.2266, + "grad_norm": 0.8789792512460451, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22660 + }, + { + "epoch": 0.22661, + "grad_norm": 1.0052539515852346, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 22661 + }, + { + "epoch": 0.22662, + "grad_norm": 1.080767698072948, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 22662 + }, + { + "epoch": 0.22663, + "grad_norm": 0.8611203467525144, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 22663 + }, + { + "epoch": 0.22664, + "grad_norm": 0.9449416565356623, + "learning_rate": 0.003, + "loss": 4.06, + "step": 22664 + }, + { + "epoch": 0.22665, + "grad_norm": 0.971622178537138, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 22665 + }, + { + "epoch": 0.22666, + "grad_norm": 1.0356852569858708, + "learning_rate": 0.003, + "loss": 4.08, + "step": 22666 + }, + { + "epoch": 0.22667, + "grad_norm": 1.007553276766393, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 22667 + }, + { + "epoch": 0.22668, + "grad_norm": 0.9174707531427051, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 22668 + }, + { + "epoch": 0.22669, + "grad_norm": 0.8425523705853986, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 22669 + }, + { + "epoch": 0.2267, + "grad_norm": 0.7941694785805898, + "learning_rate": 0.003, + "loss": 4.048, + "step": 22670 + }, + { + "epoch": 0.22671, + "grad_norm": 0.7749838193621441, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 22671 + }, + { + "epoch": 0.22672, + "grad_norm": 0.7918445039759081, + "learning_rate": 0.003, + "loss": 4.056, + "step": 22672 + }, + { + "epoch": 0.22673, + "grad_norm": 0.8646885110451289, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 22673 + }, + { + "epoch": 0.22674, + "grad_norm": 0.8345245611150793, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 22674 + }, + { + "epoch": 0.22675, + "grad_norm": 0.7975428682059714, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 22675 + }, + { + "epoch": 0.22676, + "grad_norm": 0.9170779721343335, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 22676 + }, + { + "epoch": 0.22677, + "grad_norm": 1.0018992931708357, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 22677 + }, + { + "epoch": 0.22678, + "grad_norm": 0.9259278629586358, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22678 + }, + { + "epoch": 0.22679, + "grad_norm": 0.9581009692431548, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 22679 + }, + { + "epoch": 0.2268, + "grad_norm": 1.1510095775516627, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 22680 + }, + { + "epoch": 0.22681, + "grad_norm": 0.9866108478374495, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 22681 + }, + { + "epoch": 0.22682, + "grad_norm": 0.9830469804042229, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 22682 + }, + { + "epoch": 0.22683, + "grad_norm": 1.0039065982777216, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 22683 + }, + { + "epoch": 0.22684, + "grad_norm": 0.9832268895434356, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 22684 + }, + { + "epoch": 0.22685, + "grad_norm": 0.947447633492396, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22685 + }, + { + "epoch": 0.22686, + "grad_norm": 0.8837451486182951, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 22686 + }, + { + "epoch": 0.22687, + "grad_norm": 0.8565653017539905, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 22687 + }, + { + "epoch": 0.22688, + "grad_norm": 1.0104639017512123, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 22688 + }, + { + "epoch": 0.22689, + "grad_norm": 0.975114170922442, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 22689 + }, + { + "epoch": 0.2269, + "grad_norm": 0.8449339228154968, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 22690 + }, + { + "epoch": 0.22691, + "grad_norm": 0.8151313485000596, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 22691 + }, + { + "epoch": 0.22692, + "grad_norm": 0.8162244058539979, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 22692 + }, + { + "epoch": 0.22693, + "grad_norm": 0.7488576404896917, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 22693 + }, + { + "epoch": 0.22694, + "grad_norm": 0.7515467946755654, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 22694 + }, + { + "epoch": 0.22695, + "grad_norm": 0.7440309682747194, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 22695 + }, + { + "epoch": 0.22696, + "grad_norm": 0.6645082399531839, + "learning_rate": 0.003, + "loss": 4.046, + "step": 22696 + }, + { + "epoch": 0.22697, + "grad_norm": 0.6656852626486821, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 22697 + }, + { + "epoch": 0.22698, + "grad_norm": 0.5735199010982336, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 22698 + }, + { + "epoch": 0.22699, + "grad_norm": 0.6224573279349771, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 22699 + }, + { + "epoch": 0.227, + "grad_norm": 0.6333014785989532, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 22700 + }, + { + "epoch": 0.22701, + "grad_norm": 0.6920123584011759, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 22701 + }, + { + "epoch": 0.22702, + "grad_norm": 0.747329756271678, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 22702 + }, + { + "epoch": 0.22703, + "grad_norm": 0.7110852928129989, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 22703 + }, + { + "epoch": 0.22704, + "grad_norm": 0.7165391520100443, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 22704 + }, + { + "epoch": 0.22705, + "grad_norm": 0.7540745466164605, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 22705 + }, + { + "epoch": 0.22706, + "grad_norm": 0.8697672264851807, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 22706 + }, + { + "epoch": 0.22707, + "grad_norm": 1.183111399923685, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 22707 + }, + { + "epoch": 0.22708, + "grad_norm": 1.1491430180039552, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 22708 + }, + { + "epoch": 0.22709, + "grad_norm": 0.972757390350557, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 22709 + }, + { + "epoch": 0.2271, + "grad_norm": 0.968494013863733, + "learning_rate": 0.003, + "loss": 4.04, + "step": 22710 + }, + { + "epoch": 0.22711, + "grad_norm": 1.0520071655171688, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22711 + }, + { + "epoch": 0.22712, + "grad_norm": 0.8910486609978872, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 22712 + }, + { + "epoch": 0.22713, + "grad_norm": 0.7945312930938495, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 22713 + }, + { + "epoch": 0.22714, + "grad_norm": 0.7182497839785165, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 22714 + }, + { + "epoch": 0.22715, + "grad_norm": 0.7069488420876577, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 22715 + }, + { + "epoch": 0.22716, + "grad_norm": 0.7980211387490869, + "learning_rate": 0.003, + "loss": 4.044, + "step": 22716 + }, + { + "epoch": 0.22717, + "grad_norm": 0.9048879610739127, + "learning_rate": 0.003, + "loss": 4.077, + "step": 22717 + }, + { + "epoch": 0.22718, + "grad_norm": 0.8602498474925933, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 22718 + }, + { + "epoch": 0.22719, + "grad_norm": 0.8482471930463336, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 22719 + }, + { + "epoch": 0.2272, + "grad_norm": 0.8133784578779553, + "learning_rate": 0.003, + "loss": 4.058, + "step": 22720 + }, + { + "epoch": 0.22721, + "grad_norm": 0.8104731653097466, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 22721 + }, + { + "epoch": 0.22722, + "grad_norm": 0.8118707964996753, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 22722 + }, + { + "epoch": 0.22723, + "grad_norm": 0.8861620894199845, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 22723 + }, + { + "epoch": 0.22724, + "grad_norm": 1.079175913332311, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 22724 + }, + { + "epoch": 0.22725, + "grad_norm": 1.2562039862360328, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 22725 + }, + { + "epoch": 0.22726, + "grad_norm": 0.8492751766680512, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 22726 + }, + { + "epoch": 0.22727, + "grad_norm": 0.904075217164488, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 22727 + }, + { + "epoch": 0.22728, + "grad_norm": 0.8414714997922506, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22728 + }, + { + "epoch": 0.22729, + "grad_norm": 0.906962799489872, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 22729 + }, + { + "epoch": 0.2273, + "grad_norm": 0.9967874821560386, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 22730 + }, + { + "epoch": 0.22731, + "grad_norm": 1.1586205386995512, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22731 + }, + { + "epoch": 0.22732, + "grad_norm": 1.0489328409640646, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 22732 + }, + { + "epoch": 0.22733, + "grad_norm": 0.9610063968691808, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22733 + }, + { + "epoch": 0.22734, + "grad_norm": 0.909430450755612, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 22734 + }, + { + "epoch": 0.22735, + "grad_norm": 0.9897769424042009, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 22735 + }, + { + "epoch": 0.22736, + "grad_norm": 0.970753056237132, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 22736 + }, + { + "epoch": 0.22737, + "grad_norm": 1.0260003649404208, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 22737 + }, + { + "epoch": 0.22738, + "grad_norm": 1.1877661369093184, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 22738 + }, + { + "epoch": 0.22739, + "grad_norm": 0.9347650162164082, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 22739 + }, + { + "epoch": 0.2274, + "grad_norm": 0.9393559041480698, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 22740 + }, + { + "epoch": 0.22741, + "grad_norm": 0.9967223469859002, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 22741 + }, + { + "epoch": 0.22742, + "grad_norm": 0.9272640767979677, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 22742 + }, + { + "epoch": 0.22743, + "grad_norm": 0.8808696694984538, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 22743 + }, + { + "epoch": 0.22744, + "grad_norm": 0.8754452913224413, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 22744 + }, + { + "epoch": 0.22745, + "grad_norm": 0.7880741554440877, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 22745 + }, + { + "epoch": 0.22746, + "grad_norm": 0.7523070895746822, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 22746 + }, + { + "epoch": 0.22747, + "grad_norm": 0.6716616970521917, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 22747 + }, + { + "epoch": 0.22748, + "grad_norm": 0.6078743567779584, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 22748 + }, + { + "epoch": 0.22749, + "grad_norm": 0.7269726867285148, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 22749 + }, + { + "epoch": 0.2275, + "grad_norm": 0.9344885317325508, + "learning_rate": 0.003, + "loss": 4.046, + "step": 22750 + }, + { + "epoch": 0.22751, + "grad_norm": 1.1503680403094034, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22751 + }, + { + "epoch": 0.22752, + "grad_norm": 0.8700409231841962, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 22752 + }, + { + "epoch": 0.22753, + "grad_norm": 0.9502787901091739, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 22753 + }, + { + "epoch": 0.22754, + "grad_norm": 1.2170716227290153, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 22754 + }, + { + "epoch": 0.22755, + "grad_norm": 1.001586415329349, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 22755 + }, + { + "epoch": 0.22756, + "grad_norm": 0.9073302937086049, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 22756 + }, + { + "epoch": 0.22757, + "grad_norm": 0.8481127690300688, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 22757 + }, + { + "epoch": 0.22758, + "grad_norm": 0.9188490433164043, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 22758 + }, + { + "epoch": 0.22759, + "grad_norm": 0.862118745179981, + "learning_rate": 0.003, + "loss": 4.044, + "step": 22759 + }, + { + "epoch": 0.2276, + "grad_norm": 0.8769824496909433, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 22760 + }, + { + "epoch": 0.22761, + "grad_norm": 0.9013392694072248, + "learning_rate": 0.003, + "loss": 4.064, + "step": 22761 + }, + { + "epoch": 0.22762, + "grad_norm": 0.9309138302753703, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 22762 + }, + { + "epoch": 0.22763, + "grad_norm": 0.949413238124462, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 22763 + }, + { + "epoch": 0.22764, + "grad_norm": 0.9512018871828793, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 22764 + }, + { + "epoch": 0.22765, + "grad_norm": 0.8689322660909483, + "learning_rate": 0.003, + "loss": 4.04, + "step": 22765 + }, + { + "epoch": 0.22766, + "grad_norm": 0.8444804983371068, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 22766 + }, + { + "epoch": 0.22767, + "grad_norm": 1.080413165072527, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 22767 + }, + { + "epoch": 0.22768, + "grad_norm": 0.9088416420914456, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 22768 + }, + { + "epoch": 0.22769, + "grad_norm": 0.799525837915491, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 22769 + }, + { + "epoch": 0.2277, + "grad_norm": 0.7699723538144105, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 22770 + }, + { + "epoch": 0.22771, + "grad_norm": 0.8143142554392547, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 22771 + }, + { + "epoch": 0.22772, + "grad_norm": 0.9274083261025904, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 22772 + }, + { + "epoch": 0.22773, + "grad_norm": 0.984238176883308, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 22773 + }, + { + "epoch": 0.22774, + "grad_norm": 0.9554511641495651, + "learning_rate": 0.003, + "loss": 4.029, + "step": 22774 + }, + { + "epoch": 0.22775, + "grad_norm": 0.7928942084926369, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 22775 + }, + { + "epoch": 0.22776, + "grad_norm": 0.7207440721741104, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 22776 + }, + { + "epoch": 0.22777, + "grad_norm": 0.6623191295748422, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 22777 + }, + { + "epoch": 0.22778, + "grad_norm": 0.5898935569872705, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 22778 + }, + { + "epoch": 0.22779, + "grad_norm": 0.608286460676412, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 22779 + }, + { + "epoch": 0.2278, + "grad_norm": 0.7065719510964161, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 22780 + }, + { + "epoch": 0.22781, + "grad_norm": 0.8759375015149959, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 22781 + }, + { + "epoch": 0.22782, + "grad_norm": 1.0701114191785137, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 22782 + }, + { + "epoch": 0.22783, + "grad_norm": 1.0109689899197118, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 22783 + }, + { + "epoch": 0.22784, + "grad_norm": 1.1001918809786848, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 22784 + }, + { + "epoch": 0.22785, + "grad_norm": 1.0651519419042537, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22785 + }, + { + "epoch": 0.22786, + "grad_norm": 0.8809097220494033, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 22786 + }, + { + "epoch": 0.22787, + "grad_norm": 0.6721397554264887, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 22787 + }, + { + "epoch": 0.22788, + "grad_norm": 0.5252983629838026, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 22788 + }, + { + "epoch": 0.22789, + "grad_norm": 0.6071835354482025, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 22789 + }, + { + "epoch": 0.2279, + "grad_norm": 0.7097765682495768, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 22790 + }, + { + "epoch": 0.22791, + "grad_norm": 0.8122285027012834, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 22791 + }, + { + "epoch": 0.22792, + "grad_norm": 0.8741104437717017, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 22792 + }, + { + "epoch": 0.22793, + "grad_norm": 0.7546144575435845, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 22793 + }, + { + "epoch": 0.22794, + "grad_norm": 0.8617384188138572, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 22794 + }, + { + "epoch": 0.22795, + "grad_norm": 1.0403403332749075, + "learning_rate": 0.003, + "loss": 4.106, + "step": 22795 + }, + { + "epoch": 0.22796, + "grad_norm": 1.1477340056332108, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 22796 + }, + { + "epoch": 0.22797, + "grad_norm": 0.9891245123643607, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 22797 + }, + { + "epoch": 0.22798, + "grad_norm": 1.0551842642330131, + "learning_rate": 0.003, + "loss": 4.074, + "step": 22798 + }, + { + "epoch": 0.22799, + "grad_norm": 1.0552942759567618, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 22799 + }, + { + "epoch": 0.228, + "grad_norm": 1.009389497514096, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 22800 + }, + { + "epoch": 0.22801, + "grad_norm": 1.1548037553780226, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 22801 + }, + { + "epoch": 0.22802, + "grad_norm": 0.843224546999332, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 22802 + }, + { + "epoch": 0.22803, + "grad_norm": 0.8061515141629308, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 22803 + }, + { + "epoch": 0.22804, + "grad_norm": 0.7573372806484121, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22804 + }, + { + "epoch": 0.22805, + "grad_norm": 0.6828518090113931, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 22805 + }, + { + "epoch": 0.22806, + "grad_norm": 0.666160973032393, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 22806 + }, + { + "epoch": 0.22807, + "grad_norm": 0.6534853359153482, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 22807 + }, + { + "epoch": 0.22808, + "grad_norm": 0.694400653267406, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22808 + }, + { + "epoch": 0.22809, + "grad_norm": 0.6626020941319344, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 22809 + }, + { + "epoch": 0.2281, + "grad_norm": 0.5981677241867265, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 22810 + }, + { + "epoch": 0.22811, + "grad_norm": 0.6691449669962832, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 22811 + }, + { + "epoch": 0.22812, + "grad_norm": 0.772816656462761, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 22812 + }, + { + "epoch": 0.22813, + "grad_norm": 0.9377318248938178, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 22813 + }, + { + "epoch": 0.22814, + "grad_norm": 1.2412529653321838, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 22814 + }, + { + "epoch": 0.22815, + "grad_norm": 0.749202066133807, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 22815 + }, + { + "epoch": 0.22816, + "grad_norm": 0.7501119635213498, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 22816 + }, + { + "epoch": 0.22817, + "grad_norm": 0.7675679163994044, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 22817 + }, + { + "epoch": 0.22818, + "grad_norm": 0.8433282719370441, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 22818 + }, + { + "epoch": 0.22819, + "grad_norm": 0.941368095384892, + "learning_rate": 0.003, + "loss": 4.034, + "step": 22819 + }, + { + "epoch": 0.2282, + "grad_norm": 1.1336983836760979, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 22820 + }, + { + "epoch": 0.22821, + "grad_norm": 1.0208848351092374, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 22821 + }, + { + "epoch": 0.22822, + "grad_norm": 1.1913258834969345, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 22822 + }, + { + "epoch": 0.22823, + "grad_norm": 0.8920054317702796, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 22823 + }, + { + "epoch": 0.22824, + "grad_norm": 0.8388889322992132, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 22824 + }, + { + "epoch": 0.22825, + "grad_norm": 0.8141244000835429, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 22825 + }, + { + "epoch": 0.22826, + "grad_norm": 0.8139046223127696, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 22826 + }, + { + "epoch": 0.22827, + "grad_norm": 0.9174284630998599, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 22827 + }, + { + "epoch": 0.22828, + "grad_norm": 0.8838765846096129, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 22828 + }, + { + "epoch": 0.22829, + "grad_norm": 0.8461518621141414, + "learning_rate": 0.003, + "loss": 4.027, + "step": 22829 + }, + { + "epoch": 0.2283, + "grad_norm": 0.8930878887878086, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 22830 + }, + { + "epoch": 0.22831, + "grad_norm": 1.1795609722644327, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 22831 + }, + { + "epoch": 0.22832, + "grad_norm": 0.9880840005024115, + "learning_rate": 0.003, + "loss": 4.061, + "step": 22832 + }, + { + "epoch": 0.22833, + "grad_norm": 0.8639869942846465, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 22833 + }, + { + "epoch": 0.22834, + "grad_norm": 0.9211993329776144, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 22834 + }, + { + "epoch": 0.22835, + "grad_norm": 1.0468431059164391, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 22835 + }, + { + "epoch": 0.22836, + "grad_norm": 1.245325142310674, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 22836 + }, + { + "epoch": 0.22837, + "grad_norm": 0.8997614990021262, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 22837 + }, + { + "epoch": 0.22838, + "grad_norm": 0.7223012100017461, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 22838 + }, + { + "epoch": 0.22839, + "grad_norm": 0.7114146903458733, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 22839 + }, + { + "epoch": 0.2284, + "grad_norm": 0.7968502569647606, + "learning_rate": 0.003, + "loss": 4.053, + "step": 22840 + }, + { + "epoch": 0.22841, + "grad_norm": 1.0252895278679306, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 22841 + }, + { + "epoch": 0.22842, + "grad_norm": 1.1009154227110782, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 22842 + }, + { + "epoch": 0.22843, + "grad_norm": 0.9348037932749858, + "learning_rate": 0.003, + "loss": 4.062, + "step": 22843 + }, + { + "epoch": 0.22844, + "grad_norm": 1.0187018353456725, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 22844 + }, + { + "epoch": 0.22845, + "grad_norm": 1.0757222856196504, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 22845 + }, + { + "epoch": 0.22846, + "grad_norm": 0.9321190629288321, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 22846 + }, + { + "epoch": 0.22847, + "grad_norm": 1.1090680297543973, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 22847 + }, + { + "epoch": 0.22848, + "grad_norm": 1.0260494042912045, + "learning_rate": 0.003, + "loss": 4.0987, + "step": 22848 + }, + { + "epoch": 0.22849, + "grad_norm": 1.0234148870836144, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 22849 + }, + { + "epoch": 0.2285, + "grad_norm": 1.0063410273604398, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 22850 + }, + { + "epoch": 0.22851, + "grad_norm": 0.9518465127345284, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 22851 + }, + { + "epoch": 0.22852, + "grad_norm": 0.8896919039899941, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 22852 + }, + { + "epoch": 0.22853, + "grad_norm": 0.970127617852437, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 22853 + }, + { + "epoch": 0.22854, + "grad_norm": 1.1040297130814274, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 22854 + }, + { + "epoch": 0.22855, + "grad_norm": 1.1272146921961455, + "learning_rate": 0.003, + "loss": 4.098, + "step": 22855 + }, + { + "epoch": 0.22856, + "grad_norm": 0.9025382692018051, + "learning_rate": 0.003, + "loss": 4.0982, + "step": 22856 + }, + { + "epoch": 0.22857, + "grad_norm": 0.9756305715682522, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 22857 + }, + { + "epoch": 0.22858, + "grad_norm": 1.1889158222663803, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 22858 + }, + { + "epoch": 0.22859, + "grad_norm": 0.8569899154104883, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 22859 + }, + { + "epoch": 0.2286, + "grad_norm": 0.858128212011132, + "learning_rate": 0.003, + "loss": 4.0876, + "step": 22860 + }, + { + "epoch": 0.22861, + "grad_norm": 0.8459227504132819, + "learning_rate": 0.003, + "loss": 4.042, + "step": 22861 + }, + { + "epoch": 0.22862, + "grad_norm": 0.7765843307811464, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 22862 + }, + { + "epoch": 0.22863, + "grad_norm": 0.7706817304234865, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 22863 + }, + { + "epoch": 0.22864, + "grad_norm": 0.7255600341445928, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 22864 + }, + { + "epoch": 0.22865, + "grad_norm": 0.6813917713241208, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 22865 + }, + { + "epoch": 0.22866, + "grad_norm": 0.5738037421381981, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 22866 + }, + { + "epoch": 0.22867, + "grad_norm": 0.5507813134036691, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 22867 + }, + { + "epoch": 0.22868, + "grad_norm": 0.6832369511215274, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 22868 + }, + { + "epoch": 0.22869, + "grad_norm": 1.1055537764289742, + "learning_rate": 0.003, + "loss": 4.061, + "step": 22869 + }, + { + "epoch": 0.2287, + "grad_norm": 1.2224045301249717, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22870 + }, + { + "epoch": 0.22871, + "grad_norm": 0.6513004531331329, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 22871 + }, + { + "epoch": 0.22872, + "grad_norm": 0.6151857890607966, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 22872 + }, + { + "epoch": 0.22873, + "grad_norm": 0.7166660684404108, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22873 + }, + { + "epoch": 0.22874, + "grad_norm": 0.6904437340497975, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 22874 + }, + { + "epoch": 0.22875, + "grad_norm": 0.6631850554355596, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 22875 + }, + { + "epoch": 0.22876, + "grad_norm": 0.7179607912734198, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 22876 + }, + { + "epoch": 0.22877, + "grad_norm": 0.9976481130477168, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 22877 + }, + { + "epoch": 0.22878, + "grad_norm": 1.300526202817369, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 22878 + }, + { + "epoch": 0.22879, + "grad_norm": 0.6375483840950272, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 22879 + }, + { + "epoch": 0.2288, + "grad_norm": 0.7579041704260526, + "learning_rate": 0.003, + "loss": 4.041, + "step": 22880 + }, + { + "epoch": 0.22881, + "grad_norm": 0.8667024385310933, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 22881 + }, + { + "epoch": 0.22882, + "grad_norm": 0.9157192582314859, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 22882 + }, + { + "epoch": 0.22883, + "grad_norm": 0.9356479890036767, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 22883 + }, + { + "epoch": 0.22884, + "grad_norm": 0.8954344958456766, + "learning_rate": 0.003, + "loss": 4.029, + "step": 22884 + }, + { + "epoch": 0.22885, + "grad_norm": 0.9041830458560043, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 22885 + }, + { + "epoch": 0.22886, + "grad_norm": 1.0603001011635644, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 22886 + }, + { + "epoch": 0.22887, + "grad_norm": 1.1936793786653745, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 22887 + }, + { + "epoch": 0.22888, + "grad_norm": 0.7928966771319186, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 22888 + }, + { + "epoch": 0.22889, + "grad_norm": 0.7250793645787325, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 22889 + }, + { + "epoch": 0.2289, + "grad_norm": 0.6459945149022985, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 22890 + }, + { + "epoch": 0.22891, + "grad_norm": 0.644664663994461, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 22891 + }, + { + "epoch": 0.22892, + "grad_norm": 0.7146749073703039, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 22892 + }, + { + "epoch": 0.22893, + "grad_norm": 0.8059523448770808, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 22893 + }, + { + "epoch": 0.22894, + "grad_norm": 0.8299869209530397, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 22894 + }, + { + "epoch": 0.22895, + "grad_norm": 0.791062985100653, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 22895 + }, + { + "epoch": 0.22896, + "grad_norm": 0.9253667717221834, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 22896 + }, + { + "epoch": 0.22897, + "grad_norm": 0.916825194225468, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 22897 + }, + { + "epoch": 0.22898, + "grad_norm": 0.900905856485575, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 22898 + }, + { + "epoch": 0.22899, + "grad_norm": 0.9855931961412255, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 22899 + }, + { + "epoch": 0.229, + "grad_norm": 1.0862993704192567, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 22900 + }, + { + "epoch": 0.22901, + "grad_norm": 0.9833426033059771, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 22901 + }, + { + "epoch": 0.22902, + "grad_norm": 0.9626526031839022, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 22902 + }, + { + "epoch": 0.22903, + "grad_norm": 0.87181432723415, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 22903 + }, + { + "epoch": 0.22904, + "grad_norm": 0.9400562772563169, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 22904 + }, + { + "epoch": 0.22905, + "grad_norm": 0.9984794930427514, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 22905 + }, + { + "epoch": 0.22906, + "grad_norm": 1.002218107887175, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 22906 + }, + { + "epoch": 0.22907, + "grad_norm": 0.9102772936374971, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 22907 + }, + { + "epoch": 0.22908, + "grad_norm": 1.1172984967001958, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 22908 + }, + { + "epoch": 0.22909, + "grad_norm": 1.1244459021174908, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 22909 + }, + { + "epoch": 0.2291, + "grad_norm": 1.163742705238034, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 22910 + }, + { + "epoch": 0.22911, + "grad_norm": 0.8542478953851748, + "learning_rate": 0.003, + "loss": 4.043, + "step": 22911 + }, + { + "epoch": 0.22912, + "grad_norm": 0.8479751087676674, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 22912 + }, + { + "epoch": 0.22913, + "grad_norm": 1.0237665976024057, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 22913 + }, + { + "epoch": 0.22914, + "grad_norm": 1.0245501065829639, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 22914 + }, + { + "epoch": 0.22915, + "grad_norm": 0.905597520989836, + "learning_rate": 0.003, + "loss": 4.063, + "step": 22915 + }, + { + "epoch": 0.22916, + "grad_norm": 0.8452178131475603, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 22916 + }, + { + "epoch": 0.22917, + "grad_norm": 0.7676683649543508, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 22917 + }, + { + "epoch": 0.22918, + "grad_norm": 0.8347297353087648, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 22918 + }, + { + "epoch": 0.22919, + "grad_norm": 0.9238730428877724, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 22919 + }, + { + "epoch": 0.2292, + "grad_norm": 0.9948638665174948, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 22920 + }, + { + "epoch": 0.22921, + "grad_norm": 1.0204471285302084, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 22921 + }, + { + "epoch": 0.22922, + "grad_norm": 1.0567623779721371, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 22922 + }, + { + "epoch": 0.22923, + "grad_norm": 0.8598295176574143, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 22923 + }, + { + "epoch": 0.22924, + "grad_norm": 0.7410298075784536, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 22924 + }, + { + "epoch": 0.22925, + "grad_norm": 0.6810413229843065, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 22925 + }, + { + "epoch": 0.22926, + "grad_norm": 0.6324416583182618, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 22926 + }, + { + "epoch": 0.22927, + "grad_norm": 0.6668615354737717, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22927 + }, + { + "epoch": 0.22928, + "grad_norm": 0.7976919923272611, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 22928 + }, + { + "epoch": 0.22929, + "grad_norm": 0.8562862406451046, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 22929 + }, + { + "epoch": 0.2293, + "grad_norm": 0.9892108356188348, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 22930 + }, + { + "epoch": 0.22931, + "grad_norm": 0.9879965267580636, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 22931 + }, + { + "epoch": 0.22932, + "grad_norm": 0.8771048374138064, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 22932 + }, + { + "epoch": 0.22933, + "grad_norm": 0.8392038939464274, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 22933 + }, + { + "epoch": 0.22934, + "grad_norm": 0.8252270106607972, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 22934 + }, + { + "epoch": 0.22935, + "grad_norm": 0.8725316834366568, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 22935 + }, + { + "epoch": 0.22936, + "grad_norm": 0.9484097659068086, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 22936 + }, + { + "epoch": 0.22937, + "grad_norm": 1.339929854033913, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 22937 + }, + { + "epoch": 0.22938, + "grad_norm": 0.9453912054839796, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 22938 + }, + { + "epoch": 0.22939, + "grad_norm": 0.9252437224548369, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 22939 + }, + { + "epoch": 0.2294, + "grad_norm": 0.8626464385891859, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 22940 + }, + { + "epoch": 0.22941, + "grad_norm": 0.9741466563509095, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 22941 + }, + { + "epoch": 0.22942, + "grad_norm": 1.035526456224067, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 22942 + }, + { + "epoch": 0.22943, + "grad_norm": 0.9534015075296856, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 22943 + }, + { + "epoch": 0.22944, + "grad_norm": 0.9775281928913955, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 22944 + }, + { + "epoch": 0.22945, + "grad_norm": 1.0681686522829308, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22945 + }, + { + "epoch": 0.22946, + "grad_norm": 1.0443875198340178, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 22946 + }, + { + "epoch": 0.22947, + "grad_norm": 0.9303181436743452, + "learning_rate": 0.003, + "loss": 4.06, + "step": 22947 + }, + { + "epoch": 0.22948, + "grad_norm": 0.8805204069071085, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 22948 + }, + { + "epoch": 0.22949, + "grad_norm": 0.8073268543094761, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 22949 + }, + { + "epoch": 0.2295, + "grad_norm": 0.7990491002563296, + "learning_rate": 0.003, + "loss": 4.057, + "step": 22950 + }, + { + "epoch": 0.22951, + "grad_norm": 0.7990195049561448, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 22951 + }, + { + "epoch": 0.22952, + "grad_norm": 0.8028268297177463, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 22952 + }, + { + "epoch": 0.22953, + "grad_norm": 0.7812014546426375, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 22953 + }, + { + "epoch": 0.22954, + "grad_norm": 0.7896531312946842, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 22954 + }, + { + "epoch": 0.22955, + "grad_norm": 0.7152243663377584, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 22955 + }, + { + "epoch": 0.22956, + "grad_norm": 0.6466416504931088, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 22956 + }, + { + "epoch": 0.22957, + "grad_norm": 0.6699337604433011, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 22957 + }, + { + "epoch": 0.22958, + "grad_norm": 0.5800195096240598, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 22958 + }, + { + "epoch": 0.22959, + "grad_norm": 0.6335042890350372, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 22959 + }, + { + "epoch": 0.2296, + "grad_norm": 0.581973903016988, + "learning_rate": 0.003, + "loss": 3.9851, + "step": 22960 + }, + { + "epoch": 0.22961, + "grad_norm": 0.5812413821469002, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 22961 + }, + { + "epoch": 0.22962, + "grad_norm": 0.6755407643154575, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 22962 + }, + { + "epoch": 0.22963, + "grad_norm": 0.723218674105609, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 22963 + }, + { + "epoch": 0.22964, + "grad_norm": 0.8057990456142935, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 22964 + }, + { + "epoch": 0.22965, + "grad_norm": 1.0252109799460332, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 22965 + }, + { + "epoch": 0.22966, + "grad_norm": 1.3427812705845885, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 22966 + }, + { + "epoch": 0.22967, + "grad_norm": 0.8797474436401552, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 22967 + }, + { + "epoch": 0.22968, + "grad_norm": 0.9576048558955592, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 22968 + }, + { + "epoch": 0.22969, + "grad_norm": 0.944415388266654, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 22969 + }, + { + "epoch": 0.2297, + "grad_norm": 1.1375977085296949, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 22970 + }, + { + "epoch": 0.22971, + "grad_norm": 0.9591753880489413, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 22971 + }, + { + "epoch": 0.22972, + "grad_norm": 0.8148959203882464, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 22972 + }, + { + "epoch": 0.22973, + "grad_norm": 0.7743403967676066, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 22973 + }, + { + "epoch": 0.22974, + "grad_norm": 0.8639527765579142, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 22974 + }, + { + "epoch": 0.22975, + "grad_norm": 1.0442579089795847, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 22975 + }, + { + "epoch": 0.22976, + "grad_norm": 1.0369976250361848, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 22976 + }, + { + "epoch": 0.22977, + "grad_norm": 1.109288992260987, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 22977 + }, + { + "epoch": 0.22978, + "grad_norm": 1.1558766156710585, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 22978 + }, + { + "epoch": 0.22979, + "grad_norm": 1.0285031591188565, + "learning_rate": 0.003, + "loss": 4.059, + "step": 22979 + }, + { + "epoch": 0.2298, + "grad_norm": 0.9182114991127589, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 22980 + }, + { + "epoch": 0.22981, + "grad_norm": 0.8586159639490722, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 22981 + }, + { + "epoch": 0.22982, + "grad_norm": 0.7320856724910753, + "learning_rate": 0.003, + "loss": 4.049, + "step": 22982 + }, + { + "epoch": 0.22983, + "grad_norm": 0.7573053317615691, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 22983 + }, + { + "epoch": 0.22984, + "grad_norm": 0.7364799613121218, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 22984 + }, + { + "epoch": 0.22985, + "grad_norm": 0.7517083418231769, + "learning_rate": 0.003, + "loss": 4.048, + "step": 22985 + }, + { + "epoch": 0.22986, + "grad_norm": 0.7121643498601627, + "learning_rate": 0.003, + "loss": 4.027, + "step": 22986 + }, + { + "epoch": 0.22987, + "grad_norm": 0.7442075739463677, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 22987 + }, + { + "epoch": 0.22988, + "grad_norm": 0.7675063175417414, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 22988 + }, + { + "epoch": 0.22989, + "grad_norm": 0.9094691556004088, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 22989 + }, + { + "epoch": 0.2299, + "grad_norm": 0.9897633770199264, + "learning_rate": 0.003, + "loss": 4.055, + "step": 22990 + }, + { + "epoch": 0.22991, + "grad_norm": 1.1205124593215388, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 22991 + }, + { + "epoch": 0.22992, + "grad_norm": 1.069519546680366, + "learning_rate": 0.003, + "loss": 4.044, + "step": 22992 + }, + { + "epoch": 0.22993, + "grad_norm": 1.052990148523248, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 22993 + }, + { + "epoch": 0.22994, + "grad_norm": 1.0578793869125982, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 22994 + }, + { + "epoch": 0.22995, + "grad_norm": 1.1542369296925874, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 22995 + }, + { + "epoch": 0.22996, + "grad_norm": 1.0761176648478468, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 22996 + }, + { + "epoch": 0.22997, + "grad_norm": 1.0180931606093209, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 22997 + }, + { + "epoch": 0.22998, + "grad_norm": 0.9458943977593038, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 22998 + }, + { + "epoch": 0.22999, + "grad_norm": 0.8220670512733281, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 22999 + }, + { + "epoch": 0.23, + "grad_norm": 0.6440144646518966, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 23000 + }, + { + "epoch": 0.23001, + "grad_norm": 0.5755514750379364, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 23001 + }, + { + "epoch": 0.23002, + "grad_norm": 0.6407383692328728, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 23002 + }, + { + "epoch": 0.23003, + "grad_norm": 0.7322045865591256, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 23003 + }, + { + "epoch": 0.23004, + "grad_norm": 0.912968524887482, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 23004 + }, + { + "epoch": 0.23005, + "grad_norm": 1.1448046014698365, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 23005 + }, + { + "epoch": 0.23006, + "grad_norm": 0.9912597420142032, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 23006 + }, + { + "epoch": 0.23007, + "grad_norm": 0.9797687843984587, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23007 + }, + { + "epoch": 0.23008, + "grad_norm": 0.8562928902048835, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23008 + }, + { + "epoch": 0.23009, + "grad_norm": 0.8442274961912204, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 23009 + }, + { + "epoch": 0.2301, + "grad_norm": 0.8947168340077016, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 23010 + }, + { + "epoch": 0.23011, + "grad_norm": 1.0647803346246902, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 23011 + }, + { + "epoch": 0.23012, + "grad_norm": 1.0482378466819748, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 23012 + }, + { + "epoch": 0.23013, + "grad_norm": 0.9665854253183856, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23013 + }, + { + "epoch": 0.23014, + "grad_norm": 0.7918976868664909, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 23014 + }, + { + "epoch": 0.23015, + "grad_norm": 0.7504563422319043, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 23015 + }, + { + "epoch": 0.23016, + "grad_norm": 0.7846609882630923, + "learning_rate": 0.003, + "loss": 4.037, + "step": 23016 + }, + { + "epoch": 0.23017, + "grad_norm": 0.6953285756158426, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 23017 + }, + { + "epoch": 0.23018, + "grad_norm": 0.7182305490736426, + "learning_rate": 0.003, + "loss": 4.019, + "step": 23018 + }, + { + "epoch": 0.23019, + "grad_norm": 0.8230608160580406, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 23019 + }, + { + "epoch": 0.2302, + "grad_norm": 0.7621053002492446, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 23020 + }, + { + "epoch": 0.23021, + "grad_norm": 0.732343101266712, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 23021 + }, + { + "epoch": 0.23022, + "grad_norm": 0.6979325865328343, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 23022 + }, + { + "epoch": 0.23023, + "grad_norm": 0.6491460396937742, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23023 + }, + { + "epoch": 0.23024, + "grad_norm": 0.7407802564372362, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 23024 + }, + { + "epoch": 0.23025, + "grad_norm": 0.7129150741142731, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 23025 + }, + { + "epoch": 0.23026, + "grad_norm": 0.5648904117697976, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23026 + }, + { + "epoch": 0.23027, + "grad_norm": 0.6313451167071286, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 23027 + }, + { + "epoch": 0.23028, + "grad_norm": 0.6918169092622712, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 23028 + }, + { + "epoch": 0.23029, + "grad_norm": 0.8804208104690182, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 23029 + }, + { + "epoch": 0.2303, + "grad_norm": 1.3311874724407622, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 23030 + }, + { + "epoch": 0.23031, + "grad_norm": 1.238694234393582, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 23031 + }, + { + "epoch": 0.23032, + "grad_norm": 0.8039568917642566, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 23032 + }, + { + "epoch": 0.23033, + "grad_norm": 0.7671679413513601, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 23033 + }, + { + "epoch": 0.23034, + "grad_norm": 0.7232051042304448, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 23034 + }, + { + "epoch": 0.23035, + "grad_norm": 0.67765558853382, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23035 + }, + { + "epoch": 0.23036, + "grad_norm": 0.7468434783455861, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 23036 + }, + { + "epoch": 0.23037, + "grad_norm": 0.8810508270969094, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 23037 + }, + { + "epoch": 0.23038, + "grad_norm": 1.0703990078286738, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23038 + }, + { + "epoch": 0.23039, + "grad_norm": 1.1634087956640424, + "learning_rate": 0.003, + "loss": 4.037, + "step": 23039 + }, + { + "epoch": 0.2304, + "grad_norm": 0.9932499531102147, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 23040 + }, + { + "epoch": 0.23041, + "grad_norm": 1.1867958003879997, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 23041 + }, + { + "epoch": 0.23042, + "grad_norm": 0.9442961368857763, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 23042 + }, + { + "epoch": 0.23043, + "grad_norm": 0.9531702275296813, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 23043 + }, + { + "epoch": 0.23044, + "grad_norm": 1.1339833255036662, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 23044 + }, + { + "epoch": 0.23045, + "grad_norm": 0.9114005226081583, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23045 + }, + { + "epoch": 0.23046, + "grad_norm": 0.8518906522540243, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 23046 + }, + { + "epoch": 0.23047, + "grad_norm": 0.8229079410946365, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23047 + }, + { + "epoch": 0.23048, + "grad_norm": 0.8242342866953422, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 23048 + }, + { + "epoch": 0.23049, + "grad_norm": 0.8072608110455313, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 23049 + }, + { + "epoch": 0.2305, + "grad_norm": 0.9430069625495672, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 23050 + }, + { + "epoch": 0.23051, + "grad_norm": 0.9796725003736887, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 23051 + }, + { + "epoch": 0.23052, + "grad_norm": 1.0099302599233693, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 23052 + }, + { + "epoch": 0.23053, + "grad_norm": 1.0245568456074743, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 23053 + }, + { + "epoch": 0.23054, + "grad_norm": 1.1409643629234234, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 23054 + }, + { + "epoch": 0.23055, + "grad_norm": 1.0230247137167217, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 23055 + }, + { + "epoch": 0.23056, + "grad_norm": 1.034256782464772, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 23056 + }, + { + "epoch": 0.23057, + "grad_norm": 1.036737098806506, + "learning_rate": 0.003, + "loss": 4.0898, + "step": 23057 + }, + { + "epoch": 0.23058, + "grad_norm": 1.037351210038279, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 23058 + }, + { + "epoch": 0.23059, + "grad_norm": 0.9519086559547316, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 23059 + }, + { + "epoch": 0.2306, + "grad_norm": 0.8959661672152691, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 23060 + }, + { + "epoch": 0.23061, + "grad_norm": 0.8903507833817338, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 23061 + }, + { + "epoch": 0.23062, + "grad_norm": 0.8807956259666923, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23062 + }, + { + "epoch": 0.23063, + "grad_norm": 0.9844388200826915, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 23063 + }, + { + "epoch": 0.23064, + "grad_norm": 1.208210992824724, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 23064 + }, + { + "epoch": 0.23065, + "grad_norm": 0.8025078155500408, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 23065 + }, + { + "epoch": 0.23066, + "grad_norm": 0.781969026190564, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 23066 + }, + { + "epoch": 0.23067, + "grad_norm": 0.7525009160614471, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 23067 + }, + { + "epoch": 0.23068, + "grad_norm": 0.7400231509723334, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 23068 + }, + { + "epoch": 0.23069, + "grad_norm": 0.7412303677565018, + "learning_rate": 0.003, + "loss": 4.054, + "step": 23069 + }, + { + "epoch": 0.2307, + "grad_norm": 0.6901599980350567, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23070 + }, + { + "epoch": 0.23071, + "grad_norm": 0.6291578041103024, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 23071 + }, + { + "epoch": 0.23072, + "grad_norm": 0.6426487652646763, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23072 + }, + { + "epoch": 0.23073, + "grad_norm": 0.7484153343286141, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 23073 + }, + { + "epoch": 0.23074, + "grad_norm": 0.8591630184030804, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23074 + }, + { + "epoch": 0.23075, + "grad_norm": 0.9461249230965478, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 23075 + }, + { + "epoch": 0.23076, + "grad_norm": 1.1122012616960164, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 23076 + }, + { + "epoch": 0.23077, + "grad_norm": 1.0383414062056104, + "learning_rate": 0.003, + "loss": 4.048, + "step": 23077 + }, + { + "epoch": 0.23078, + "grad_norm": 0.9458084129572396, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23078 + }, + { + "epoch": 0.23079, + "grad_norm": 0.8180342130513747, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 23079 + }, + { + "epoch": 0.2308, + "grad_norm": 0.88701227096098, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 23080 + }, + { + "epoch": 0.23081, + "grad_norm": 0.9524663541299678, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 23081 + }, + { + "epoch": 0.23082, + "grad_norm": 1.0767976989171717, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 23082 + }, + { + "epoch": 0.23083, + "grad_norm": 0.9238024718076581, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 23083 + }, + { + "epoch": 0.23084, + "grad_norm": 0.9819205078043135, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 23084 + }, + { + "epoch": 0.23085, + "grad_norm": 1.183068481802817, + "learning_rate": 0.003, + "loss": 4.038, + "step": 23085 + }, + { + "epoch": 0.23086, + "grad_norm": 1.0401157212986032, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 23086 + }, + { + "epoch": 0.23087, + "grad_norm": 1.0400128905396773, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 23087 + }, + { + "epoch": 0.23088, + "grad_norm": 1.0003200233920644, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 23088 + }, + { + "epoch": 0.23089, + "grad_norm": 0.8736122936941668, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 23089 + }, + { + "epoch": 0.2309, + "grad_norm": 0.7665087221210782, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 23090 + }, + { + "epoch": 0.23091, + "grad_norm": 0.8327630270085647, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 23091 + }, + { + "epoch": 0.23092, + "grad_norm": 0.9134355021202011, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 23092 + }, + { + "epoch": 0.23093, + "grad_norm": 1.051066563244841, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 23093 + }, + { + "epoch": 0.23094, + "grad_norm": 1.0214354124435396, + "learning_rate": 0.003, + "loss": 4.053, + "step": 23094 + }, + { + "epoch": 0.23095, + "grad_norm": 0.8958983175184104, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 23095 + }, + { + "epoch": 0.23096, + "grad_norm": 0.9559871365111267, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 23096 + }, + { + "epoch": 0.23097, + "grad_norm": 1.127557277530838, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 23097 + }, + { + "epoch": 0.23098, + "grad_norm": 0.9581631011438779, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 23098 + }, + { + "epoch": 0.23099, + "grad_norm": 0.891490700702359, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 23099 + }, + { + "epoch": 0.231, + "grad_norm": 0.8458326421640169, + "learning_rate": 0.003, + "loss": 4.064, + "step": 23100 + }, + { + "epoch": 0.23101, + "grad_norm": 0.8426055006721699, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 23101 + }, + { + "epoch": 0.23102, + "grad_norm": 0.8186155305996721, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 23102 + }, + { + "epoch": 0.23103, + "grad_norm": 0.7713171618089862, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 23103 + }, + { + "epoch": 0.23104, + "grad_norm": 0.8339620123432792, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 23104 + }, + { + "epoch": 0.23105, + "grad_norm": 0.8259863446093868, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 23105 + }, + { + "epoch": 0.23106, + "grad_norm": 0.8282756771482723, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23106 + }, + { + "epoch": 0.23107, + "grad_norm": 0.700501615712469, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 23107 + }, + { + "epoch": 0.23108, + "grad_norm": 0.6803929830388348, + "learning_rate": 0.003, + "loss": 4.001, + "step": 23108 + }, + { + "epoch": 0.23109, + "grad_norm": 0.7042258069211237, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 23109 + }, + { + "epoch": 0.2311, + "grad_norm": 0.8167577998600485, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23110 + }, + { + "epoch": 0.23111, + "grad_norm": 1.1302336645143332, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 23111 + }, + { + "epoch": 0.23112, + "grad_norm": 1.2451510774546128, + "learning_rate": 0.003, + "loss": 4.0953, + "step": 23112 + }, + { + "epoch": 0.23113, + "grad_norm": 0.733608198620074, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 23113 + }, + { + "epoch": 0.23114, + "grad_norm": 0.641177426296802, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 23114 + }, + { + "epoch": 0.23115, + "grad_norm": 0.7308422523981293, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 23115 + }, + { + "epoch": 0.23116, + "grad_norm": 0.8609629458726281, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 23116 + }, + { + "epoch": 0.23117, + "grad_norm": 1.0017284900307113, + "learning_rate": 0.003, + "loss": 4.065, + "step": 23117 + }, + { + "epoch": 0.23118, + "grad_norm": 1.095749053592283, + "learning_rate": 0.003, + "loss": 4.044, + "step": 23118 + }, + { + "epoch": 0.23119, + "grad_norm": 1.0334151111415144, + "learning_rate": 0.003, + "loss": 4.067, + "step": 23119 + }, + { + "epoch": 0.2312, + "grad_norm": 1.136214537832086, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 23120 + }, + { + "epoch": 0.23121, + "grad_norm": 0.8602315868680853, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 23121 + }, + { + "epoch": 0.23122, + "grad_norm": 0.7188755912748588, + "learning_rate": 0.003, + "loss": 4.044, + "step": 23122 + }, + { + "epoch": 0.23123, + "grad_norm": 0.6831341891311914, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 23123 + }, + { + "epoch": 0.23124, + "grad_norm": 0.689386101602556, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 23124 + }, + { + "epoch": 0.23125, + "grad_norm": 0.7949028140751238, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 23125 + }, + { + "epoch": 0.23126, + "grad_norm": 0.8925072686265749, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 23126 + }, + { + "epoch": 0.23127, + "grad_norm": 1.030558238710072, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23127 + }, + { + "epoch": 0.23128, + "grad_norm": 1.1869892077913553, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 23128 + }, + { + "epoch": 0.23129, + "grad_norm": 0.7979618263337956, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 23129 + }, + { + "epoch": 0.2313, + "grad_norm": 0.7541219421787313, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 23130 + }, + { + "epoch": 0.23131, + "grad_norm": 0.728790578622073, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 23131 + }, + { + "epoch": 0.23132, + "grad_norm": 0.6654525142196901, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 23132 + }, + { + "epoch": 0.23133, + "grad_norm": 0.7140478749674254, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 23133 + }, + { + "epoch": 0.23134, + "grad_norm": 0.8223033332588107, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 23134 + }, + { + "epoch": 0.23135, + "grad_norm": 1.0518448763186588, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 23135 + }, + { + "epoch": 0.23136, + "grad_norm": 0.9293349576285221, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 23136 + }, + { + "epoch": 0.23137, + "grad_norm": 0.8788913747286137, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 23137 + }, + { + "epoch": 0.23138, + "grad_norm": 0.9880121834966458, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 23138 + }, + { + "epoch": 0.23139, + "grad_norm": 1.1796886910094022, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 23139 + }, + { + "epoch": 0.2314, + "grad_norm": 0.8964341593056058, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 23140 + }, + { + "epoch": 0.23141, + "grad_norm": 0.9740490172641584, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23141 + }, + { + "epoch": 0.23142, + "grad_norm": 1.149619858298541, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 23142 + }, + { + "epoch": 0.23143, + "grad_norm": 0.8801661951396532, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 23143 + }, + { + "epoch": 0.23144, + "grad_norm": 0.8646141631667361, + "learning_rate": 0.003, + "loss": 4.037, + "step": 23144 + }, + { + "epoch": 0.23145, + "grad_norm": 0.8077041410823514, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 23145 + }, + { + "epoch": 0.23146, + "grad_norm": 0.9296332270586543, + "learning_rate": 0.003, + "loss": 4.0902, + "step": 23146 + }, + { + "epoch": 0.23147, + "grad_norm": 1.0281759610629906, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 23147 + }, + { + "epoch": 0.23148, + "grad_norm": 0.8332539633933264, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 23148 + }, + { + "epoch": 0.23149, + "grad_norm": 0.6718691492607541, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 23149 + }, + { + "epoch": 0.2315, + "grad_norm": 0.794417872689008, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23150 + }, + { + "epoch": 0.23151, + "grad_norm": 0.8743968158609591, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 23151 + }, + { + "epoch": 0.23152, + "grad_norm": 0.7940753328224843, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 23152 + }, + { + "epoch": 0.23153, + "grad_norm": 0.7612152619273527, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 23153 + }, + { + "epoch": 0.23154, + "grad_norm": 0.8483713152037101, + "learning_rate": 0.003, + "loss": 4.052, + "step": 23154 + }, + { + "epoch": 0.23155, + "grad_norm": 0.7780967753220992, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 23155 + }, + { + "epoch": 0.23156, + "grad_norm": 0.8751698564680405, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 23156 + }, + { + "epoch": 0.23157, + "grad_norm": 0.9738745667518447, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 23157 + }, + { + "epoch": 0.23158, + "grad_norm": 0.8913304238523949, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 23158 + }, + { + "epoch": 0.23159, + "grad_norm": 1.1645725784000678, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23159 + }, + { + "epoch": 0.2316, + "grad_norm": 1.1682181889148886, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 23160 + }, + { + "epoch": 0.23161, + "grad_norm": 0.8344153393977471, + "learning_rate": 0.003, + "loss": 4.033, + "step": 23161 + }, + { + "epoch": 0.23162, + "grad_norm": 0.6493871502833799, + "learning_rate": 0.003, + "loss": 4.039, + "step": 23162 + }, + { + "epoch": 0.23163, + "grad_norm": 0.6769824395401683, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 23163 + }, + { + "epoch": 0.23164, + "grad_norm": 0.7366738323543714, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 23164 + }, + { + "epoch": 0.23165, + "grad_norm": 0.7724693297848358, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 23165 + }, + { + "epoch": 0.23166, + "grad_norm": 0.9186543965596209, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 23166 + }, + { + "epoch": 0.23167, + "grad_norm": 1.130344443604793, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23167 + }, + { + "epoch": 0.23168, + "grad_norm": 0.9086773580393178, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 23168 + }, + { + "epoch": 0.23169, + "grad_norm": 0.8513501304435446, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 23169 + }, + { + "epoch": 0.2317, + "grad_norm": 0.9071029195969358, + "learning_rate": 0.003, + "loss": 4.058, + "step": 23170 + }, + { + "epoch": 0.23171, + "grad_norm": 0.9863758150106552, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23171 + }, + { + "epoch": 0.23172, + "grad_norm": 1.1227519604261884, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 23172 + }, + { + "epoch": 0.23173, + "grad_norm": 0.8465970644388893, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 23173 + }, + { + "epoch": 0.23174, + "grad_norm": 0.8323049945940443, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 23174 + }, + { + "epoch": 0.23175, + "grad_norm": 0.8207785622563308, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 23175 + }, + { + "epoch": 0.23176, + "grad_norm": 0.8776604899774045, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 23176 + }, + { + "epoch": 0.23177, + "grad_norm": 1.2362603218990649, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 23177 + }, + { + "epoch": 0.23178, + "grad_norm": 0.8435599941035459, + "learning_rate": 0.003, + "loss": 4.041, + "step": 23178 + }, + { + "epoch": 0.23179, + "grad_norm": 0.8611331828490991, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 23179 + }, + { + "epoch": 0.2318, + "grad_norm": 0.995223681188264, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 23180 + }, + { + "epoch": 0.23181, + "grad_norm": 1.0379370750698478, + "learning_rate": 0.003, + "loss": 4.082, + "step": 23181 + }, + { + "epoch": 0.23182, + "grad_norm": 0.9090692891478289, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 23182 + }, + { + "epoch": 0.23183, + "grad_norm": 0.8277825840618702, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 23183 + }, + { + "epoch": 0.23184, + "grad_norm": 0.8456226998971755, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23184 + }, + { + "epoch": 0.23185, + "grad_norm": 0.8720691591032947, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 23185 + }, + { + "epoch": 0.23186, + "grad_norm": 0.9220562305550305, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 23186 + }, + { + "epoch": 0.23187, + "grad_norm": 0.851438738350241, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 23187 + }, + { + "epoch": 0.23188, + "grad_norm": 0.7818525520935256, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 23188 + }, + { + "epoch": 0.23189, + "grad_norm": 0.7286503035114896, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 23189 + }, + { + "epoch": 0.2319, + "grad_norm": 0.7421027077402059, + "learning_rate": 0.003, + "loss": 4.029, + "step": 23190 + }, + { + "epoch": 0.23191, + "grad_norm": 0.7123163633494964, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23191 + }, + { + "epoch": 0.23192, + "grad_norm": 0.7167199220165789, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 23192 + }, + { + "epoch": 0.23193, + "grad_norm": 0.7218936007899907, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 23193 + }, + { + "epoch": 0.23194, + "grad_norm": 0.7383427430131531, + "learning_rate": 0.003, + "loss": 4.009, + "step": 23194 + }, + { + "epoch": 0.23195, + "grad_norm": 0.7579351615694749, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 23195 + }, + { + "epoch": 0.23196, + "grad_norm": 0.8254898925429789, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 23196 + }, + { + "epoch": 0.23197, + "grad_norm": 1.1471002586596273, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 23197 + }, + { + "epoch": 0.23198, + "grad_norm": 1.2317359165769932, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 23198 + }, + { + "epoch": 0.23199, + "grad_norm": 0.8705700836540348, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23199 + }, + { + "epoch": 0.232, + "grad_norm": 0.8386864939544902, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 23200 + }, + { + "epoch": 0.23201, + "grad_norm": 0.8831417143911874, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23201 + }, + { + "epoch": 0.23202, + "grad_norm": 0.8989113483203732, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 23202 + }, + { + "epoch": 0.23203, + "grad_norm": 0.9319952181445379, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 23203 + }, + { + "epoch": 0.23204, + "grad_norm": 1.1234857880248483, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 23204 + }, + { + "epoch": 0.23205, + "grad_norm": 0.9646882635849044, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 23205 + }, + { + "epoch": 0.23206, + "grad_norm": 1.1204598275221551, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 23206 + }, + { + "epoch": 0.23207, + "grad_norm": 0.8468977841556461, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 23207 + }, + { + "epoch": 0.23208, + "grad_norm": 0.8736054694554825, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 23208 + }, + { + "epoch": 0.23209, + "grad_norm": 0.793857771129506, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 23209 + }, + { + "epoch": 0.2321, + "grad_norm": 0.7829459826281823, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 23210 + }, + { + "epoch": 0.23211, + "grad_norm": 0.9743869245867949, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 23211 + }, + { + "epoch": 0.23212, + "grad_norm": 1.0099969444714338, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 23212 + }, + { + "epoch": 0.23213, + "grad_norm": 0.9772473599852297, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 23213 + }, + { + "epoch": 0.23214, + "grad_norm": 1.1090191942947476, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 23214 + }, + { + "epoch": 0.23215, + "grad_norm": 0.9809795265303912, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23215 + }, + { + "epoch": 0.23216, + "grad_norm": 1.1022901248934491, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 23216 + }, + { + "epoch": 0.23217, + "grad_norm": 0.9994053700063643, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 23217 + }, + { + "epoch": 0.23218, + "grad_norm": 0.9518282222486462, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 23218 + }, + { + "epoch": 0.23219, + "grad_norm": 0.8901810578367069, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 23219 + }, + { + "epoch": 0.2322, + "grad_norm": 0.7379109720421644, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23220 + }, + { + "epoch": 0.23221, + "grad_norm": 0.6304443394079544, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 23221 + }, + { + "epoch": 0.23222, + "grad_norm": 0.6039519856203805, + "learning_rate": 0.003, + "loss": 4.024, + "step": 23222 + }, + { + "epoch": 0.23223, + "grad_norm": 0.7261943503246203, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 23223 + }, + { + "epoch": 0.23224, + "grad_norm": 0.8061088900498037, + "learning_rate": 0.003, + "loss": 4.044, + "step": 23224 + }, + { + "epoch": 0.23225, + "grad_norm": 0.8482348430318354, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 23225 + }, + { + "epoch": 0.23226, + "grad_norm": 0.9174531207170211, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 23226 + }, + { + "epoch": 0.23227, + "grad_norm": 1.1570666209810108, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 23227 + }, + { + "epoch": 0.23228, + "grad_norm": 0.9107358994765787, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 23228 + }, + { + "epoch": 0.23229, + "grad_norm": 0.867793013833244, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 23229 + }, + { + "epoch": 0.2323, + "grad_norm": 0.8031669535488789, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 23230 + }, + { + "epoch": 0.23231, + "grad_norm": 0.9526980439082624, + "learning_rate": 0.003, + "loss": 4.021, + "step": 23231 + }, + { + "epoch": 0.23232, + "grad_norm": 1.0983306831463389, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 23232 + }, + { + "epoch": 0.23233, + "grad_norm": 0.9172141034188522, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 23233 + }, + { + "epoch": 0.23234, + "grad_norm": 0.8780316601312603, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 23234 + }, + { + "epoch": 0.23235, + "grad_norm": 0.8657486185162844, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 23235 + }, + { + "epoch": 0.23236, + "grad_norm": 0.7867284711629754, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 23236 + }, + { + "epoch": 0.23237, + "grad_norm": 0.8593762302387505, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23237 + }, + { + "epoch": 0.23238, + "grad_norm": 0.999200770415388, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 23238 + }, + { + "epoch": 0.23239, + "grad_norm": 0.9900950045281023, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 23239 + }, + { + "epoch": 0.2324, + "grad_norm": 1.0001850110999082, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23240 + }, + { + "epoch": 0.23241, + "grad_norm": 0.9767799123132489, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 23241 + }, + { + "epoch": 0.23242, + "grad_norm": 1.0858482824867202, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 23242 + }, + { + "epoch": 0.23243, + "grad_norm": 1.0408044330079458, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 23243 + }, + { + "epoch": 0.23244, + "grad_norm": 0.9928541307010126, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 23244 + }, + { + "epoch": 0.23245, + "grad_norm": 0.8675087983831122, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23245 + }, + { + "epoch": 0.23246, + "grad_norm": 0.8981112837908503, + "learning_rate": 0.003, + "loss": 4.077, + "step": 23246 + }, + { + "epoch": 0.23247, + "grad_norm": 0.9328525917233651, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 23247 + }, + { + "epoch": 0.23248, + "grad_norm": 0.9756171868427217, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 23248 + }, + { + "epoch": 0.23249, + "grad_norm": 1.0950239360776952, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 23249 + }, + { + "epoch": 0.2325, + "grad_norm": 1.0783311333497028, + "learning_rate": 0.003, + "loss": 4.1031, + "step": 23250 + }, + { + "epoch": 0.23251, + "grad_norm": 1.1335158849567175, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 23251 + }, + { + "epoch": 0.23252, + "grad_norm": 0.8492377915473657, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 23252 + }, + { + "epoch": 0.23253, + "grad_norm": 0.8559040391897444, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 23253 + }, + { + "epoch": 0.23254, + "grad_norm": 0.8212510012899998, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 23254 + }, + { + "epoch": 0.23255, + "grad_norm": 0.8349638108267292, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23255 + }, + { + "epoch": 0.23256, + "grad_norm": 0.8669598562870442, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 23256 + }, + { + "epoch": 0.23257, + "grad_norm": 1.2031135123573853, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 23257 + }, + { + "epoch": 0.23258, + "grad_norm": 1.132960801067764, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 23258 + }, + { + "epoch": 0.23259, + "grad_norm": 0.9274153111540889, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 23259 + }, + { + "epoch": 0.2326, + "grad_norm": 0.8392512702203906, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 23260 + }, + { + "epoch": 0.23261, + "grad_norm": 0.8153444941138165, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 23261 + }, + { + "epoch": 0.23262, + "grad_norm": 0.7650530271338066, + "learning_rate": 0.003, + "loss": 4.026, + "step": 23262 + }, + { + "epoch": 0.23263, + "grad_norm": 0.7072048818401927, + "learning_rate": 0.003, + "loss": 4.051, + "step": 23263 + }, + { + "epoch": 0.23264, + "grad_norm": 0.7434278611886409, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 23264 + }, + { + "epoch": 0.23265, + "grad_norm": 0.6969863964264411, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 23265 + }, + { + "epoch": 0.23266, + "grad_norm": 0.5921543446129753, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 23266 + }, + { + "epoch": 0.23267, + "grad_norm": 0.6246915459886806, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 23267 + }, + { + "epoch": 0.23268, + "grad_norm": 0.6071232070964971, + "learning_rate": 0.003, + "loss": 4.034, + "step": 23268 + }, + { + "epoch": 0.23269, + "grad_norm": 0.6120017112848584, + "learning_rate": 0.003, + "loss": 4.047, + "step": 23269 + }, + { + "epoch": 0.2327, + "grad_norm": 0.7381727894488059, + "learning_rate": 0.003, + "loss": 4.049, + "step": 23270 + }, + { + "epoch": 0.23271, + "grad_norm": 0.9377226044305099, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 23271 + }, + { + "epoch": 0.23272, + "grad_norm": 0.9610306381192282, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23272 + }, + { + "epoch": 0.23273, + "grad_norm": 0.9320320224352803, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 23273 + }, + { + "epoch": 0.23274, + "grad_norm": 0.8911826352988351, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 23274 + }, + { + "epoch": 0.23275, + "grad_norm": 0.8105825721007471, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 23275 + }, + { + "epoch": 0.23276, + "grad_norm": 0.8670830025954498, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 23276 + }, + { + "epoch": 0.23277, + "grad_norm": 0.8657927094012594, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23277 + }, + { + "epoch": 0.23278, + "grad_norm": 0.7708245466487271, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23278 + }, + { + "epoch": 0.23279, + "grad_norm": 0.7787466473254215, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 23279 + }, + { + "epoch": 0.2328, + "grad_norm": 0.8164552198515813, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 23280 + }, + { + "epoch": 0.23281, + "grad_norm": 0.8927138473434831, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 23281 + }, + { + "epoch": 0.23282, + "grad_norm": 0.8876102475955792, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 23282 + }, + { + "epoch": 0.23283, + "grad_norm": 0.9087671931379016, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 23283 + }, + { + "epoch": 0.23284, + "grad_norm": 0.9469167290019652, + "learning_rate": 0.003, + "loss": 4.054, + "step": 23284 + }, + { + "epoch": 0.23285, + "grad_norm": 1.101195157510704, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 23285 + }, + { + "epoch": 0.23286, + "grad_norm": 1.1383771773928801, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 23286 + }, + { + "epoch": 0.23287, + "grad_norm": 1.1346771594819132, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 23287 + }, + { + "epoch": 0.23288, + "grad_norm": 0.9967749570008625, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 23288 + }, + { + "epoch": 0.23289, + "grad_norm": 0.9779788333655552, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 23289 + }, + { + "epoch": 0.2329, + "grad_norm": 0.96736155236567, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 23290 + }, + { + "epoch": 0.23291, + "grad_norm": 0.9867814017940819, + "learning_rate": 0.003, + "loss": 4.066, + "step": 23291 + }, + { + "epoch": 0.23292, + "grad_norm": 1.0294637262664235, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 23292 + }, + { + "epoch": 0.23293, + "grad_norm": 0.7994950119197537, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 23293 + }, + { + "epoch": 0.23294, + "grad_norm": 0.7481805754458921, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 23294 + }, + { + "epoch": 0.23295, + "grad_norm": 0.7650244135894871, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 23295 + }, + { + "epoch": 0.23296, + "grad_norm": 1.013495268242076, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 23296 + }, + { + "epoch": 0.23297, + "grad_norm": 1.164764749176224, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 23297 + }, + { + "epoch": 0.23298, + "grad_norm": 0.8960344235012595, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 23298 + }, + { + "epoch": 0.23299, + "grad_norm": 0.8881564501743127, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 23299 + }, + { + "epoch": 0.233, + "grad_norm": 0.9350007006593121, + "learning_rate": 0.003, + "loss": 4.0955, + "step": 23300 + }, + { + "epoch": 0.23301, + "grad_norm": 0.9548358908147525, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23301 + }, + { + "epoch": 0.23302, + "grad_norm": 1.0334268930845638, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 23302 + }, + { + "epoch": 0.23303, + "grad_norm": 1.0357680667654454, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 23303 + }, + { + "epoch": 0.23304, + "grad_norm": 1.0529152874427279, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 23304 + }, + { + "epoch": 0.23305, + "grad_norm": 0.9156996550201966, + "learning_rate": 0.003, + "loss": 4.064, + "step": 23305 + }, + { + "epoch": 0.23306, + "grad_norm": 0.8940837746911591, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 23306 + }, + { + "epoch": 0.23307, + "grad_norm": 0.9965133688987783, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 23307 + }, + { + "epoch": 0.23308, + "grad_norm": 0.8363566760855301, + "learning_rate": 0.003, + "loss": 4.035, + "step": 23308 + }, + { + "epoch": 0.23309, + "grad_norm": 0.7406233933871841, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 23309 + }, + { + "epoch": 0.2331, + "grad_norm": 0.7408562529044742, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 23310 + }, + { + "epoch": 0.23311, + "grad_norm": 0.713263037142467, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 23311 + }, + { + "epoch": 0.23312, + "grad_norm": 0.7928308735562886, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 23312 + }, + { + "epoch": 0.23313, + "grad_norm": 0.871717260427556, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 23313 + }, + { + "epoch": 0.23314, + "grad_norm": 1.0558291387356178, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 23314 + }, + { + "epoch": 0.23315, + "grad_norm": 0.944503677573255, + "learning_rate": 0.003, + "loss": 4.036, + "step": 23315 + }, + { + "epoch": 0.23316, + "grad_norm": 0.8115335344824361, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 23316 + }, + { + "epoch": 0.23317, + "grad_norm": 0.7557550645339812, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 23317 + }, + { + "epoch": 0.23318, + "grad_norm": 0.8046906030587745, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 23318 + }, + { + "epoch": 0.23319, + "grad_norm": 0.9800549503027157, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 23319 + }, + { + "epoch": 0.2332, + "grad_norm": 0.9080598060586338, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 23320 + }, + { + "epoch": 0.23321, + "grad_norm": 0.847657931039378, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23321 + }, + { + "epoch": 0.23322, + "grad_norm": 0.820622327753731, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 23322 + }, + { + "epoch": 0.23323, + "grad_norm": 0.8095692117922965, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 23323 + }, + { + "epoch": 0.23324, + "grad_norm": 0.826232442867, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 23324 + }, + { + "epoch": 0.23325, + "grad_norm": 0.7780695456310369, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 23325 + }, + { + "epoch": 0.23326, + "grad_norm": 0.7433992194672006, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 23326 + }, + { + "epoch": 0.23327, + "grad_norm": 0.7962462661564261, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 23327 + }, + { + "epoch": 0.23328, + "grad_norm": 0.9421042256332244, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 23328 + }, + { + "epoch": 0.23329, + "grad_norm": 1.1195373580440857, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 23329 + }, + { + "epoch": 0.2333, + "grad_norm": 1.0369531212452263, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 23330 + }, + { + "epoch": 0.23331, + "grad_norm": 1.0902708414049547, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 23331 + }, + { + "epoch": 0.23332, + "grad_norm": 0.9376568194493495, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 23332 + }, + { + "epoch": 0.23333, + "grad_norm": 0.8358816175056863, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 23333 + }, + { + "epoch": 0.23334, + "grad_norm": 0.8770463459751547, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 23334 + }, + { + "epoch": 0.23335, + "grad_norm": 0.9084869034696209, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 23335 + }, + { + "epoch": 0.23336, + "grad_norm": 0.9164647954125518, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 23336 + }, + { + "epoch": 0.23337, + "grad_norm": 0.9468103515429476, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 23337 + }, + { + "epoch": 0.23338, + "grad_norm": 0.9603457233002876, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 23338 + }, + { + "epoch": 0.23339, + "grad_norm": 0.8631892790024719, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 23339 + }, + { + "epoch": 0.2334, + "grad_norm": 0.8641952703679369, + "learning_rate": 0.003, + "loss": 4.069, + "step": 23340 + }, + { + "epoch": 0.23341, + "grad_norm": 0.8706020331767145, + "learning_rate": 0.003, + "loss": 4.059, + "step": 23341 + }, + { + "epoch": 0.23342, + "grad_norm": 1.013057060348118, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 23342 + }, + { + "epoch": 0.23343, + "grad_norm": 1.1822166227602056, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 23343 + }, + { + "epoch": 0.23344, + "grad_norm": 0.9738003062187575, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23344 + }, + { + "epoch": 0.23345, + "grad_norm": 0.8503096136492654, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 23345 + }, + { + "epoch": 0.23346, + "grad_norm": 0.6910227370125558, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 23346 + }, + { + "epoch": 0.23347, + "grad_norm": 0.7265346072820527, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 23347 + }, + { + "epoch": 0.23348, + "grad_norm": 0.8112023118736004, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 23348 + }, + { + "epoch": 0.23349, + "grad_norm": 1.0429340872049848, + "learning_rate": 0.003, + "loss": 4.014, + "step": 23349 + }, + { + "epoch": 0.2335, + "grad_norm": 1.110429019074852, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 23350 + }, + { + "epoch": 0.23351, + "grad_norm": 1.0566018295048674, + "learning_rate": 0.003, + "loss": 4.029, + "step": 23351 + }, + { + "epoch": 0.23352, + "grad_norm": 1.00512196214799, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 23352 + }, + { + "epoch": 0.23353, + "grad_norm": 0.8489244570738428, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 23353 + }, + { + "epoch": 0.23354, + "grad_norm": 0.7565669660831456, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 23354 + }, + { + "epoch": 0.23355, + "grad_norm": 0.9221691464711177, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 23355 + }, + { + "epoch": 0.23356, + "grad_norm": 1.085255935128597, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 23356 + }, + { + "epoch": 0.23357, + "grad_norm": 0.9103587875579616, + "learning_rate": 0.003, + "loss": 4.074, + "step": 23357 + }, + { + "epoch": 0.23358, + "grad_norm": 0.8512063629589692, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 23358 + }, + { + "epoch": 0.23359, + "grad_norm": 0.7470681476112401, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 23359 + }, + { + "epoch": 0.2336, + "grad_norm": 0.5961140625364454, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 23360 + }, + { + "epoch": 0.23361, + "grad_norm": 0.5798801546356659, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 23361 + }, + { + "epoch": 0.23362, + "grad_norm": 0.5902421624891833, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23362 + }, + { + "epoch": 0.23363, + "grad_norm": 0.6999184120311019, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 23363 + }, + { + "epoch": 0.23364, + "grad_norm": 0.8893670383757734, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 23364 + }, + { + "epoch": 0.23365, + "grad_norm": 1.219470143245809, + "learning_rate": 0.003, + "loss": 4.049, + "step": 23365 + }, + { + "epoch": 0.23366, + "grad_norm": 0.8412023347162878, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 23366 + }, + { + "epoch": 0.23367, + "grad_norm": 0.7462332736457706, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 23367 + }, + { + "epoch": 0.23368, + "grad_norm": 0.7395531664941113, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23368 + }, + { + "epoch": 0.23369, + "grad_norm": 0.9318765351397753, + "learning_rate": 0.003, + "loss": 4.017, + "step": 23369 + }, + { + "epoch": 0.2337, + "grad_norm": 1.0284846386411144, + "learning_rate": 0.003, + "loss": 4.039, + "step": 23370 + }, + { + "epoch": 0.23371, + "grad_norm": 1.082375674841975, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 23371 + }, + { + "epoch": 0.23372, + "grad_norm": 0.8681449169859182, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 23372 + }, + { + "epoch": 0.23373, + "grad_norm": 0.7643782292642143, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 23373 + }, + { + "epoch": 0.23374, + "grad_norm": 0.7886293105726657, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 23374 + }, + { + "epoch": 0.23375, + "grad_norm": 0.8184168061138848, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23375 + }, + { + "epoch": 0.23376, + "grad_norm": 0.7990018029882848, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 23376 + }, + { + "epoch": 0.23377, + "grad_norm": 0.748870729955507, + "learning_rate": 0.003, + "loss": 4.053, + "step": 23377 + }, + { + "epoch": 0.23378, + "grad_norm": 0.9132468595526786, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 23378 + }, + { + "epoch": 0.23379, + "grad_norm": 1.0098402223700262, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 23379 + }, + { + "epoch": 0.2338, + "grad_norm": 1.1381184887449074, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 23380 + }, + { + "epoch": 0.23381, + "grad_norm": 0.9805527873250193, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 23381 + }, + { + "epoch": 0.23382, + "grad_norm": 1.000360382070932, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 23382 + }, + { + "epoch": 0.23383, + "grad_norm": 0.9980224432070521, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 23383 + }, + { + "epoch": 0.23384, + "grad_norm": 1.1132622676282136, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 23384 + }, + { + "epoch": 0.23385, + "grad_norm": 0.9143593082991279, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 23385 + }, + { + "epoch": 0.23386, + "grad_norm": 0.8361349588005175, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 23386 + }, + { + "epoch": 0.23387, + "grad_norm": 0.8341700939575358, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23387 + }, + { + "epoch": 0.23388, + "grad_norm": 0.8544316720492576, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23388 + }, + { + "epoch": 0.23389, + "grad_norm": 0.8194641054665752, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 23389 + }, + { + "epoch": 0.2339, + "grad_norm": 0.7695433452794722, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 23390 + }, + { + "epoch": 0.23391, + "grad_norm": 0.8791339635741777, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23391 + }, + { + "epoch": 0.23392, + "grad_norm": 0.9674610737913872, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 23392 + }, + { + "epoch": 0.23393, + "grad_norm": 1.0394141583490182, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 23393 + }, + { + "epoch": 0.23394, + "grad_norm": 1.1078255857520207, + "learning_rate": 0.003, + "loss": 4.06, + "step": 23394 + }, + { + "epoch": 0.23395, + "grad_norm": 1.023291222036471, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 23395 + }, + { + "epoch": 0.23396, + "grad_norm": 0.8847425875234746, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 23396 + }, + { + "epoch": 0.23397, + "grad_norm": 0.9994241022835657, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23397 + }, + { + "epoch": 0.23398, + "grad_norm": 1.066314590277396, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 23398 + }, + { + "epoch": 0.23399, + "grad_norm": 0.9337551056905361, + "learning_rate": 0.003, + "loss": 4.088, + "step": 23399 + }, + { + "epoch": 0.234, + "grad_norm": 0.9919661609265504, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 23400 + }, + { + "epoch": 0.23401, + "grad_norm": 0.9930058711932699, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23401 + }, + { + "epoch": 0.23402, + "grad_norm": 0.946035310376112, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 23402 + }, + { + "epoch": 0.23403, + "grad_norm": 0.9479158532294484, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23403 + }, + { + "epoch": 0.23404, + "grad_norm": 0.9887494895668005, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23404 + }, + { + "epoch": 0.23405, + "grad_norm": 1.0795738494837765, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23405 + }, + { + "epoch": 0.23406, + "grad_norm": 0.9748867888107192, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23406 + }, + { + "epoch": 0.23407, + "grad_norm": 0.9747261742240523, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 23407 + }, + { + "epoch": 0.23408, + "grad_norm": 0.9439911353112621, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 23408 + }, + { + "epoch": 0.23409, + "grad_norm": 0.9858693723609644, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 23409 + }, + { + "epoch": 0.2341, + "grad_norm": 1.0619208062979824, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 23410 + }, + { + "epoch": 0.23411, + "grad_norm": 0.9572973427946112, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 23411 + }, + { + "epoch": 0.23412, + "grad_norm": 0.9011634764609106, + "learning_rate": 0.003, + "loss": 4.052, + "step": 23412 + }, + { + "epoch": 0.23413, + "grad_norm": 0.9263551959365492, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 23413 + }, + { + "epoch": 0.23414, + "grad_norm": 0.8628853629024509, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 23414 + }, + { + "epoch": 0.23415, + "grad_norm": 0.8275678625042461, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 23415 + }, + { + "epoch": 0.23416, + "grad_norm": 0.7840972622886222, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 23416 + }, + { + "epoch": 0.23417, + "grad_norm": 0.8182833085317417, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 23417 + }, + { + "epoch": 0.23418, + "grad_norm": 0.8769906378803388, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 23418 + }, + { + "epoch": 0.23419, + "grad_norm": 0.9113616275585473, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 23419 + }, + { + "epoch": 0.2342, + "grad_norm": 0.9554640268768768, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 23420 + }, + { + "epoch": 0.23421, + "grad_norm": 0.9315320557270598, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 23421 + }, + { + "epoch": 0.23422, + "grad_norm": 0.7702007732696645, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 23422 + }, + { + "epoch": 0.23423, + "grad_norm": 0.790431807343337, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 23423 + }, + { + "epoch": 0.23424, + "grad_norm": 0.7609546132057345, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 23424 + }, + { + "epoch": 0.23425, + "grad_norm": 0.6985684874736482, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 23425 + }, + { + "epoch": 0.23426, + "grad_norm": 0.6891898397494017, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 23426 + }, + { + "epoch": 0.23427, + "grad_norm": 0.7279093681307923, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23427 + }, + { + "epoch": 0.23428, + "grad_norm": 0.8441303383782134, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 23428 + }, + { + "epoch": 0.23429, + "grad_norm": 1.04643159548747, + "learning_rate": 0.003, + "loss": 4.065, + "step": 23429 + }, + { + "epoch": 0.2343, + "grad_norm": 1.2918401876269727, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 23430 + }, + { + "epoch": 0.23431, + "grad_norm": 0.6711194521983928, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 23431 + }, + { + "epoch": 0.23432, + "grad_norm": 0.6874738068004878, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 23432 + }, + { + "epoch": 0.23433, + "grad_norm": 0.7826447289270644, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 23433 + }, + { + "epoch": 0.23434, + "grad_norm": 0.701971592804287, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 23434 + }, + { + "epoch": 0.23435, + "grad_norm": 0.6101785123639616, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 23435 + }, + { + "epoch": 0.23436, + "grad_norm": 0.5880009523193628, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 23436 + }, + { + "epoch": 0.23437, + "grad_norm": 0.6339926810602442, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 23437 + }, + { + "epoch": 0.23438, + "grad_norm": 0.6833875393171981, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 23438 + }, + { + "epoch": 0.23439, + "grad_norm": 0.7916025228216734, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 23439 + }, + { + "epoch": 0.2344, + "grad_norm": 0.8697467276483367, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 23440 + }, + { + "epoch": 0.23441, + "grad_norm": 1.100718052765827, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23441 + }, + { + "epoch": 0.23442, + "grad_norm": 1.053195700373657, + "learning_rate": 0.003, + "loss": 4.077, + "step": 23442 + }, + { + "epoch": 0.23443, + "grad_norm": 0.8617135427224588, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 23443 + }, + { + "epoch": 0.23444, + "grad_norm": 0.7032869299540702, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 23444 + }, + { + "epoch": 0.23445, + "grad_norm": 0.6923348590258565, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23445 + }, + { + "epoch": 0.23446, + "grad_norm": 0.7284603533865156, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 23446 + }, + { + "epoch": 0.23447, + "grad_norm": 0.8536187828216245, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 23447 + }, + { + "epoch": 0.23448, + "grad_norm": 0.995310467170776, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 23448 + }, + { + "epoch": 0.23449, + "grad_norm": 1.2160398849176544, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 23449 + }, + { + "epoch": 0.2345, + "grad_norm": 0.8882534098429892, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23450 + }, + { + "epoch": 0.23451, + "grad_norm": 0.8615827600129583, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 23451 + }, + { + "epoch": 0.23452, + "grad_norm": 0.9464588359712807, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 23452 + }, + { + "epoch": 0.23453, + "grad_norm": 0.9605578950153069, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 23453 + }, + { + "epoch": 0.23454, + "grad_norm": 1.0168215253821362, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 23454 + }, + { + "epoch": 0.23455, + "grad_norm": 1.0290202474970156, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 23455 + }, + { + "epoch": 0.23456, + "grad_norm": 0.9892988617362561, + "learning_rate": 0.003, + "loss": 4.058, + "step": 23456 + }, + { + "epoch": 0.23457, + "grad_norm": 0.8197320854755072, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 23457 + }, + { + "epoch": 0.23458, + "grad_norm": 0.8537843438185317, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 23458 + }, + { + "epoch": 0.23459, + "grad_norm": 0.9417005757161352, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 23459 + }, + { + "epoch": 0.2346, + "grad_norm": 0.9969534769222956, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 23460 + }, + { + "epoch": 0.23461, + "grad_norm": 1.136723325063256, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 23461 + }, + { + "epoch": 0.23462, + "grad_norm": 1.045664861745029, + "learning_rate": 0.003, + "loss": 4.0822, + "step": 23462 + }, + { + "epoch": 0.23463, + "grad_norm": 0.9670235398040552, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 23463 + }, + { + "epoch": 0.23464, + "grad_norm": 0.9860658741113812, + "learning_rate": 0.003, + "loss": 4.068, + "step": 23464 + }, + { + "epoch": 0.23465, + "grad_norm": 0.9978464502003478, + "learning_rate": 0.003, + "loss": 4.071, + "step": 23465 + }, + { + "epoch": 0.23466, + "grad_norm": 0.9673931812832036, + "learning_rate": 0.003, + "loss": 4.061, + "step": 23466 + }, + { + "epoch": 0.23467, + "grad_norm": 0.8699565165568466, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23467 + }, + { + "epoch": 0.23468, + "grad_norm": 0.9065795198411963, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 23468 + }, + { + "epoch": 0.23469, + "grad_norm": 0.75466624961184, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 23469 + }, + { + "epoch": 0.2347, + "grad_norm": 0.6561214345777504, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 23470 + }, + { + "epoch": 0.23471, + "grad_norm": 0.6309503611918389, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 23471 + }, + { + "epoch": 0.23472, + "grad_norm": 0.743276881969359, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23472 + }, + { + "epoch": 0.23473, + "grad_norm": 0.8782294109824252, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 23473 + }, + { + "epoch": 0.23474, + "grad_norm": 1.0074254170883652, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 23474 + }, + { + "epoch": 0.23475, + "grad_norm": 0.9881534497496233, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 23475 + }, + { + "epoch": 0.23476, + "grad_norm": 0.8269256727291021, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 23476 + }, + { + "epoch": 0.23477, + "grad_norm": 0.6738745656842985, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23477 + }, + { + "epoch": 0.23478, + "grad_norm": 0.6188196020134842, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 23478 + }, + { + "epoch": 0.23479, + "grad_norm": 0.6708426760650309, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 23479 + }, + { + "epoch": 0.2348, + "grad_norm": 0.8527239421195897, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 23480 + }, + { + "epoch": 0.23481, + "grad_norm": 1.0777377799731296, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 23481 + }, + { + "epoch": 0.23482, + "grad_norm": 0.9077010630127782, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23482 + }, + { + "epoch": 0.23483, + "grad_norm": 0.8096723144590248, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 23483 + }, + { + "epoch": 0.23484, + "grad_norm": 0.6454451594198872, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 23484 + }, + { + "epoch": 0.23485, + "grad_norm": 0.634693890899822, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 23485 + }, + { + "epoch": 0.23486, + "grad_norm": 0.7590901521337867, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 23486 + }, + { + "epoch": 0.23487, + "grad_norm": 0.8631093358471466, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 23487 + }, + { + "epoch": 0.23488, + "grad_norm": 0.8836530459599684, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 23488 + }, + { + "epoch": 0.23489, + "grad_norm": 0.8167953744867498, + "learning_rate": 0.003, + "loss": 4.065, + "step": 23489 + }, + { + "epoch": 0.2349, + "grad_norm": 0.7322969154946962, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 23490 + }, + { + "epoch": 0.23491, + "grad_norm": 0.7532723755276917, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 23491 + }, + { + "epoch": 0.23492, + "grad_norm": 0.9038800318880632, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 23492 + }, + { + "epoch": 0.23493, + "grad_norm": 0.9920619300549424, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 23493 + }, + { + "epoch": 0.23494, + "grad_norm": 1.0187695353063326, + "learning_rate": 0.003, + "loss": 4.065, + "step": 23494 + }, + { + "epoch": 0.23495, + "grad_norm": 1.1778584734092041, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 23495 + }, + { + "epoch": 0.23496, + "grad_norm": 0.9224805363428353, + "learning_rate": 0.003, + "loss": 4.051, + "step": 23496 + }, + { + "epoch": 0.23497, + "grad_norm": 0.9436997654433646, + "learning_rate": 0.003, + "loss": 4.03, + "step": 23497 + }, + { + "epoch": 0.23498, + "grad_norm": 0.9952993171817909, + "learning_rate": 0.003, + "loss": 4.036, + "step": 23498 + }, + { + "epoch": 0.23499, + "grad_norm": 0.995203368537637, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 23499 + }, + { + "epoch": 0.235, + "grad_norm": 1.1294019839979683, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 23500 + }, + { + "epoch": 0.23501, + "grad_norm": 0.9612324528905263, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 23501 + }, + { + "epoch": 0.23502, + "grad_norm": 0.9921975929743411, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 23502 + }, + { + "epoch": 0.23503, + "grad_norm": 1.0270290566550475, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 23503 + }, + { + "epoch": 0.23504, + "grad_norm": 1.036780613639376, + "learning_rate": 0.003, + "loss": 4.052, + "step": 23504 + }, + { + "epoch": 0.23505, + "grad_norm": 1.029123150239687, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 23505 + }, + { + "epoch": 0.23506, + "grad_norm": 0.946117669682993, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 23506 + }, + { + "epoch": 0.23507, + "grad_norm": 1.1144297464145339, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 23507 + }, + { + "epoch": 0.23508, + "grad_norm": 0.9799699580148787, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 23508 + }, + { + "epoch": 0.23509, + "grad_norm": 0.9093323460452115, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 23509 + }, + { + "epoch": 0.2351, + "grad_norm": 0.9409869663730628, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 23510 + }, + { + "epoch": 0.23511, + "grad_norm": 0.9794146986958985, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 23511 + }, + { + "epoch": 0.23512, + "grad_norm": 1.03550774538799, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 23512 + }, + { + "epoch": 0.23513, + "grad_norm": 0.8167910926584461, + "learning_rate": 0.003, + "loss": 4.058, + "step": 23513 + }, + { + "epoch": 0.23514, + "grad_norm": 0.7470325975213249, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 23514 + }, + { + "epoch": 0.23515, + "grad_norm": 0.7206279401531992, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 23515 + }, + { + "epoch": 0.23516, + "grad_norm": 0.7518204968408843, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 23516 + }, + { + "epoch": 0.23517, + "grad_norm": 0.7967049472635556, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23517 + }, + { + "epoch": 0.23518, + "grad_norm": 0.9542519789857323, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 23518 + }, + { + "epoch": 0.23519, + "grad_norm": 1.027237973204826, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 23519 + }, + { + "epoch": 0.2352, + "grad_norm": 0.9077309554712739, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 23520 + }, + { + "epoch": 0.23521, + "grad_norm": 0.9353060411573507, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 23521 + }, + { + "epoch": 0.23522, + "grad_norm": 0.9169242440742275, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 23522 + }, + { + "epoch": 0.23523, + "grad_norm": 0.9197349234314814, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 23523 + }, + { + "epoch": 0.23524, + "grad_norm": 0.7478110805490918, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 23524 + }, + { + "epoch": 0.23525, + "grad_norm": 0.6723302418236689, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 23525 + }, + { + "epoch": 0.23526, + "grad_norm": 0.7146151303034055, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 23526 + }, + { + "epoch": 0.23527, + "grad_norm": 0.8442950772620854, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 23527 + }, + { + "epoch": 0.23528, + "grad_norm": 0.9629020451911949, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 23528 + }, + { + "epoch": 0.23529, + "grad_norm": 1.1076277992353016, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 23529 + }, + { + "epoch": 0.2353, + "grad_norm": 1.0262669318483837, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 23530 + }, + { + "epoch": 0.23531, + "grad_norm": 1.022044751703166, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 23531 + }, + { + "epoch": 0.23532, + "grad_norm": 1.0333940674483422, + "learning_rate": 0.003, + "loss": 4.0907, + "step": 23532 + }, + { + "epoch": 0.23533, + "grad_norm": 0.9994143479050802, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 23533 + }, + { + "epoch": 0.23534, + "grad_norm": 1.043355750470464, + "learning_rate": 0.003, + "loss": 4.082, + "step": 23534 + }, + { + "epoch": 0.23535, + "grad_norm": 0.8283532203989878, + "learning_rate": 0.003, + "loss": 4.066, + "step": 23535 + }, + { + "epoch": 0.23536, + "grad_norm": 0.7617125529874617, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 23536 + }, + { + "epoch": 0.23537, + "grad_norm": 0.9138806394157641, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 23537 + }, + { + "epoch": 0.23538, + "grad_norm": 0.903383173408642, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 23538 + }, + { + "epoch": 0.23539, + "grad_norm": 0.7883716512588659, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 23539 + }, + { + "epoch": 0.2354, + "grad_norm": 0.847778479790901, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 23540 + }, + { + "epoch": 0.23541, + "grad_norm": 0.8838225140860158, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 23541 + }, + { + "epoch": 0.23542, + "grad_norm": 0.9284506878331492, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 23542 + }, + { + "epoch": 0.23543, + "grad_norm": 0.8348687662052139, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 23543 + }, + { + "epoch": 0.23544, + "grad_norm": 0.8887999874833376, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 23544 + }, + { + "epoch": 0.23545, + "grad_norm": 0.9475586610532396, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 23545 + }, + { + "epoch": 0.23546, + "grad_norm": 0.7138223749279077, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 23546 + }, + { + "epoch": 0.23547, + "grad_norm": 0.5730568821844476, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 23547 + }, + { + "epoch": 0.23548, + "grad_norm": 0.6239287737464521, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 23548 + }, + { + "epoch": 0.23549, + "grad_norm": 0.7515950009395274, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 23549 + }, + { + "epoch": 0.2355, + "grad_norm": 1.0329720486858198, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 23550 + }, + { + "epoch": 0.23551, + "grad_norm": 1.209157868285168, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 23551 + }, + { + "epoch": 0.23552, + "grad_norm": 0.5837990966246265, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 23552 + }, + { + "epoch": 0.23553, + "grad_norm": 0.7274515567994064, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 23553 + }, + { + "epoch": 0.23554, + "grad_norm": 0.8619683797766035, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 23554 + }, + { + "epoch": 0.23555, + "grad_norm": 0.9586212709246147, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 23555 + }, + { + "epoch": 0.23556, + "grad_norm": 1.086459832909454, + "learning_rate": 0.003, + "loss": 4.024, + "step": 23556 + }, + { + "epoch": 0.23557, + "grad_norm": 0.8578811925102114, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 23557 + }, + { + "epoch": 0.23558, + "grad_norm": 0.761003404904438, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 23558 + }, + { + "epoch": 0.23559, + "grad_norm": 0.7650770056002918, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 23559 + }, + { + "epoch": 0.2356, + "grad_norm": 0.8010382266649182, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23560 + }, + { + "epoch": 0.23561, + "grad_norm": 0.8819968910323468, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23561 + }, + { + "epoch": 0.23562, + "grad_norm": 0.8370755739812418, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 23562 + }, + { + "epoch": 0.23563, + "grad_norm": 0.8851392694988899, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23563 + }, + { + "epoch": 0.23564, + "grad_norm": 0.8623204350197202, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 23564 + }, + { + "epoch": 0.23565, + "grad_norm": 0.9262339707527562, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 23565 + }, + { + "epoch": 0.23566, + "grad_norm": 0.8875631855423897, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 23566 + }, + { + "epoch": 0.23567, + "grad_norm": 0.8618856193538932, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 23567 + }, + { + "epoch": 0.23568, + "grad_norm": 0.8412703707105913, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 23568 + }, + { + "epoch": 0.23569, + "grad_norm": 0.859622999156412, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 23569 + }, + { + "epoch": 0.2357, + "grad_norm": 0.8854793718788011, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23570 + }, + { + "epoch": 0.23571, + "grad_norm": 0.9680686922638029, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 23571 + }, + { + "epoch": 0.23572, + "grad_norm": 0.956367222153997, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 23572 + }, + { + "epoch": 0.23573, + "grad_norm": 0.9875147630233635, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 23573 + }, + { + "epoch": 0.23574, + "grad_norm": 1.1374897107766333, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 23574 + }, + { + "epoch": 0.23575, + "grad_norm": 0.9361115171636641, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 23575 + }, + { + "epoch": 0.23576, + "grad_norm": 0.9646203889033469, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 23576 + }, + { + "epoch": 0.23577, + "grad_norm": 1.0001062256926352, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 23577 + }, + { + "epoch": 0.23578, + "grad_norm": 1.0797203602522731, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 23578 + }, + { + "epoch": 0.23579, + "grad_norm": 1.1802473448698165, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 23579 + }, + { + "epoch": 0.2358, + "grad_norm": 1.026226737154868, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 23580 + }, + { + "epoch": 0.23581, + "grad_norm": 0.8832815362627126, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 23581 + }, + { + "epoch": 0.23582, + "grad_norm": 0.7048146501859645, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 23582 + }, + { + "epoch": 0.23583, + "grad_norm": 0.6464982389973704, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 23583 + }, + { + "epoch": 0.23584, + "grad_norm": 0.6526469346115965, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 23584 + }, + { + "epoch": 0.23585, + "grad_norm": 0.6546173838286163, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 23585 + }, + { + "epoch": 0.23586, + "grad_norm": 0.6441171924240503, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 23586 + }, + { + "epoch": 0.23587, + "grad_norm": 0.6523256404459764, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 23587 + }, + { + "epoch": 0.23588, + "grad_norm": 0.6399602779107045, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23588 + }, + { + "epoch": 0.23589, + "grad_norm": 0.6183238921054076, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 23589 + }, + { + "epoch": 0.2359, + "grad_norm": 0.635493406806837, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 23590 + }, + { + "epoch": 0.23591, + "grad_norm": 0.8146691276437147, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 23591 + }, + { + "epoch": 0.23592, + "grad_norm": 1.0951470006089996, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 23592 + }, + { + "epoch": 0.23593, + "grad_norm": 1.1583929813081055, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 23593 + }, + { + "epoch": 0.23594, + "grad_norm": 0.8712449056385658, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23594 + }, + { + "epoch": 0.23595, + "grad_norm": 0.806471354872271, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 23595 + }, + { + "epoch": 0.23596, + "grad_norm": 0.7269455268062749, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 23596 + }, + { + "epoch": 0.23597, + "grad_norm": 0.7183578580805932, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 23597 + }, + { + "epoch": 0.23598, + "grad_norm": 0.7249367504658509, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 23598 + }, + { + "epoch": 0.23599, + "grad_norm": 0.8425052152058388, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 23599 + }, + { + "epoch": 0.236, + "grad_norm": 0.9742208637680512, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 23600 + }, + { + "epoch": 0.23601, + "grad_norm": 1.1461598131534463, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 23601 + }, + { + "epoch": 0.23602, + "grad_norm": 0.7875740516544314, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 23602 + }, + { + "epoch": 0.23603, + "grad_norm": 0.7230962926061553, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 23603 + }, + { + "epoch": 0.23604, + "grad_norm": 0.8479527183462586, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 23604 + }, + { + "epoch": 0.23605, + "grad_norm": 0.9041809157482524, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 23605 + }, + { + "epoch": 0.23606, + "grad_norm": 0.8391258589145818, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 23606 + }, + { + "epoch": 0.23607, + "grad_norm": 0.845458514490645, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 23607 + }, + { + "epoch": 0.23608, + "grad_norm": 0.9064973898408545, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 23608 + }, + { + "epoch": 0.23609, + "grad_norm": 0.9555197729355157, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 23609 + }, + { + "epoch": 0.2361, + "grad_norm": 0.9065372952649496, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 23610 + }, + { + "epoch": 0.23611, + "grad_norm": 0.9496673211055551, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 23611 + }, + { + "epoch": 0.23612, + "grad_norm": 1.1277771237357277, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 23612 + }, + { + "epoch": 0.23613, + "grad_norm": 1.2855285262865856, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 23613 + }, + { + "epoch": 0.23614, + "grad_norm": 1.0017365393907138, + "learning_rate": 0.003, + "loss": 4.033, + "step": 23614 + }, + { + "epoch": 0.23615, + "grad_norm": 0.9492447064677598, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 23615 + }, + { + "epoch": 0.23616, + "grad_norm": 1.0172177554368869, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 23616 + }, + { + "epoch": 0.23617, + "grad_norm": 1.0046308501484664, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 23617 + }, + { + "epoch": 0.23618, + "grad_norm": 0.8542539972453103, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 23618 + }, + { + "epoch": 0.23619, + "grad_norm": 0.7373111297814003, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 23619 + }, + { + "epoch": 0.2362, + "grad_norm": 0.8144671964369854, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 23620 + }, + { + "epoch": 0.23621, + "grad_norm": 0.9043474154302669, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 23621 + }, + { + "epoch": 0.23622, + "grad_norm": 0.9132045013295168, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23622 + }, + { + "epoch": 0.23623, + "grad_norm": 0.9905635131882818, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 23623 + }, + { + "epoch": 0.23624, + "grad_norm": 1.094439400523337, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 23624 + }, + { + "epoch": 0.23625, + "grad_norm": 1.0767903637947664, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 23625 + }, + { + "epoch": 0.23626, + "grad_norm": 1.0162965675462243, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 23626 + }, + { + "epoch": 0.23627, + "grad_norm": 0.9850749670549553, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 23627 + }, + { + "epoch": 0.23628, + "grad_norm": 1.0212284158426461, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 23628 + }, + { + "epoch": 0.23629, + "grad_norm": 1.121964602804996, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 23629 + }, + { + "epoch": 0.2363, + "grad_norm": 0.9153124617988608, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 23630 + }, + { + "epoch": 0.23631, + "grad_norm": 0.8676822869985553, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 23631 + }, + { + "epoch": 0.23632, + "grad_norm": 0.7485787363934675, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 23632 + }, + { + "epoch": 0.23633, + "grad_norm": 0.7619567971358175, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 23633 + }, + { + "epoch": 0.23634, + "grad_norm": 0.8650858385414405, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 23634 + }, + { + "epoch": 0.23635, + "grad_norm": 0.9308350165470094, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 23635 + }, + { + "epoch": 0.23636, + "grad_norm": 1.228532982389462, + "learning_rate": 0.003, + "loss": 4.062, + "step": 23636 + }, + { + "epoch": 0.23637, + "grad_norm": 0.935256116006693, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 23637 + }, + { + "epoch": 0.23638, + "grad_norm": 0.8507425405458384, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 23638 + }, + { + "epoch": 0.23639, + "grad_norm": 0.8566502586079002, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 23639 + }, + { + "epoch": 0.2364, + "grad_norm": 0.7828140024110716, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 23640 + }, + { + "epoch": 0.23641, + "grad_norm": 0.7807380867926977, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 23641 + }, + { + "epoch": 0.23642, + "grad_norm": 0.7526310919570613, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 23642 + }, + { + "epoch": 0.23643, + "grad_norm": 0.7501699914736486, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 23643 + }, + { + "epoch": 0.23644, + "grad_norm": 0.9296431810122299, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 23644 + }, + { + "epoch": 0.23645, + "grad_norm": 0.8993955311518375, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 23645 + }, + { + "epoch": 0.23646, + "grad_norm": 0.8225115630681428, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 23646 + }, + { + "epoch": 0.23647, + "grad_norm": 0.9489010814155697, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 23647 + }, + { + "epoch": 0.23648, + "grad_norm": 1.0278446557178285, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 23648 + }, + { + "epoch": 0.23649, + "grad_norm": 0.8508752314092654, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 23649 + }, + { + "epoch": 0.2365, + "grad_norm": 0.7393204440650136, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 23650 + }, + { + "epoch": 0.23651, + "grad_norm": 0.7282365567618183, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 23651 + }, + { + "epoch": 0.23652, + "grad_norm": 0.7951331786723594, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 23652 + }, + { + "epoch": 0.23653, + "grad_norm": 0.8505496353603379, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 23653 + }, + { + "epoch": 0.23654, + "grad_norm": 0.9999252666650351, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 23654 + }, + { + "epoch": 0.23655, + "grad_norm": 1.1036654308243536, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 23655 + }, + { + "epoch": 0.23656, + "grad_norm": 0.8857862620759969, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 23656 + }, + { + "epoch": 0.23657, + "grad_norm": 1.0349756791578346, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 23657 + }, + { + "epoch": 0.23658, + "grad_norm": 0.8515051639875577, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23658 + }, + { + "epoch": 0.23659, + "grad_norm": 0.7045717320814407, + "learning_rate": 0.003, + "loss": 4.041, + "step": 23659 + }, + { + "epoch": 0.2366, + "grad_norm": 0.6134568239393103, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23660 + }, + { + "epoch": 0.23661, + "grad_norm": 0.7094646545527582, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 23661 + }, + { + "epoch": 0.23662, + "grad_norm": 0.8125165557695597, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 23662 + }, + { + "epoch": 0.23663, + "grad_norm": 0.9848910656251846, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 23663 + }, + { + "epoch": 0.23664, + "grad_norm": 1.0148514717454828, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 23664 + }, + { + "epoch": 0.23665, + "grad_norm": 1.0091959955395162, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 23665 + }, + { + "epoch": 0.23666, + "grad_norm": 0.8757007107412164, + "learning_rate": 0.003, + "loss": 4.1109, + "step": 23666 + }, + { + "epoch": 0.23667, + "grad_norm": 0.8883704392602975, + "learning_rate": 0.003, + "loss": 4.055, + "step": 23667 + }, + { + "epoch": 0.23668, + "grad_norm": 1.0260382100750878, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 23668 + }, + { + "epoch": 0.23669, + "grad_norm": 1.044053788050421, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 23669 + }, + { + "epoch": 0.2367, + "grad_norm": 1.0829679192764872, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 23670 + }, + { + "epoch": 0.23671, + "grad_norm": 1.0582751893335969, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23671 + }, + { + "epoch": 0.23672, + "grad_norm": 0.9141838014885043, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 23672 + }, + { + "epoch": 0.23673, + "grad_norm": 0.9379376997659608, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 23673 + }, + { + "epoch": 0.23674, + "grad_norm": 1.1124835008836031, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 23674 + }, + { + "epoch": 0.23675, + "grad_norm": 0.9916248203041355, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 23675 + }, + { + "epoch": 0.23676, + "grad_norm": 0.864929601268841, + "learning_rate": 0.003, + "loss": 4.084, + "step": 23676 + }, + { + "epoch": 0.23677, + "grad_norm": 0.8286947397229656, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 23677 + }, + { + "epoch": 0.23678, + "grad_norm": 0.7772779771133362, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 23678 + }, + { + "epoch": 0.23679, + "grad_norm": 0.659533718327479, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 23679 + }, + { + "epoch": 0.2368, + "grad_norm": 0.7120360805479395, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 23680 + }, + { + "epoch": 0.23681, + "grad_norm": 0.6575060339444038, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 23681 + }, + { + "epoch": 0.23682, + "grad_norm": 0.7034512232485463, + "learning_rate": 0.003, + "loss": 4.017, + "step": 23682 + }, + { + "epoch": 0.23683, + "grad_norm": 0.7854979227953794, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 23683 + }, + { + "epoch": 0.23684, + "grad_norm": 0.8640903386646525, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 23684 + }, + { + "epoch": 0.23685, + "grad_norm": 1.0013649370730158, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 23685 + }, + { + "epoch": 0.23686, + "grad_norm": 1.096318757466863, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 23686 + }, + { + "epoch": 0.23687, + "grad_norm": 0.8214280798521816, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 23687 + }, + { + "epoch": 0.23688, + "grad_norm": 0.7965584445237289, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 23688 + }, + { + "epoch": 0.23689, + "grad_norm": 0.7685865849769362, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 23689 + }, + { + "epoch": 0.2369, + "grad_norm": 0.6893280561013264, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 23690 + }, + { + "epoch": 0.23691, + "grad_norm": 0.6912213761040542, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 23691 + }, + { + "epoch": 0.23692, + "grad_norm": 0.7372370189926273, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 23692 + }, + { + "epoch": 0.23693, + "grad_norm": 0.8204260436094726, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 23693 + }, + { + "epoch": 0.23694, + "grad_norm": 0.9485968847610363, + "learning_rate": 0.003, + "loss": 4.013, + "step": 23694 + }, + { + "epoch": 0.23695, + "grad_norm": 1.1977787093846215, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 23695 + }, + { + "epoch": 0.23696, + "grad_norm": 1.0331212679192867, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 23696 + }, + { + "epoch": 0.23697, + "grad_norm": 0.886799126891739, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 23697 + }, + { + "epoch": 0.23698, + "grad_norm": 0.7945548226833384, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 23698 + }, + { + "epoch": 0.23699, + "grad_norm": 0.8262667902295605, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 23699 + }, + { + "epoch": 0.237, + "grad_norm": 0.7952850981510156, + "learning_rate": 0.003, + "loss": 4.028, + "step": 23700 + }, + { + "epoch": 0.23701, + "grad_norm": 0.9426131438109266, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 23701 + }, + { + "epoch": 0.23702, + "grad_norm": 1.2186326294816368, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 23702 + }, + { + "epoch": 0.23703, + "grad_norm": 0.9093822433317033, + "learning_rate": 0.003, + "loss": 4.042, + "step": 23703 + }, + { + "epoch": 0.23704, + "grad_norm": 0.924778055129946, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 23704 + }, + { + "epoch": 0.23705, + "grad_norm": 0.9605397122456875, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 23705 + }, + { + "epoch": 0.23706, + "grad_norm": 0.9281521026179532, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 23706 + }, + { + "epoch": 0.23707, + "grad_norm": 0.918885559322832, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 23707 + }, + { + "epoch": 0.23708, + "grad_norm": 0.9259313446023939, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 23708 + }, + { + "epoch": 0.23709, + "grad_norm": 0.8491193958148544, + "learning_rate": 0.003, + "loss": 4.062, + "step": 23709 + }, + { + "epoch": 0.2371, + "grad_norm": 0.9787414510707803, + "learning_rate": 0.003, + "loss": 4.042, + "step": 23710 + }, + { + "epoch": 0.23711, + "grad_norm": 1.0512576427281033, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23711 + }, + { + "epoch": 0.23712, + "grad_norm": 1.1442441444524603, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 23712 + }, + { + "epoch": 0.23713, + "grad_norm": 0.7886432661186354, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23713 + }, + { + "epoch": 0.23714, + "grad_norm": 0.7192607059020641, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23714 + }, + { + "epoch": 0.23715, + "grad_norm": 0.6803307217059974, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 23715 + }, + { + "epoch": 0.23716, + "grad_norm": 0.7451331889798255, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 23716 + }, + { + "epoch": 0.23717, + "grad_norm": 0.9912150629909722, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 23717 + }, + { + "epoch": 0.23718, + "grad_norm": 1.142803181117511, + "learning_rate": 0.003, + "loss": 4.075, + "step": 23718 + }, + { + "epoch": 0.23719, + "grad_norm": 0.9536266113914136, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 23719 + }, + { + "epoch": 0.2372, + "grad_norm": 0.8318571044524352, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 23720 + }, + { + "epoch": 0.23721, + "grad_norm": 0.7369039646820076, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 23721 + }, + { + "epoch": 0.23722, + "grad_norm": 0.7186204991531209, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 23722 + }, + { + "epoch": 0.23723, + "grad_norm": 0.7303515590228324, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 23723 + }, + { + "epoch": 0.23724, + "grad_norm": 0.6972734523106194, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 23724 + }, + { + "epoch": 0.23725, + "grad_norm": 0.7479317547096554, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 23725 + }, + { + "epoch": 0.23726, + "grad_norm": 0.78785194481346, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 23726 + }, + { + "epoch": 0.23727, + "grad_norm": 0.9262905918361009, + "learning_rate": 0.003, + "loss": 4.048, + "step": 23727 + }, + { + "epoch": 0.23728, + "grad_norm": 1.183995877661308, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 23728 + }, + { + "epoch": 0.23729, + "grad_norm": 0.9529623987101564, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 23729 + }, + { + "epoch": 0.2373, + "grad_norm": 0.8408421205102627, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 23730 + }, + { + "epoch": 0.23731, + "grad_norm": 0.9411302961024143, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23731 + }, + { + "epoch": 0.23732, + "grad_norm": 1.1415803404196334, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 23732 + }, + { + "epoch": 0.23733, + "grad_norm": 0.9730155583799385, + "learning_rate": 0.003, + "loss": 4.073, + "step": 23733 + }, + { + "epoch": 0.23734, + "grad_norm": 0.9102850065374251, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23734 + }, + { + "epoch": 0.23735, + "grad_norm": 1.1059069322995851, + "learning_rate": 0.003, + "loss": 4.057, + "step": 23735 + }, + { + "epoch": 0.23736, + "grad_norm": 1.0167701587712585, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 23736 + }, + { + "epoch": 0.23737, + "grad_norm": 0.9028611539175332, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 23737 + }, + { + "epoch": 0.23738, + "grad_norm": 0.8216560376710537, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 23738 + }, + { + "epoch": 0.23739, + "grad_norm": 0.8230191641359648, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 23739 + }, + { + "epoch": 0.2374, + "grad_norm": 0.9676879124111347, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 23740 + }, + { + "epoch": 0.23741, + "grad_norm": 1.101932757731516, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 23741 + }, + { + "epoch": 0.23742, + "grad_norm": 0.9240266059580302, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 23742 + }, + { + "epoch": 0.23743, + "grad_norm": 0.9777630766475063, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 23743 + }, + { + "epoch": 0.23744, + "grad_norm": 0.9532760535159879, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 23744 + }, + { + "epoch": 0.23745, + "grad_norm": 0.9578988240507131, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 23745 + }, + { + "epoch": 0.23746, + "grad_norm": 0.948282548513073, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 23746 + }, + { + "epoch": 0.23747, + "grad_norm": 1.1266205813666343, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 23747 + }, + { + "epoch": 0.23748, + "grad_norm": 0.9455660471383677, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 23748 + }, + { + "epoch": 0.23749, + "grad_norm": 1.0556565216795553, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 23749 + }, + { + "epoch": 0.2375, + "grad_norm": 1.2075693151324032, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 23750 + }, + { + "epoch": 0.23751, + "grad_norm": 0.9355026323387243, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 23751 + }, + { + "epoch": 0.23752, + "grad_norm": 0.8740309165215498, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 23752 + }, + { + "epoch": 0.23753, + "grad_norm": 0.8128975080080184, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 23753 + }, + { + "epoch": 0.23754, + "grad_norm": 0.7540731723693127, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 23754 + }, + { + "epoch": 0.23755, + "grad_norm": 0.6966786043359898, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 23755 + }, + { + "epoch": 0.23756, + "grad_norm": 0.7035375621121372, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 23756 + }, + { + "epoch": 0.23757, + "grad_norm": 0.7413219939164101, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 23757 + }, + { + "epoch": 0.23758, + "grad_norm": 0.8082120029716918, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 23758 + }, + { + "epoch": 0.23759, + "grad_norm": 1.0040605348583993, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 23759 + }, + { + "epoch": 0.2376, + "grad_norm": 1.1156474671494216, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23760 + }, + { + "epoch": 0.23761, + "grad_norm": 0.8940647208220829, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 23761 + }, + { + "epoch": 0.23762, + "grad_norm": 0.8636884557517225, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 23762 + }, + { + "epoch": 0.23763, + "grad_norm": 0.8059181598095077, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 23763 + }, + { + "epoch": 0.23764, + "grad_norm": 0.7359689937991982, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 23764 + }, + { + "epoch": 0.23765, + "grad_norm": 0.7052047258115853, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 23765 + }, + { + "epoch": 0.23766, + "grad_norm": 0.7053016619129944, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 23766 + }, + { + "epoch": 0.23767, + "grad_norm": 0.634681941680801, + "learning_rate": 0.003, + "loss": 4.014, + "step": 23767 + }, + { + "epoch": 0.23768, + "grad_norm": 0.6379477048246032, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 23768 + }, + { + "epoch": 0.23769, + "grad_norm": 0.7165958126511536, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 23769 + }, + { + "epoch": 0.2377, + "grad_norm": 0.8059448359700745, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 23770 + }, + { + "epoch": 0.23771, + "grad_norm": 0.9165908723360074, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 23771 + }, + { + "epoch": 0.23772, + "grad_norm": 1.1363440354514454, + "learning_rate": 0.003, + "loss": 4.027, + "step": 23772 + }, + { + "epoch": 0.23773, + "grad_norm": 0.9435709742458889, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 23773 + }, + { + "epoch": 0.23774, + "grad_norm": 0.9853652896027093, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 23774 + }, + { + "epoch": 0.23775, + "grad_norm": 0.986838507451984, + "learning_rate": 0.003, + "loss": 4.049, + "step": 23775 + }, + { + "epoch": 0.23776, + "grad_norm": 0.8955411308927379, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 23776 + }, + { + "epoch": 0.23777, + "grad_norm": 0.8387329218582318, + "learning_rate": 0.003, + "loss": 4.049, + "step": 23777 + }, + { + "epoch": 0.23778, + "grad_norm": 0.9356896828060617, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 23778 + }, + { + "epoch": 0.23779, + "grad_norm": 1.0917706522165258, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 23779 + }, + { + "epoch": 0.2378, + "grad_norm": 0.9714825727349592, + "learning_rate": 0.003, + "loss": 4.0017, + "step": 23780 + }, + { + "epoch": 0.23781, + "grad_norm": 0.9462718964720743, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 23781 + }, + { + "epoch": 0.23782, + "grad_norm": 0.9073306152103564, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 23782 + }, + { + "epoch": 0.23783, + "grad_norm": 0.7049492342336101, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 23783 + }, + { + "epoch": 0.23784, + "grad_norm": 0.6951098464330825, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 23784 + }, + { + "epoch": 0.23785, + "grad_norm": 0.7375645933427062, + "learning_rate": 0.003, + "loss": 4.021, + "step": 23785 + }, + { + "epoch": 0.23786, + "grad_norm": 0.8769100255082033, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 23786 + }, + { + "epoch": 0.23787, + "grad_norm": 0.9868474641372071, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 23787 + }, + { + "epoch": 0.23788, + "grad_norm": 1.044571696665442, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 23788 + }, + { + "epoch": 0.23789, + "grad_norm": 1.1026872352115349, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 23789 + }, + { + "epoch": 0.2379, + "grad_norm": 0.9138550452245233, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 23790 + }, + { + "epoch": 0.23791, + "grad_norm": 1.0640233372869399, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 23791 + }, + { + "epoch": 0.23792, + "grad_norm": 1.120476918971707, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23792 + }, + { + "epoch": 0.23793, + "grad_norm": 0.9535255213492942, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 23793 + }, + { + "epoch": 0.23794, + "grad_norm": 0.9982005136533192, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 23794 + }, + { + "epoch": 0.23795, + "grad_norm": 0.9811287827583272, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 23795 + }, + { + "epoch": 0.23796, + "grad_norm": 0.8825797221493334, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 23796 + }, + { + "epoch": 0.23797, + "grad_norm": 0.8786571476249699, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 23797 + }, + { + "epoch": 0.23798, + "grad_norm": 0.8217438953646821, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 23798 + }, + { + "epoch": 0.23799, + "grad_norm": 0.8241426772628611, + "learning_rate": 0.003, + "loss": 4.075, + "step": 23799 + }, + { + "epoch": 0.238, + "grad_norm": 0.806284747066766, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 23800 + }, + { + "epoch": 0.23801, + "grad_norm": 0.7699279123731254, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 23801 + }, + { + "epoch": 0.23802, + "grad_norm": 0.7793582936417139, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 23802 + }, + { + "epoch": 0.23803, + "grad_norm": 0.9858019055772262, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 23803 + }, + { + "epoch": 0.23804, + "grad_norm": 1.0210858876720903, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 23804 + }, + { + "epoch": 0.23805, + "grad_norm": 0.894510215298178, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 23805 + }, + { + "epoch": 0.23806, + "grad_norm": 1.0842775381717982, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 23806 + }, + { + "epoch": 0.23807, + "grad_norm": 1.1790647380800272, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 23807 + }, + { + "epoch": 0.23808, + "grad_norm": 0.975198201371548, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 23808 + }, + { + "epoch": 0.23809, + "grad_norm": 0.9448235297635766, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 23809 + }, + { + "epoch": 0.2381, + "grad_norm": 0.8309856053087825, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 23810 + }, + { + "epoch": 0.23811, + "grad_norm": 0.7431981536876313, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 23811 + }, + { + "epoch": 0.23812, + "grad_norm": 0.6693241524162311, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 23812 + }, + { + "epoch": 0.23813, + "grad_norm": 0.7051333290537272, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 23813 + }, + { + "epoch": 0.23814, + "grad_norm": 0.6653198217036655, + "learning_rate": 0.003, + "loss": 4.075, + "step": 23814 + }, + { + "epoch": 0.23815, + "grad_norm": 0.5650805771726254, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 23815 + }, + { + "epoch": 0.23816, + "grad_norm": 0.5720528675378561, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 23816 + }, + { + "epoch": 0.23817, + "grad_norm": 0.55667001135488, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 23817 + }, + { + "epoch": 0.23818, + "grad_norm": 0.5688252003169696, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 23818 + }, + { + "epoch": 0.23819, + "grad_norm": 0.5770735792760818, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 23819 + }, + { + "epoch": 0.2382, + "grad_norm": 0.6415005448927382, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 23820 + }, + { + "epoch": 0.23821, + "grad_norm": 0.7792790899285379, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 23821 + }, + { + "epoch": 0.23822, + "grad_norm": 1.2401253810872785, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 23822 + }, + { + "epoch": 0.23823, + "grad_norm": 0.9654666156907661, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 23823 + }, + { + "epoch": 0.23824, + "grad_norm": 0.9530324568754521, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 23824 + }, + { + "epoch": 0.23825, + "grad_norm": 1.1707237383671287, + "learning_rate": 0.003, + "loss": 4.043, + "step": 23825 + }, + { + "epoch": 0.23826, + "grad_norm": 0.8708757840944379, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 23826 + }, + { + "epoch": 0.23827, + "grad_norm": 0.8306739795853243, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 23827 + }, + { + "epoch": 0.23828, + "grad_norm": 0.8041387911269132, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 23828 + }, + { + "epoch": 0.23829, + "grad_norm": 0.7870930722708536, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 23829 + }, + { + "epoch": 0.2383, + "grad_norm": 0.8231782286884456, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 23830 + }, + { + "epoch": 0.23831, + "grad_norm": 0.8328594937417843, + "learning_rate": 0.003, + "loss": 4.034, + "step": 23831 + }, + { + "epoch": 0.23832, + "grad_norm": 0.8367398697427005, + "learning_rate": 0.003, + "loss": 4.046, + "step": 23832 + }, + { + "epoch": 0.23833, + "grad_norm": 0.8869077275210392, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 23833 + }, + { + "epoch": 0.23834, + "grad_norm": 0.9771233275735915, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 23834 + }, + { + "epoch": 0.23835, + "grad_norm": 0.9445842169776447, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 23835 + }, + { + "epoch": 0.23836, + "grad_norm": 1.0796517589401968, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 23836 + }, + { + "epoch": 0.23837, + "grad_norm": 0.9999686768476472, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 23837 + }, + { + "epoch": 0.23838, + "grad_norm": 1.0188576010165489, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 23838 + }, + { + "epoch": 0.23839, + "grad_norm": 1.0188012835907987, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 23839 + }, + { + "epoch": 0.2384, + "grad_norm": 0.9035928233198262, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23840 + }, + { + "epoch": 0.23841, + "grad_norm": 0.9449464339944221, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 23841 + }, + { + "epoch": 0.23842, + "grad_norm": 1.108643556061732, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 23842 + }, + { + "epoch": 0.23843, + "grad_norm": 0.8717757225857686, + "learning_rate": 0.003, + "loss": 4.068, + "step": 23843 + }, + { + "epoch": 0.23844, + "grad_norm": 0.8666097149021877, + "learning_rate": 0.003, + "loss": 4.072, + "step": 23844 + }, + { + "epoch": 0.23845, + "grad_norm": 1.0759369814913384, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 23845 + }, + { + "epoch": 0.23846, + "grad_norm": 1.2178127552228601, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 23846 + }, + { + "epoch": 0.23847, + "grad_norm": 0.8303896574997529, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23847 + }, + { + "epoch": 0.23848, + "grad_norm": 0.8042276507324703, + "learning_rate": 0.003, + "loss": 4.031, + "step": 23848 + }, + { + "epoch": 0.23849, + "grad_norm": 0.9313481058300013, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 23849 + }, + { + "epoch": 0.2385, + "grad_norm": 1.1634654039350076, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 23850 + }, + { + "epoch": 0.23851, + "grad_norm": 0.8514752157085883, + "learning_rate": 0.003, + "loss": 4.073, + "step": 23851 + }, + { + "epoch": 0.23852, + "grad_norm": 0.7076461178687795, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 23852 + }, + { + "epoch": 0.23853, + "grad_norm": 0.7503145225617919, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 23853 + }, + { + "epoch": 0.23854, + "grad_norm": 0.7202968172791401, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 23854 + }, + { + "epoch": 0.23855, + "grad_norm": 0.719678691366509, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 23855 + }, + { + "epoch": 0.23856, + "grad_norm": 0.8275547809991393, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 23856 + }, + { + "epoch": 0.23857, + "grad_norm": 0.9871145001597039, + "learning_rate": 0.003, + "loss": 4.0866, + "step": 23857 + }, + { + "epoch": 0.23858, + "grad_norm": 1.115484560260526, + "learning_rate": 0.003, + "loss": 4.1019, + "step": 23858 + }, + { + "epoch": 0.23859, + "grad_norm": 0.8518385877430924, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 23859 + }, + { + "epoch": 0.2386, + "grad_norm": 0.795377862230423, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 23860 + }, + { + "epoch": 0.23861, + "grad_norm": 0.8530868156680435, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 23861 + }, + { + "epoch": 0.23862, + "grad_norm": 0.854205737404698, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 23862 + }, + { + "epoch": 0.23863, + "grad_norm": 0.8895193214946078, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 23863 + }, + { + "epoch": 0.23864, + "grad_norm": 0.8366774362897966, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 23864 + }, + { + "epoch": 0.23865, + "grad_norm": 0.8869314133643494, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 23865 + }, + { + "epoch": 0.23866, + "grad_norm": 0.7409505229499392, + "learning_rate": 0.003, + "loss": 4.043, + "step": 23866 + }, + { + "epoch": 0.23867, + "grad_norm": 0.6710412370291577, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 23867 + }, + { + "epoch": 0.23868, + "grad_norm": 0.6744115315788555, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 23868 + }, + { + "epoch": 0.23869, + "grad_norm": 0.9195077068497159, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 23869 + }, + { + "epoch": 0.2387, + "grad_norm": 1.2714772447849427, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 23870 + }, + { + "epoch": 0.23871, + "grad_norm": 0.6770058295215355, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23871 + }, + { + "epoch": 0.23872, + "grad_norm": 0.6309307076653112, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23872 + }, + { + "epoch": 0.23873, + "grad_norm": 0.5990751958506392, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 23873 + }, + { + "epoch": 0.23874, + "grad_norm": 0.6680844834978303, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 23874 + }, + { + "epoch": 0.23875, + "grad_norm": 0.726616253775967, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 23875 + }, + { + "epoch": 0.23876, + "grad_norm": 0.6410081277309755, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 23876 + }, + { + "epoch": 0.23877, + "grad_norm": 0.6071173451199402, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 23877 + }, + { + "epoch": 0.23878, + "grad_norm": 0.6686577462701669, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 23878 + }, + { + "epoch": 0.23879, + "grad_norm": 0.7575692661423011, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 23879 + }, + { + "epoch": 0.2388, + "grad_norm": 1.0635534878555628, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 23880 + }, + { + "epoch": 0.23881, + "grad_norm": 1.282914970321341, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 23881 + }, + { + "epoch": 0.23882, + "grad_norm": 0.807510943337494, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 23882 + }, + { + "epoch": 0.23883, + "grad_norm": 0.7438756109729677, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 23883 + }, + { + "epoch": 0.23884, + "grad_norm": 0.7883902967380124, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 23884 + }, + { + "epoch": 0.23885, + "grad_norm": 0.8745080907165037, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 23885 + }, + { + "epoch": 0.23886, + "grad_norm": 0.992019050134065, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 23886 + }, + { + "epoch": 0.23887, + "grad_norm": 1.079792889903227, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 23887 + }, + { + "epoch": 0.23888, + "grad_norm": 1.1031556788350718, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 23888 + }, + { + "epoch": 0.23889, + "grad_norm": 1.163061635381459, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 23889 + }, + { + "epoch": 0.2389, + "grad_norm": 0.977647611777324, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 23890 + }, + { + "epoch": 0.23891, + "grad_norm": 0.9047700430063531, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 23891 + }, + { + "epoch": 0.23892, + "grad_norm": 1.10505228991655, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 23892 + }, + { + "epoch": 0.23893, + "grad_norm": 0.8821829797179381, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 23893 + }, + { + "epoch": 0.23894, + "grad_norm": 0.7941149912615949, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 23894 + }, + { + "epoch": 0.23895, + "grad_norm": 0.8697131405624108, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 23895 + }, + { + "epoch": 0.23896, + "grad_norm": 0.902187412970047, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 23896 + }, + { + "epoch": 0.23897, + "grad_norm": 1.050734562136658, + "learning_rate": 0.003, + "loss": 4.056, + "step": 23897 + }, + { + "epoch": 0.23898, + "grad_norm": 1.225133759485856, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 23898 + }, + { + "epoch": 0.23899, + "grad_norm": 0.7578595119970893, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 23899 + }, + { + "epoch": 0.239, + "grad_norm": 0.6591358689957916, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 23900 + }, + { + "epoch": 0.23901, + "grad_norm": 0.7213696754680423, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 23901 + }, + { + "epoch": 0.23902, + "grad_norm": 0.8941360339181375, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 23902 + }, + { + "epoch": 0.23903, + "grad_norm": 1.22272343566739, + "learning_rate": 0.003, + "loss": 4.084, + "step": 23903 + }, + { + "epoch": 0.23904, + "grad_norm": 0.8459556551219984, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 23904 + }, + { + "epoch": 0.23905, + "grad_norm": 0.6991213623821504, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 23905 + }, + { + "epoch": 0.23906, + "grad_norm": 0.6931773850988112, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 23906 + }, + { + "epoch": 0.23907, + "grad_norm": 0.7060989251339346, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 23907 + }, + { + "epoch": 0.23908, + "grad_norm": 0.6929639140943131, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 23908 + }, + { + "epoch": 0.23909, + "grad_norm": 0.7266225244111828, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 23909 + }, + { + "epoch": 0.2391, + "grad_norm": 0.8343071444496308, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 23910 + }, + { + "epoch": 0.23911, + "grad_norm": 0.9186558032919087, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 23911 + }, + { + "epoch": 0.23912, + "grad_norm": 1.1568098248618708, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 23912 + }, + { + "epoch": 0.23913, + "grad_norm": 1.1765447822534711, + "learning_rate": 0.003, + "loss": 4.039, + "step": 23913 + }, + { + "epoch": 0.23914, + "grad_norm": 0.888065067566215, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 23914 + }, + { + "epoch": 0.23915, + "grad_norm": 0.701921798296881, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 23915 + }, + { + "epoch": 0.23916, + "grad_norm": 0.7741549141027102, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 23916 + }, + { + "epoch": 0.23917, + "grad_norm": 0.7750561869548219, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 23917 + }, + { + "epoch": 0.23918, + "grad_norm": 0.7379412014013816, + "learning_rate": 0.003, + "loss": 4.028, + "step": 23918 + }, + { + "epoch": 0.23919, + "grad_norm": 0.667436205126358, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 23919 + }, + { + "epoch": 0.2392, + "grad_norm": 0.7006097258389764, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 23920 + }, + { + "epoch": 0.23921, + "grad_norm": 0.7199153764105264, + "learning_rate": 0.003, + "loss": 4.044, + "step": 23921 + }, + { + "epoch": 0.23922, + "grad_norm": 0.95338646918464, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 23922 + }, + { + "epoch": 0.23923, + "grad_norm": 1.1177306161222953, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 23923 + }, + { + "epoch": 0.23924, + "grad_norm": 0.8516971319288399, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 23924 + }, + { + "epoch": 0.23925, + "grad_norm": 0.7195189356447421, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 23925 + }, + { + "epoch": 0.23926, + "grad_norm": 0.8560185291034766, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 23926 + }, + { + "epoch": 0.23927, + "grad_norm": 0.9454836899241144, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 23927 + }, + { + "epoch": 0.23928, + "grad_norm": 0.8588839091992811, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 23928 + }, + { + "epoch": 0.23929, + "grad_norm": 0.8989397071990942, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 23929 + }, + { + "epoch": 0.2393, + "grad_norm": 0.943677117321925, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 23930 + }, + { + "epoch": 0.23931, + "grad_norm": 0.9073316878593287, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 23931 + }, + { + "epoch": 0.23932, + "grad_norm": 0.9422686652566484, + "learning_rate": 0.003, + "loss": 4.051, + "step": 23932 + }, + { + "epoch": 0.23933, + "grad_norm": 1.1831678230488505, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 23933 + }, + { + "epoch": 0.23934, + "grad_norm": 1.0941226389408711, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 23934 + }, + { + "epoch": 0.23935, + "grad_norm": 0.8278199637940014, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 23935 + }, + { + "epoch": 0.23936, + "grad_norm": 0.8303696103921486, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 23936 + }, + { + "epoch": 0.23937, + "grad_norm": 0.9464801771631769, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 23937 + }, + { + "epoch": 0.23938, + "grad_norm": 1.0895075966974028, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 23938 + }, + { + "epoch": 0.23939, + "grad_norm": 1.062865093566268, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 23939 + }, + { + "epoch": 0.2394, + "grad_norm": 1.0101956880586989, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 23940 + }, + { + "epoch": 0.23941, + "grad_norm": 1.0039406989902235, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 23941 + }, + { + "epoch": 0.23942, + "grad_norm": 1.0049457113166653, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 23942 + }, + { + "epoch": 0.23943, + "grad_norm": 1.1890442050350323, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 23943 + }, + { + "epoch": 0.23944, + "grad_norm": 0.9919631383837988, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 23944 + }, + { + "epoch": 0.23945, + "grad_norm": 1.031884376874631, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 23945 + }, + { + "epoch": 0.23946, + "grad_norm": 1.0281850575674434, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 23946 + }, + { + "epoch": 0.23947, + "grad_norm": 0.832043122470827, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 23947 + }, + { + "epoch": 0.23948, + "grad_norm": 0.8742482864584319, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 23948 + }, + { + "epoch": 0.23949, + "grad_norm": 0.7322700338963198, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 23949 + }, + { + "epoch": 0.2395, + "grad_norm": 0.7692639399506629, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 23950 + }, + { + "epoch": 0.23951, + "grad_norm": 0.9281173395562962, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 23951 + }, + { + "epoch": 0.23952, + "grad_norm": 1.1837769876789066, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 23952 + }, + { + "epoch": 0.23953, + "grad_norm": 0.8848549376178267, + "learning_rate": 0.003, + "loss": 4.063, + "step": 23953 + }, + { + "epoch": 0.23954, + "grad_norm": 0.9993075832636185, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 23954 + }, + { + "epoch": 0.23955, + "grad_norm": 1.1526762999632307, + "learning_rate": 0.003, + "loss": 4.0872, + "step": 23955 + }, + { + "epoch": 0.23956, + "grad_norm": 0.8215830345725534, + "learning_rate": 0.003, + "loss": 4.086, + "step": 23956 + }, + { + "epoch": 0.23957, + "grad_norm": 0.779914910608157, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 23957 + }, + { + "epoch": 0.23958, + "grad_norm": 0.8308680009635977, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 23958 + }, + { + "epoch": 0.23959, + "grad_norm": 0.8479161618904443, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 23959 + }, + { + "epoch": 0.2396, + "grad_norm": 0.9078884902641015, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 23960 + }, + { + "epoch": 0.23961, + "grad_norm": 0.9268873818269759, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 23961 + }, + { + "epoch": 0.23962, + "grad_norm": 1.0509853299007614, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 23962 + }, + { + "epoch": 0.23963, + "grad_norm": 1.1278199704024026, + "learning_rate": 0.003, + "loss": 4.028, + "step": 23963 + }, + { + "epoch": 0.23964, + "grad_norm": 1.1672871698584926, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 23964 + }, + { + "epoch": 0.23965, + "grad_norm": 0.8818154374994653, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 23965 + }, + { + "epoch": 0.23966, + "grad_norm": 0.795286769845189, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 23966 + }, + { + "epoch": 0.23967, + "grad_norm": 0.7486297084821716, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 23967 + }, + { + "epoch": 0.23968, + "grad_norm": 0.763368071363066, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 23968 + }, + { + "epoch": 0.23969, + "grad_norm": 0.8735000690838187, + "learning_rate": 0.003, + "loss": 4.054, + "step": 23969 + }, + { + "epoch": 0.2397, + "grad_norm": 0.8884296537657199, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 23970 + }, + { + "epoch": 0.23971, + "grad_norm": 0.8925903222962948, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 23971 + }, + { + "epoch": 0.23972, + "grad_norm": 1.0062558089726468, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 23972 + }, + { + "epoch": 0.23973, + "grad_norm": 1.0104350210104025, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 23973 + }, + { + "epoch": 0.23974, + "grad_norm": 0.9477021336502717, + "learning_rate": 0.003, + "loss": 4.068, + "step": 23974 + }, + { + "epoch": 0.23975, + "grad_norm": 0.9884272259898024, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 23975 + }, + { + "epoch": 0.23976, + "grad_norm": 1.0415007609763645, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 23976 + }, + { + "epoch": 0.23977, + "grad_norm": 0.9913468290844657, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 23977 + }, + { + "epoch": 0.23978, + "grad_norm": 0.990606825698148, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 23978 + }, + { + "epoch": 0.23979, + "grad_norm": 0.9052510115102118, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 23979 + }, + { + "epoch": 0.2398, + "grad_norm": 0.8807073781520176, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 23980 + }, + { + "epoch": 0.23981, + "grad_norm": 0.8737174013016102, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 23981 + }, + { + "epoch": 0.23982, + "grad_norm": 0.9016961800169254, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 23982 + }, + { + "epoch": 0.23983, + "grad_norm": 0.8186522578905069, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 23983 + }, + { + "epoch": 0.23984, + "grad_norm": 0.687195031125146, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 23984 + }, + { + "epoch": 0.23985, + "grad_norm": 0.6848577582069004, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 23985 + }, + { + "epoch": 0.23986, + "grad_norm": 0.7811671403393963, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 23986 + }, + { + "epoch": 0.23987, + "grad_norm": 0.9379284798848163, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 23987 + }, + { + "epoch": 0.23988, + "grad_norm": 1.2349473219242542, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 23988 + }, + { + "epoch": 0.23989, + "grad_norm": 0.8519903952432376, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 23989 + }, + { + "epoch": 0.2399, + "grad_norm": 0.9152604046045248, + "learning_rate": 0.003, + "loss": 4.0861, + "step": 23990 + }, + { + "epoch": 0.23991, + "grad_norm": 0.7908477818215786, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 23991 + }, + { + "epoch": 0.23992, + "grad_norm": 0.6752846025344641, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 23992 + }, + { + "epoch": 0.23993, + "grad_norm": 0.6130458187944698, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 23993 + }, + { + "epoch": 0.23994, + "grad_norm": 0.7017133338001408, + "learning_rate": 0.003, + "loss": 4.051, + "step": 23994 + }, + { + "epoch": 0.23995, + "grad_norm": 0.9462146215640923, + "learning_rate": 0.003, + "loss": 4.036, + "step": 23995 + }, + { + "epoch": 0.23996, + "grad_norm": 1.2356017573618747, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 23996 + }, + { + "epoch": 0.23997, + "grad_norm": 0.8376309062213451, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 23997 + }, + { + "epoch": 0.23998, + "grad_norm": 0.7806909865944034, + "learning_rate": 0.003, + "loss": 4.047, + "step": 23998 + }, + { + "epoch": 0.23999, + "grad_norm": 0.7623008924102859, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 23999 + }, + { + "epoch": 0.24, + "grad_norm": 0.7429570086202645, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 24000 + }, + { + "epoch": 0.24001, + "grad_norm": 0.891403679073063, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 24001 + }, + { + "epoch": 0.24002, + "grad_norm": 1.0607137502136506, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 24002 + }, + { + "epoch": 0.24003, + "grad_norm": 0.8462446955922235, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 24003 + }, + { + "epoch": 0.24004, + "grad_norm": 0.8724979271571859, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 24004 + }, + { + "epoch": 0.24005, + "grad_norm": 1.0165472860385638, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 24005 + }, + { + "epoch": 0.24006, + "grad_norm": 0.808744868229454, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 24006 + }, + { + "epoch": 0.24007, + "grad_norm": 0.6461027697386886, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 24007 + }, + { + "epoch": 0.24008, + "grad_norm": 0.7257636692510661, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 24008 + }, + { + "epoch": 0.24009, + "grad_norm": 0.7879524264192855, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24009 + }, + { + "epoch": 0.2401, + "grad_norm": 0.8281139836294894, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 24010 + }, + { + "epoch": 0.24011, + "grad_norm": 0.8052142616607947, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 24011 + }, + { + "epoch": 0.24012, + "grad_norm": 0.8682225494515519, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 24012 + }, + { + "epoch": 0.24013, + "grad_norm": 0.9463939182510178, + "learning_rate": 0.003, + "loss": 4.052, + "step": 24013 + }, + { + "epoch": 0.24014, + "grad_norm": 0.9358372485485583, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24014 + }, + { + "epoch": 0.24015, + "grad_norm": 0.8914743728838098, + "learning_rate": 0.003, + "loss": 4.021, + "step": 24015 + }, + { + "epoch": 0.24016, + "grad_norm": 0.82966190079915, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 24016 + }, + { + "epoch": 0.24017, + "grad_norm": 0.9872198219003735, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 24017 + }, + { + "epoch": 0.24018, + "grad_norm": 1.1194449032240241, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 24018 + }, + { + "epoch": 0.24019, + "grad_norm": 0.8812518925645841, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 24019 + }, + { + "epoch": 0.2402, + "grad_norm": 0.8529573273868983, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 24020 + }, + { + "epoch": 0.24021, + "grad_norm": 0.8574789545078222, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 24021 + }, + { + "epoch": 0.24022, + "grad_norm": 0.9072276055402922, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 24022 + }, + { + "epoch": 0.24023, + "grad_norm": 0.8182505805919096, + "learning_rate": 0.003, + "loss": 4.037, + "step": 24023 + }, + { + "epoch": 0.24024, + "grad_norm": 0.8598264411315909, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 24024 + }, + { + "epoch": 0.24025, + "grad_norm": 0.9191350454275453, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24025 + }, + { + "epoch": 0.24026, + "grad_norm": 1.1681167880550225, + "learning_rate": 0.003, + "loss": 4.053, + "step": 24026 + }, + { + "epoch": 0.24027, + "grad_norm": 1.1422904363946265, + "learning_rate": 0.003, + "loss": 4.086, + "step": 24027 + }, + { + "epoch": 0.24028, + "grad_norm": 0.9968835088333793, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 24028 + }, + { + "epoch": 0.24029, + "grad_norm": 0.9608823465490182, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 24029 + }, + { + "epoch": 0.2403, + "grad_norm": 0.9110222516140425, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 24030 + }, + { + "epoch": 0.24031, + "grad_norm": 0.8406450963945513, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 24031 + }, + { + "epoch": 0.24032, + "grad_norm": 0.7207218593477188, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 24032 + }, + { + "epoch": 0.24033, + "grad_norm": 0.7292615696924318, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 24033 + }, + { + "epoch": 0.24034, + "grad_norm": 0.7617648139464747, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 24034 + }, + { + "epoch": 0.24035, + "grad_norm": 0.8138200955879704, + "learning_rate": 0.003, + "loss": 4.052, + "step": 24035 + }, + { + "epoch": 0.24036, + "grad_norm": 0.8842014737287486, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 24036 + }, + { + "epoch": 0.24037, + "grad_norm": 0.9080577727404103, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 24037 + }, + { + "epoch": 0.24038, + "grad_norm": 0.9772843637010848, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 24038 + }, + { + "epoch": 0.24039, + "grad_norm": 0.9448048142964297, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 24039 + }, + { + "epoch": 0.2404, + "grad_norm": 0.8908336163678491, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 24040 + }, + { + "epoch": 0.24041, + "grad_norm": 0.8247061209426142, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 24041 + }, + { + "epoch": 0.24042, + "grad_norm": 0.7057120894638902, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 24042 + }, + { + "epoch": 0.24043, + "grad_norm": 0.7753361667428217, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 24043 + }, + { + "epoch": 0.24044, + "grad_norm": 0.8642769111189467, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 24044 + }, + { + "epoch": 0.24045, + "grad_norm": 0.8105010132379571, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 24045 + }, + { + "epoch": 0.24046, + "grad_norm": 0.9321346138541184, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 24046 + }, + { + "epoch": 0.24047, + "grad_norm": 1.172618432642941, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 24047 + }, + { + "epoch": 0.24048, + "grad_norm": 0.8845395641424207, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 24048 + }, + { + "epoch": 0.24049, + "grad_norm": 0.8266929907203441, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 24049 + }, + { + "epoch": 0.2405, + "grad_norm": 0.7570682587469636, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24050 + }, + { + "epoch": 0.24051, + "grad_norm": 0.7661694990978863, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 24051 + }, + { + "epoch": 0.24052, + "grad_norm": 0.8754538021049573, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 24052 + }, + { + "epoch": 0.24053, + "grad_norm": 0.9897165739946506, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24053 + }, + { + "epoch": 0.24054, + "grad_norm": 1.0698247632615983, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 24054 + }, + { + "epoch": 0.24055, + "grad_norm": 1.0627201411958178, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 24055 + }, + { + "epoch": 0.24056, + "grad_norm": 0.9362564997402691, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 24056 + }, + { + "epoch": 0.24057, + "grad_norm": 0.8467236984263127, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 24057 + }, + { + "epoch": 0.24058, + "grad_norm": 0.7607122760446369, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 24058 + }, + { + "epoch": 0.24059, + "grad_norm": 0.8469732919983674, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 24059 + }, + { + "epoch": 0.2406, + "grad_norm": 0.850599671449039, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 24060 + }, + { + "epoch": 0.24061, + "grad_norm": 0.797701451558165, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 24061 + }, + { + "epoch": 0.24062, + "grad_norm": 0.7576945317355044, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24062 + }, + { + "epoch": 0.24063, + "grad_norm": 0.813543565878261, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 24063 + }, + { + "epoch": 0.24064, + "grad_norm": 0.9144138688692207, + "learning_rate": 0.003, + "loss": 4.016, + "step": 24064 + }, + { + "epoch": 0.24065, + "grad_norm": 0.9599663133224493, + "learning_rate": 0.003, + "loss": 4.0972, + "step": 24065 + }, + { + "epoch": 0.24066, + "grad_norm": 0.9303293374328918, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 24066 + }, + { + "epoch": 0.24067, + "grad_norm": 1.126293048651876, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 24067 + }, + { + "epoch": 0.24068, + "grad_norm": 0.9795215225076365, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 24068 + }, + { + "epoch": 0.24069, + "grad_norm": 0.9155905487857411, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 24069 + }, + { + "epoch": 0.2407, + "grad_norm": 0.8820146983859433, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 24070 + }, + { + "epoch": 0.24071, + "grad_norm": 0.8586767212271174, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 24071 + }, + { + "epoch": 0.24072, + "grad_norm": 0.839633664291364, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 24072 + }, + { + "epoch": 0.24073, + "grad_norm": 0.8261316751111537, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 24073 + }, + { + "epoch": 0.24074, + "grad_norm": 0.8610115053803861, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 24074 + }, + { + "epoch": 0.24075, + "grad_norm": 0.9258687684645923, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 24075 + }, + { + "epoch": 0.24076, + "grad_norm": 0.9160259048291821, + "learning_rate": 0.003, + "loss": 4.1049, + "step": 24076 + }, + { + "epoch": 0.24077, + "grad_norm": 0.8630133468852037, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 24077 + }, + { + "epoch": 0.24078, + "grad_norm": 0.9624628384547445, + "learning_rate": 0.003, + "loss": 4.075, + "step": 24078 + }, + { + "epoch": 0.24079, + "grad_norm": 1.2035624116654873, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 24079 + }, + { + "epoch": 0.2408, + "grad_norm": 0.9286162687609989, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 24080 + }, + { + "epoch": 0.24081, + "grad_norm": 0.8938556852424284, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 24081 + }, + { + "epoch": 0.24082, + "grad_norm": 0.9187324482863597, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24082 + }, + { + "epoch": 0.24083, + "grad_norm": 0.8986330453597926, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 24083 + }, + { + "epoch": 0.24084, + "grad_norm": 0.7920071130393982, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 24084 + }, + { + "epoch": 0.24085, + "grad_norm": 0.8262623026327576, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24085 + }, + { + "epoch": 0.24086, + "grad_norm": 0.9509769629925109, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 24086 + }, + { + "epoch": 0.24087, + "grad_norm": 0.9078900604210004, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 24087 + }, + { + "epoch": 0.24088, + "grad_norm": 0.8671285347289105, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 24088 + }, + { + "epoch": 0.24089, + "grad_norm": 0.7877398246175549, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 24089 + }, + { + "epoch": 0.2409, + "grad_norm": 0.9602516606503598, + "learning_rate": 0.003, + "loss": 4.034, + "step": 24090 + }, + { + "epoch": 0.24091, + "grad_norm": 1.2133207349124513, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24091 + }, + { + "epoch": 0.24092, + "grad_norm": 0.9467568404172214, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 24092 + }, + { + "epoch": 0.24093, + "grad_norm": 0.9152984018909553, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 24093 + }, + { + "epoch": 0.24094, + "grad_norm": 0.8802671079002148, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 24094 + }, + { + "epoch": 0.24095, + "grad_norm": 0.8403928829283397, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24095 + }, + { + "epoch": 0.24096, + "grad_norm": 0.8368004699832426, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 24096 + }, + { + "epoch": 0.24097, + "grad_norm": 0.8841447491975679, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 24097 + }, + { + "epoch": 0.24098, + "grad_norm": 0.8676446276360308, + "learning_rate": 0.003, + "loss": 4.056, + "step": 24098 + }, + { + "epoch": 0.24099, + "grad_norm": 0.6923580289860628, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 24099 + }, + { + "epoch": 0.241, + "grad_norm": 0.7559695744672192, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 24100 + }, + { + "epoch": 0.24101, + "grad_norm": 0.7833923172909907, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 24101 + }, + { + "epoch": 0.24102, + "grad_norm": 0.703475516395984, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 24102 + }, + { + "epoch": 0.24103, + "grad_norm": 0.7045020036834209, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 24103 + }, + { + "epoch": 0.24104, + "grad_norm": 0.8445827035933333, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 24104 + }, + { + "epoch": 0.24105, + "grad_norm": 1.0090209676329296, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 24105 + }, + { + "epoch": 0.24106, + "grad_norm": 1.0583835261738044, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 24106 + }, + { + "epoch": 0.24107, + "grad_norm": 1.1115520289521503, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 24107 + }, + { + "epoch": 0.24108, + "grad_norm": 0.9322299807031126, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 24108 + }, + { + "epoch": 0.24109, + "grad_norm": 0.8429793769133921, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 24109 + }, + { + "epoch": 0.2411, + "grad_norm": 0.8528789339592429, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 24110 + }, + { + "epoch": 0.24111, + "grad_norm": 0.9193852572100761, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24111 + }, + { + "epoch": 0.24112, + "grad_norm": 0.9153164852877265, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 24112 + }, + { + "epoch": 0.24113, + "grad_norm": 0.9015085025933132, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 24113 + }, + { + "epoch": 0.24114, + "grad_norm": 0.8491972547100965, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 24114 + }, + { + "epoch": 0.24115, + "grad_norm": 0.8471075219888548, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 24115 + }, + { + "epoch": 0.24116, + "grad_norm": 0.8113950724145746, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 24116 + }, + { + "epoch": 0.24117, + "grad_norm": 0.9034267834812154, + "learning_rate": 0.003, + "loss": 4.045, + "step": 24117 + }, + { + "epoch": 0.24118, + "grad_norm": 0.970666594247184, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 24118 + }, + { + "epoch": 0.24119, + "grad_norm": 1.161198757457236, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 24119 + }, + { + "epoch": 0.2412, + "grad_norm": 1.0344792926560433, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 24120 + }, + { + "epoch": 0.24121, + "grad_norm": 1.0425612444621266, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 24121 + }, + { + "epoch": 0.24122, + "grad_norm": 0.9970462459624021, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 24122 + }, + { + "epoch": 0.24123, + "grad_norm": 0.9870734396806171, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 24123 + }, + { + "epoch": 0.24124, + "grad_norm": 1.0518746016196538, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 24124 + }, + { + "epoch": 0.24125, + "grad_norm": 0.9057410869551177, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 24125 + }, + { + "epoch": 0.24126, + "grad_norm": 0.9588833475371494, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24126 + }, + { + "epoch": 0.24127, + "grad_norm": 1.0643234806375812, + "learning_rate": 0.003, + "loss": 4.036, + "step": 24127 + }, + { + "epoch": 0.24128, + "grad_norm": 0.9236134913779293, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 24128 + }, + { + "epoch": 0.24129, + "grad_norm": 1.0624944274734087, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 24129 + }, + { + "epoch": 0.2413, + "grad_norm": 0.8791159413056923, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 24130 + }, + { + "epoch": 0.24131, + "grad_norm": 0.858911480960589, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 24131 + }, + { + "epoch": 0.24132, + "grad_norm": 0.9598499087669667, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 24132 + }, + { + "epoch": 0.24133, + "grad_norm": 1.2988452788383125, + "learning_rate": 0.003, + "loss": 4.046, + "step": 24133 + }, + { + "epoch": 0.24134, + "grad_norm": 0.7033876645181483, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 24134 + }, + { + "epoch": 0.24135, + "grad_norm": 0.6671360324346581, + "learning_rate": 0.003, + "loss": 4.058, + "step": 24135 + }, + { + "epoch": 0.24136, + "grad_norm": 0.6994956701561872, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 24136 + }, + { + "epoch": 0.24137, + "grad_norm": 0.707419260028128, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 24137 + }, + { + "epoch": 0.24138, + "grad_norm": 0.7537736726141795, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 24138 + }, + { + "epoch": 0.24139, + "grad_norm": 0.8303890492377798, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 24139 + }, + { + "epoch": 0.2414, + "grad_norm": 0.8807602694364087, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 24140 + }, + { + "epoch": 0.24141, + "grad_norm": 0.9473322044883445, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 24141 + }, + { + "epoch": 0.24142, + "grad_norm": 0.9988504915189392, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 24142 + }, + { + "epoch": 0.24143, + "grad_norm": 0.9432456202794777, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 24143 + }, + { + "epoch": 0.24144, + "grad_norm": 0.7894190853104446, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 24144 + }, + { + "epoch": 0.24145, + "grad_norm": 0.6556073314689115, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 24145 + }, + { + "epoch": 0.24146, + "grad_norm": 0.6966591763929674, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 24146 + }, + { + "epoch": 0.24147, + "grad_norm": 0.6634421876500709, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 24147 + }, + { + "epoch": 0.24148, + "grad_norm": 0.5720177451785374, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 24148 + }, + { + "epoch": 0.24149, + "grad_norm": 0.6204068925130867, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 24149 + }, + { + "epoch": 0.2415, + "grad_norm": 0.651967665265683, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 24150 + }, + { + "epoch": 0.24151, + "grad_norm": 0.6953440635861076, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 24151 + }, + { + "epoch": 0.24152, + "grad_norm": 0.647135884978751, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 24152 + }, + { + "epoch": 0.24153, + "grad_norm": 0.674243564605426, + "learning_rate": 0.003, + "loss": 4.032, + "step": 24153 + }, + { + "epoch": 0.24154, + "grad_norm": 0.8310934764623221, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 24154 + }, + { + "epoch": 0.24155, + "grad_norm": 0.9243250824534507, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 24155 + }, + { + "epoch": 0.24156, + "grad_norm": 0.937391872060682, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 24156 + }, + { + "epoch": 0.24157, + "grad_norm": 1.2233535253653858, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 24157 + }, + { + "epoch": 0.24158, + "grad_norm": 0.9254489179576979, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 24158 + }, + { + "epoch": 0.24159, + "grad_norm": 0.8716597696921334, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 24159 + }, + { + "epoch": 0.2416, + "grad_norm": 0.7376596928044449, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 24160 + }, + { + "epoch": 0.24161, + "grad_norm": 0.6792144436005407, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 24161 + }, + { + "epoch": 0.24162, + "grad_norm": 0.7410103196910114, + "learning_rate": 0.003, + "loss": 4.065, + "step": 24162 + }, + { + "epoch": 0.24163, + "grad_norm": 0.7761152322175966, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 24163 + }, + { + "epoch": 0.24164, + "grad_norm": 0.8557496137302675, + "learning_rate": 0.003, + "loss": 4.01, + "step": 24164 + }, + { + "epoch": 0.24165, + "grad_norm": 0.9968533721175374, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 24165 + }, + { + "epoch": 0.24166, + "grad_norm": 1.0736602698831794, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24166 + }, + { + "epoch": 0.24167, + "grad_norm": 0.9335017312424864, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 24167 + }, + { + "epoch": 0.24168, + "grad_norm": 1.002279588368271, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 24168 + }, + { + "epoch": 0.24169, + "grad_norm": 1.0304395048539323, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 24169 + }, + { + "epoch": 0.2417, + "grad_norm": 0.9004535679958675, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 24170 + }, + { + "epoch": 0.24171, + "grad_norm": 0.7623714937196778, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 24171 + }, + { + "epoch": 0.24172, + "grad_norm": 0.7925348605122874, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 24172 + }, + { + "epoch": 0.24173, + "grad_norm": 0.8022218942854448, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 24173 + }, + { + "epoch": 0.24174, + "grad_norm": 0.9715698907444348, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 24174 + }, + { + "epoch": 0.24175, + "grad_norm": 1.167788691350063, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 24175 + }, + { + "epoch": 0.24176, + "grad_norm": 0.9657799070341617, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 24176 + }, + { + "epoch": 0.24177, + "grad_norm": 1.0941027355318231, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 24177 + }, + { + "epoch": 0.24178, + "grad_norm": 1.0371006729763246, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 24178 + }, + { + "epoch": 0.24179, + "grad_norm": 0.8953247629232066, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 24179 + }, + { + "epoch": 0.2418, + "grad_norm": 1.0244448704305995, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 24180 + }, + { + "epoch": 0.24181, + "grad_norm": 1.0836058961835568, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 24181 + }, + { + "epoch": 0.24182, + "grad_norm": 1.0545956331127269, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 24182 + }, + { + "epoch": 0.24183, + "grad_norm": 1.0061748330916442, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24183 + }, + { + "epoch": 0.24184, + "grad_norm": 0.956632143181476, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 24184 + }, + { + "epoch": 0.24185, + "grad_norm": 0.966335582308729, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 24185 + }, + { + "epoch": 0.24186, + "grad_norm": 1.053920106041618, + "learning_rate": 0.003, + "loss": 4.0851, + "step": 24186 + }, + { + "epoch": 0.24187, + "grad_norm": 1.1552608542545688, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 24187 + }, + { + "epoch": 0.24188, + "grad_norm": 0.9568289460956998, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 24188 + }, + { + "epoch": 0.24189, + "grad_norm": 1.0775055250530354, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 24189 + }, + { + "epoch": 0.2419, + "grad_norm": 0.8990994611987276, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 24190 + }, + { + "epoch": 0.24191, + "grad_norm": 0.6953617355084967, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 24191 + }, + { + "epoch": 0.24192, + "grad_norm": 0.7683277587199077, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 24192 + }, + { + "epoch": 0.24193, + "grad_norm": 0.7317281515955276, + "learning_rate": 0.003, + "loss": 4.052, + "step": 24193 + }, + { + "epoch": 0.24194, + "grad_norm": 0.8010020901327233, + "learning_rate": 0.003, + "loss": 4.043, + "step": 24194 + }, + { + "epoch": 0.24195, + "grad_norm": 0.8871916582791392, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 24195 + }, + { + "epoch": 0.24196, + "grad_norm": 1.0477912538401217, + "learning_rate": 0.003, + "loss": 4.0997, + "step": 24196 + }, + { + "epoch": 0.24197, + "grad_norm": 1.179676640863997, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 24197 + }, + { + "epoch": 0.24198, + "grad_norm": 0.7469104539208697, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 24198 + }, + { + "epoch": 0.24199, + "grad_norm": 0.6710772028299249, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 24199 + }, + { + "epoch": 0.242, + "grad_norm": 0.642294993470878, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 24200 + }, + { + "epoch": 0.24201, + "grad_norm": 0.6277056825248893, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 24201 + }, + { + "epoch": 0.24202, + "grad_norm": 0.5732283591920163, + "learning_rate": 0.003, + "loss": 4.034, + "step": 24202 + }, + { + "epoch": 0.24203, + "grad_norm": 0.5988497386267027, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 24203 + }, + { + "epoch": 0.24204, + "grad_norm": 0.676491342829408, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 24204 + }, + { + "epoch": 0.24205, + "grad_norm": 0.6355924719006589, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 24205 + }, + { + "epoch": 0.24206, + "grad_norm": 0.7630946632554243, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 24206 + }, + { + "epoch": 0.24207, + "grad_norm": 0.9743088613115153, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 24207 + }, + { + "epoch": 0.24208, + "grad_norm": 1.1462597885369197, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24208 + }, + { + "epoch": 0.24209, + "grad_norm": 0.7463667024549048, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 24209 + }, + { + "epoch": 0.2421, + "grad_norm": 0.5975735189894466, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 24210 + }, + { + "epoch": 0.24211, + "grad_norm": 0.5889850735651799, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 24211 + }, + { + "epoch": 0.24212, + "grad_norm": 0.6778442522801655, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 24212 + }, + { + "epoch": 0.24213, + "grad_norm": 0.6743831995812118, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 24213 + }, + { + "epoch": 0.24214, + "grad_norm": 0.6537179332625822, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 24214 + }, + { + "epoch": 0.24215, + "grad_norm": 0.6863728873890239, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24215 + }, + { + "epoch": 0.24216, + "grad_norm": 0.7716472549885176, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 24216 + }, + { + "epoch": 0.24217, + "grad_norm": 0.8782807038157397, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 24217 + }, + { + "epoch": 0.24218, + "grad_norm": 0.9787453524008378, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 24218 + }, + { + "epoch": 0.24219, + "grad_norm": 1.0638054005236948, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 24219 + }, + { + "epoch": 0.2422, + "grad_norm": 1.0092302853948862, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 24220 + }, + { + "epoch": 0.24221, + "grad_norm": 1.034893608291703, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 24221 + }, + { + "epoch": 0.24222, + "grad_norm": 0.8978393050177532, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 24222 + }, + { + "epoch": 0.24223, + "grad_norm": 0.7945652743714896, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 24223 + }, + { + "epoch": 0.24224, + "grad_norm": 0.9326021576163757, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 24224 + }, + { + "epoch": 0.24225, + "grad_norm": 1.068358400059819, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 24225 + }, + { + "epoch": 0.24226, + "grad_norm": 1.0259352143570122, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 24226 + }, + { + "epoch": 0.24227, + "grad_norm": 1.0682534945349964, + "learning_rate": 0.003, + "loss": 4.082, + "step": 24227 + }, + { + "epoch": 0.24228, + "grad_norm": 1.0588508751646972, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 24228 + }, + { + "epoch": 0.24229, + "grad_norm": 1.1630363174986336, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 24229 + }, + { + "epoch": 0.2423, + "grad_norm": 1.0040485555428182, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 24230 + }, + { + "epoch": 0.24231, + "grad_norm": 1.1284992210312648, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 24231 + }, + { + "epoch": 0.24232, + "grad_norm": 1.1364189822818151, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 24232 + }, + { + "epoch": 0.24233, + "grad_norm": 0.8012730611671595, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 24233 + }, + { + "epoch": 0.24234, + "grad_norm": 0.7666971470983873, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 24234 + }, + { + "epoch": 0.24235, + "grad_norm": 0.8723108545176743, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 24235 + }, + { + "epoch": 0.24236, + "grad_norm": 0.8902507190691931, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 24236 + }, + { + "epoch": 0.24237, + "grad_norm": 1.054389849590314, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 24237 + }, + { + "epoch": 0.24238, + "grad_norm": 1.0113354389044351, + "learning_rate": 0.003, + "loss": 4.062, + "step": 24238 + }, + { + "epoch": 0.24239, + "grad_norm": 1.1018681673602049, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 24239 + }, + { + "epoch": 0.2424, + "grad_norm": 1.012787456329008, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 24240 + }, + { + "epoch": 0.24241, + "grad_norm": 0.9098124280491132, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 24241 + }, + { + "epoch": 0.24242, + "grad_norm": 0.9162320998383073, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 24242 + }, + { + "epoch": 0.24243, + "grad_norm": 0.9171120967426635, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 24243 + }, + { + "epoch": 0.24244, + "grad_norm": 0.8288125447727074, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 24244 + }, + { + "epoch": 0.24245, + "grad_norm": 0.7921256702533814, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 24245 + }, + { + "epoch": 0.24246, + "grad_norm": 0.6547228551426508, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 24246 + }, + { + "epoch": 0.24247, + "grad_norm": 0.6154267253320896, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 24247 + }, + { + "epoch": 0.24248, + "grad_norm": 0.5655510060592761, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 24248 + }, + { + "epoch": 0.24249, + "grad_norm": 0.5656358907799476, + "learning_rate": 0.003, + "loss": 4.04, + "step": 24249 + }, + { + "epoch": 0.2425, + "grad_norm": 0.6595247236447949, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 24250 + }, + { + "epoch": 0.24251, + "grad_norm": 0.7606006796457544, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 24251 + }, + { + "epoch": 0.24252, + "grad_norm": 0.7728654368369797, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 24252 + }, + { + "epoch": 0.24253, + "grad_norm": 0.8125461978029466, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 24253 + }, + { + "epoch": 0.24254, + "grad_norm": 1.0074276727792024, + "learning_rate": 0.003, + "loss": 4.039, + "step": 24254 + }, + { + "epoch": 0.24255, + "grad_norm": 1.3180408203580654, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24255 + }, + { + "epoch": 0.24256, + "grad_norm": 0.8490214284262066, + "learning_rate": 0.003, + "loss": 4.032, + "step": 24256 + }, + { + "epoch": 0.24257, + "grad_norm": 0.8766396399202889, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 24257 + }, + { + "epoch": 0.24258, + "grad_norm": 0.7682491727145239, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 24258 + }, + { + "epoch": 0.24259, + "grad_norm": 0.7727747110084662, + "learning_rate": 0.003, + "loss": 4.002, + "step": 24259 + }, + { + "epoch": 0.2426, + "grad_norm": 0.9023445328646724, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 24260 + }, + { + "epoch": 0.24261, + "grad_norm": 1.0851170341260725, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 24261 + }, + { + "epoch": 0.24262, + "grad_norm": 1.1201151544391619, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 24262 + }, + { + "epoch": 0.24263, + "grad_norm": 0.7282776761565849, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 24263 + }, + { + "epoch": 0.24264, + "grad_norm": 0.7703995668135574, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 24264 + }, + { + "epoch": 0.24265, + "grad_norm": 0.9032639226083838, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 24265 + }, + { + "epoch": 0.24266, + "grad_norm": 1.002913863602453, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 24266 + }, + { + "epoch": 0.24267, + "grad_norm": 1.052561614646673, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 24267 + }, + { + "epoch": 0.24268, + "grad_norm": 0.9347242590951947, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 24268 + }, + { + "epoch": 0.24269, + "grad_norm": 0.9820523379471302, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 24269 + }, + { + "epoch": 0.2427, + "grad_norm": 1.0924795750111573, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24270 + }, + { + "epoch": 0.24271, + "grad_norm": 1.0393275879710435, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 24271 + }, + { + "epoch": 0.24272, + "grad_norm": 0.9728710896968386, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 24272 + }, + { + "epoch": 0.24273, + "grad_norm": 0.8816673775283153, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24273 + }, + { + "epoch": 0.24274, + "grad_norm": 1.0100960014260774, + "learning_rate": 0.003, + "loss": 4.07, + "step": 24274 + }, + { + "epoch": 0.24275, + "grad_norm": 1.2296447550917293, + "learning_rate": 0.003, + "loss": 4.048, + "step": 24275 + }, + { + "epoch": 0.24276, + "grad_norm": 0.8795849839402438, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24276 + }, + { + "epoch": 0.24277, + "grad_norm": 0.7322407848306921, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 24277 + }, + { + "epoch": 0.24278, + "grad_norm": 0.7325024425579449, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 24278 + }, + { + "epoch": 0.24279, + "grad_norm": 0.6664534797868003, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 24279 + }, + { + "epoch": 0.2428, + "grad_norm": 0.6453087650570909, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 24280 + }, + { + "epoch": 0.24281, + "grad_norm": 0.6923670370070717, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 24281 + }, + { + "epoch": 0.24282, + "grad_norm": 0.6484054864314717, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 24282 + }, + { + "epoch": 0.24283, + "grad_norm": 0.7605076425691207, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 24283 + }, + { + "epoch": 0.24284, + "grad_norm": 0.8429779027342893, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 24284 + }, + { + "epoch": 0.24285, + "grad_norm": 0.7898554949164536, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 24285 + }, + { + "epoch": 0.24286, + "grad_norm": 0.6933544366141227, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 24286 + }, + { + "epoch": 0.24287, + "grad_norm": 0.6796303861773102, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 24287 + }, + { + "epoch": 0.24288, + "grad_norm": 0.6029291372376555, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 24288 + }, + { + "epoch": 0.24289, + "grad_norm": 0.5881464980014078, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 24289 + }, + { + "epoch": 0.2429, + "grad_norm": 0.651136758179586, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 24290 + }, + { + "epoch": 0.24291, + "grad_norm": 0.9153731252558913, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 24291 + }, + { + "epoch": 0.24292, + "grad_norm": 1.3474063057734336, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 24292 + }, + { + "epoch": 0.24293, + "grad_norm": 0.677031978897014, + "learning_rate": 0.003, + "loss": 3.997, + "step": 24293 + }, + { + "epoch": 0.24294, + "grad_norm": 0.6731321064724827, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 24294 + }, + { + "epoch": 0.24295, + "grad_norm": 0.6798318244558856, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 24295 + }, + { + "epoch": 0.24296, + "grad_norm": 0.6375843831806745, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 24296 + }, + { + "epoch": 0.24297, + "grad_norm": 0.6729926264270194, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 24297 + }, + { + "epoch": 0.24298, + "grad_norm": 0.7324733622903582, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 24298 + }, + { + "epoch": 0.24299, + "grad_norm": 0.9069620571468769, + "learning_rate": 0.003, + "loss": 4.031, + "step": 24299 + }, + { + "epoch": 0.243, + "grad_norm": 0.9972826409042985, + "learning_rate": 0.003, + "loss": 4.047, + "step": 24300 + }, + { + "epoch": 0.24301, + "grad_norm": 1.1127576757734765, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 24301 + }, + { + "epoch": 0.24302, + "grad_norm": 1.0143016483866745, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 24302 + }, + { + "epoch": 0.24303, + "grad_norm": 0.9381428663087634, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 24303 + }, + { + "epoch": 0.24304, + "grad_norm": 1.0541222790608915, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 24304 + }, + { + "epoch": 0.24305, + "grad_norm": 1.220678775709384, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 24305 + }, + { + "epoch": 0.24306, + "grad_norm": 0.8765027876004091, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 24306 + }, + { + "epoch": 0.24307, + "grad_norm": 0.9817730369750407, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 24307 + }, + { + "epoch": 0.24308, + "grad_norm": 1.1872241326313566, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 24308 + }, + { + "epoch": 0.24309, + "grad_norm": 0.782724219798603, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 24309 + }, + { + "epoch": 0.2431, + "grad_norm": 0.7776886030186891, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 24310 + }, + { + "epoch": 0.24311, + "grad_norm": 0.8318199707466516, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 24311 + }, + { + "epoch": 0.24312, + "grad_norm": 0.8426983867386284, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 24312 + }, + { + "epoch": 0.24313, + "grad_norm": 0.852138243236149, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 24313 + }, + { + "epoch": 0.24314, + "grad_norm": 1.1302089332431762, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24314 + }, + { + "epoch": 0.24315, + "grad_norm": 1.0320332850477054, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 24315 + }, + { + "epoch": 0.24316, + "grad_norm": 0.916896084894644, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 24316 + }, + { + "epoch": 0.24317, + "grad_norm": 0.8335526627679575, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 24317 + }, + { + "epoch": 0.24318, + "grad_norm": 0.9103889407566678, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 24318 + }, + { + "epoch": 0.24319, + "grad_norm": 1.100555373003163, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 24319 + }, + { + "epoch": 0.2432, + "grad_norm": 0.7989921546119533, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 24320 + }, + { + "epoch": 0.24321, + "grad_norm": 0.758276426865167, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 24321 + }, + { + "epoch": 0.24322, + "grad_norm": 0.8109402183804613, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 24322 + }, + { + "epoch": 0.24323, + "grad_norm": 0.8753917152344733, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 24323 + }, + { + "epoch": 0.24324, + "grad_norm": 1.0209258137716997, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 24324 + }, + { + "epoch": 0.24325, + "grad_norm": 1.2201533941441849, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24325 + }, + { + "epoch": 0.24326, + "grad_norm": 0.8374792611631589, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 24326 + }, + { + "epoch": 0.24327, + "grad_norm": 0.8356536837201828, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 24327 + }, + { + "epoch": 0.24328, + "grad_norm": 0.8125940116736208, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 24328 + }, + { + "epoch": 0.24329, + "grad_norm": 0.844528381739035, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 24329 + }, + { + "epoch": 0.2433, + "grad_norm": 0.9276686820154229, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 24330 + }, + { + "epoch": 0.24331, + "grad_norm": 1.0918135620640965, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 24331 + }, + { + "epoch": 0.24332, + "grad_norm": 0.9022723421239631, + "learning_rate": 0.003, + "loss": 4.078, + "step": 24332 + }, + { + "epoch": 0.24333, + "grad_norm": 0.8522007131246603, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 24333 + }, + { + "epoch": 0.24334, + "grad_norm": 0.9088536463096427, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24334 + }, + { + "epoch": 0.24335, + "grad_norm": 0.8795732024738103, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 24335 + }, + { + "epoch": 0.24336, + "grad_norm": 0.8419468767859976, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 24336 + }, + { + "epoch": 0.24337, + "grad_norm": 0.847179246601703, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 24337 + }, + { + "epoch": 0.24338, + "grad_norm": 0.816899248272882, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 24338 + }, + { + "epoch": 0.24339, + "grad_norm": 0.7476287007640905, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 24339 + }, + { + "epoch": 0.2434, + "grad_norm": 0.7355001806883271, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 24340 + }, + { + "epoch": 0.24341, + "grad_norm": 0.883264695970655, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 24341 + }, + { + "epoch": 0.24342, + "grad_norm": 1.147953845061727, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 24342 + }, + { + "epoch": 0.24343, + "grad_norm": 0.9312668237821587, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 24343 + }, + { + "epoch": 0.24344, + "grad_norm": 1.0466720575363302, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24344 + }, + { + "epoch": 0.24345, + "grad_norm": 1.1412413053078623, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 24345 + }, + { + "epoch": 0.24346, + "grad_norm": 0.9516014194879931, + "learning_rate": 0.003, + "loss": 4.063, + "step": 24346 + }, + { + "epoch": 0.24347, + "grad_norm": 0.9382388128949345, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 24347 + }, + { + "epoch": 0.24348, + "grad_norm": 0.9244033979742177, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 24348 + }, + { + "epoch": 0.24349, + "grad_norm": 0.8806876867652167, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 24349 + }, + { + "epoch": 0.2435, + "grad_norm": 0.8774994066516855, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 24350 + }, + { + "epoch": 0.24351, + "grad_norm": 0.9669429264958953, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 24351 + }, + { + "epoch": 0.24352, + "grad_norm": 0.9750995866386545, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24352 + }, + { + "epoch": 0.24353, + "grad_norm": 1.0642171007194772, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 24353 + }, + { + "epoch": 0.24354, + "grad_norm": 1.023318935289836, + "learning_rate": 0.003, + "loss": 4.07, + "step": 24354 + }, + { + "epoch": 0.24355, + "grad_norm": 1.037057094905722, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 24355 + }, + { + "epoch": 0.24356, + "grad_norm": 0.8528389113927962, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 24356 + }, + { + "epoch": 0.24357, + "grad_norm": 0.8084312248659596, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24357 + }, + { + "epoch": 0.24358, + "grad_norm": 0.8994360509207241, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 24358 + }, + { + "epoch": 0.24359, + "grad_norm": 0.9506378387000618, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 24359 + }, + { + "epoch": 0.2436, + "grad_norm": 0.8435290336291547, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 24360 + }, + { + "epoch": 0.24361, + "grad_norm": 0.6930848325647802, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 24361 + }, + { + "epoch": 0.24362, + "grad_norm": 0.6855402735593364, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 24362 + }, + { + "epoch": 0.24363, + "grad_norm": 0.6282268791818146, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 24363 + }, + { + "epoch": 0.24364, + "grad_norm": 0.6529410185477602, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 24364 + }, + { + "epoch": 0.24365, + "grad_norm": 0.7812121432098154, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 24365 + }, + { + "epoch": 0.24366, + "grad_norm": 1.1277676898190097, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24366 + }, + { + "epoch": 0.24367, + "grad_norm": 1.1297822136217337, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 24367 + }, + { + "epoch": 0.24368, + "grad_norm": 0.8177340570124606, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 24368 + }, + { + "epoch": 0.24369, + "grad_norm": 0.911387406128397, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 24369 + }, + { + "epoch": 0.2437, + "grad_norm": 0.9365083714800969, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 24370 + }, + { + "epoch": 0.24371, + "grad_norm": 0.858893091702094, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 24371 + }, + { + "epoch": 0.24372, + "grad_norm": 0.9163546378870618, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 24372 + }, + { + "epoch": 0.24373, + "grad_norm": 1.0582870825659023, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 24373 + }, + { + "epoch": 0.24374, + "grad_norm": 0.9334249060481281, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 24374 + }, + { + "epoch": 0.24375, + "grad_norm": 0.9799317083201554, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 24375 + }, + { + "epoch": 0.24376, + "grad_norm": 1.1045988202635622, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24376 + }, + { + "epoch": 0.24377, + "grad_norm": 0.9975518141001947, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 24377 + }, + { + "epoch": 0.24378, + "grad_norm": 0.9658299093965321, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 24378 + }, + { + "epoch": 0.24379, + "grad_norm": 0.8155898042883143, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 24379 + }, + { + "epoch": 0.2438, + "grad_norm": 0.814010990543558, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24380 + }, + { + "epoch": 0.24381, + "grad_norm": 0.8087958503543714, + "learning_rate": 0.003, + "loss": 4.06, + "step": 24381 + }, + { + "epoch": 0.24382, + "grad_norm": 0.8697846829860854, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 24382 + }, + { + "epoch": 0.24383, + "grad_norm": 0.9175689317485503, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 24383 + }, + { + "epoch": 0.24384, + "grad_norm": 1.005163379336692, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 24384 + }, + { + "epoch": 0.24385, + "grad_norm": 1.0397244058946067, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 24385 + }, + { + "epoch": 0.24386, + "grad_norm": 0.8556278638141066, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 24386 + }, + { + "epoch": 0.24387, + "grad_norm": 0.803804951620591, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24387 + }, + { + "epoch": 0.24388, + "grad_norm": 0.8118970192039237, + "learning_rate": 0.003, + "loss": 4.032, + "step": 24388 + }, + { + "epoch": 0.24389, + "grad_norm": 0.9646815950518908, + "learning_rate": 0.003, + "loss": 4.066, + "step": 24389 + }, + { + "epoch": 0.2439, + "grad_norm": 1.0119267231370443, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 24390 + }, + { + "epoch": 0.24391, + "grad_norm": 1.0989985571835281, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 24391 + }, + { + "epoch": 0.24392, + "grad_norm": 0.9792742723407618, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 24392 + }, + { + "epoch": 0.24393, + "grad_norm": 0.888863836050796, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 24393 + }, + { + "epoch": 0.24394, + "grad_norm": 0.7022669694122133, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 24394 + }, + { + "epoch": 0.24395, + "grad_norm": 0.6878590637291429, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 24395 + }, + { + "epoch": 0.24396, + "grad_norm": 0.5908483768920613, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 24396 + }, + { + "epoch": 0.24397, + "grad_norm": 0.5355051428876914, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 24397 + }, + { + "epoch": 0.24398, + "grad_norm": 0.5372096324250133, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 24398 + }, + { + "epoch": 0.24399, + "grad_norm": 0.5651486344449366, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 24399 + }, + { + "epoch": 0.244, + "grad_norm": 0.601368453393794, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 24400 + }, + { + "epoch": 0.24401, + "grad_norm": 0.6220715313771641, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 24401 + }, + { + "epoch": 0.24402, + "grad_norm": 0.6163657436173646, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 24402 + }, + { + "epoch": 0.24403, + "grad_norm": 0.6155634994260076, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24403 + }, + { + "epoch": 0.24404, + "grad_norm": 0.6122338660245654, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 24404 + }, + { + "epoch": 0.24405, + "grad_norm": 0.6586059752067807, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 24405 + }, + { + "epoch": 0.24406, + "grad_norm": 0.8409746987929847, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 24406 + }, + { + "epoch": 0.24407, + "grad_norm": 1.2167870661472961, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 24407 + }, + { + "epoch": 0.24408, + "grad_norm": 1.1624399269643086, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 24408 + }, + { + "epoch": 0.24409, + "grad_norm": 1.0177443994499635, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 24409 + }, + { + "epoch": 0.2441, + "grad_norm": 1.088981240220816, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 24410 + }, + { + "epoch": 0.24411, + "grad_norm": 0.9805322389670114, + "learning_rate": 0.003, + "loss": 4.031, + "step": 24411 + }, + { + "epoch": 0.24412, + "grad_norm": 0.983151090768275, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 24412 + }, + { + "epoch": 0.24413, + "grad_norm": 1.0216249476932648, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 24413 + }, + { + "epoch": 0.24414, + "grad_norm": 0.9387040218719899, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 24414 + }, + { + "epoch": 0.24415, + "grad_norm": 0.9427996239529268, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 24415 + }, + { + "epoch": 0.24416, + "grad_norm": 0.7726171323072034, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 24416 + }, + { + "epoch": 0.24417, + "grad_norm": 0.8761754992263954, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 24417 + }, + { + "epoch": 0.24418, + "grad_norm": 1.0686455921997247, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 24418 + }, + { + "epoch": 0.24419, + "grad_norm": 1.0908022871719525, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 24419 + }, + { + "epoch": 0.2442, + "grad_norm": 0.8541406837957457, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 24420 + }, + { + "epoch": 0.24421, + "grad_norm": 0.885758959494135, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 24421 + }, + { + "epoch": 0.24422, + "grad_norm": 0.9547142044914406, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 24422 + }, + { + "epoch": 0.24423, + "grad_norm": 1.0533696961035683, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 24423 + }, + { + "epoch": 0.24424, + "grad_norm": 0.969538276128009, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 24424 + }, + { + "epoch": 0.24425, + "grad_norm": 1.0818288288985716, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 24425 + }, + { + "epoch": 0.24426, + "grad_norm": 1.034316125393436, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 24426 + }, + { + "epoch": 0.24427, + "grad_norm": 0.9508345299795138, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 24427 + }, + { + "epoch": 0.24428, + "grad_norm": 1.045863445948455, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 24428 + }, + { + "epoch": 0.24429, + "grad_norm": 1.0239659339972875, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 24429 + }, + { + "epoch": 0.2443, + "grad_norm": 1.039770757932298, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 24430 + }, + { + "epoch": 0.24431, + "grad_norm": 1.0297198312633498, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 24431 + }, + { + "epoch": 0.24432, + "grad_norm": 0.9654788859834478, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 24432 + }, + { + "epoch": 0.24433, + "grad_norm": 0.9690528358014315, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24433 + }, + { + "epoch": 0.24434, + "grad_norm": 1.0109256894738612, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 24434 + }, + { + "epoch": 0.24435, + "grad_norm": 1.0010957223393737, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 24435 + }, + { + "epoch": 0.24436, + "grad_norm": 1.054536591309224, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 24436 + }, + { + "epoch": 0.24437, + "grad_norm": 1.0223286449754077, + "learning_rate": 0.003, + "loss": 4.038, + "step": 24437 + }, + { + "epoch": 0.24438, + "grad_norm": 0.9570499914386106, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 24438 + }, + { + "epoch": 0.24439, + "grad_norm": 0.9422571672786546, + "learning_rate": 0.003, + "loss": 4.058, + "step": 24439 + }, + { + "epoch": 0.2444, + "grad_norm": 0.9645464832832905, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 24440 + }, + { + "epoch": 0.24441, + "grad_norm": 1.0420590883156042, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 24441 + }, + { + "epoch": 0.24442, + "grad_norm": 0.852256173245297, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 24442 + }, + { + "epoch": 0.24443, + "grad_norm": 0.7411281220278786, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 24443 + }, + { + "epoch": 0.24444, + "grad_norm": 0.712094237709309, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 24444 + }, + { + "epoch": 0.24445, + "grad_norm": 0.8415610013976501, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 24445 + }, + { + "epoch": 0.24446, + "grad_norm": 0.947926946681682, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24446 + }, + { + "epoch": 0.24447, + "grad_norm": 1.0947504770310268, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 24447 + }, + { + "epoch": 0.24448, + "grad_norm": 0.9135832351063355, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 24448 + }, + { + "epoch": 0.24449, + "grad_norm": 0.925166794047048, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 24449 + }, + { + "epoch": 0.2445, + "grad_norm": 1.1143021409451108, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 24450 + }, + { + "epoch": 0.24451, + "grad_norm": 0.9939752701268647, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24451 + }, + { + "epoch": 0.24452, + "grad_norm": 1.0081036507024388, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 24452 + }, + { + "epoch": 0.24453, + "grad_norm": 1.018741683828051, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 24453 + }, + { + "epoch": 0.24454, + "grad_norm": 0.8103139534307585, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 24454 + }, + { + "epoch": 0.24455, + "grad_norm": 0.7462387815193459, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 24455 + }, + { + "epoch": 0.24456, + "grad_norm": 0.7286375087075916, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 24456 + }, + { + "epoch": 0.24457, + "grad_norm": 0.7996862448008925, + "learning_rate": 0.003, + "loss": 4.049, + "step": 24457 + }, + { + "epoch": 0.24458, + "grad_norm": 0.7208759097321523, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 24458 + }, + { + "epoch": 0.24459, + "grad_norm": 0.8642368488907551, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 24459 + }, + { + "epoch": 0.2446, + "grad_norm": 1.152726837349672, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 24460 + }, + { + "epoch": 0.24461, + "grad_norm": 0.9634881046961308, + "learning_rate": 0.003, + "loss": 4.058, + "step": 24461 + }, + { + "epoch": 0.24462, + "grad_norm": 0.8070163171672409, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 24462 + }, + { + "epoch": 0.24463, + "grad_norm": 0.7314973056339306, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 24463 + }, + { + "epoch": 0.24464, + "grad_norm": 0.6358579966626574, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 24464 + }, + { + "epoch": 0.24465, + "grad_norm": 0.6533617287048956, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 24465 + }, + { + "epoch": 0.24466, + "grad_norm": 0.6458387295110005, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 24466 + }, + { + "epoch": 0.24467, + "grad_norm": 0.7789525457207829, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 24467 + }, + { + "epoch": 0.24468, + "grad_norm": 0.7741433302318864, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 24468 + }, + { + "epoch": 0.24469, + "grad_norm": 0.7069997049876953, + "learning_rate": 0.003, + "loss": 4.089, + "step": 24469 + }, + { + "epoch": 0.2447, + "grad_norm": 0.6601098571565199, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 24470 + }, + { + "epoch": 0.24471, + "grad_norm": 0.6860472221961996, + "learning_rate": 0.003, + "loss": 4.028, + "step": 24471 + }, + { + "epoch": 0.24472, + "grad_norm": 0.7542101040776972, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24472 + }, + { + "epoch": 0.24473, + "grad_norm": 0.9479543221470939, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24473 + }, + { + "epoch": 0.24474, + "grad_norm": 1.3122349840516991, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 24474 + }, + { + "epoch": 0.24475, + "grad_norm": 0.7782334959312933, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 24475 + }, + { + "epoch": 0.24476, + "grad_norm": 0.6485967236211962, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 24476 + }, + { + "epoch": 0.24477, + "grad_norm": 0.6695866426233174, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 24477 + }, + { + "epoch": 0.24478, + "grad_norm": 0.7679246093453307, + "learning_rate": 0.003, + "loss": 4.048, + "step": 24478 + }, + { + "epoch": 0.24479, + "grad_norm": 0.7823657632506431, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 24479 + }, + { + "epoch": 0.2448, + "grad_norm": 0.8248241381331618, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 24480 + }, + { + "epoch": 0.24481, + "grad_norm": 0.991312044256379, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 24481 + }, + { + "epoch": 0.24482, + "grad_norm": 1.209441981026284, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 24482 + }, + { + "epoch": 0.24483, + "grad_norm": 0.8359220232153316, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 24483 + }, + { + "epoch": 0.24484, + "grad_norm": 0.7673875730449259, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24484 + }, + { + "epoch": 0.24485, + "grad_norm": 0.8984091683424169, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 24485 + }, + { + "epoch": 0.24486, + "grad_norm": 0.9267010252873435, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 24486 + }, + { + "epoch": 0.24487, + "grad_norm": 0.9265271550963003, + "learning_rate": 0.003, + "loss": 4.043, + "step": 24487 + }, + { + "epoch": 0.24488, + "grad_norm": 0.9454430342517407, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 24488 + }, + { + "epoch": 0.24489, + "grad_norm": 1.0299287751967152, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24489 + }, + { + "epoch": 0.2449, + "grad_norm": 1.2684214157343432, + "learning_rate": 0.003, + "loss": 4.078, + "step": 24490 + }, + { + "epoch": 0.24491, + "grad_norm": 1.0357306082180378, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 24491 + }, + { + "epoch": 0.24492, + "grad_norm": 1.0256960624927896, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24492 + }, + { + "epoch": 0.24493, + "grad_norm": 1.0261104633484393, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 24493 + }, + { + "epoch": 0.24494, + "grad_norm": 0.9969551753139565, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 24494 + }, + { + "epoch": 0.24495, + "grad_norm": 0.9159010962327077, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 24495 + }, + { + "epoch": 0.24496, + "grad_norm": 1.0386224763459773, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 24496 + }, + { + "epoch": 0.24497, + "grad_norm": 1.112029523923787, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24497 + }, + { + "epoch": 0.24498, + "grad_norm": 1.0846915951808902, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 24498 + }, + { + "epoch": 0.24499, + "grad_norm": 0.8904941839709802, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 24499 + }, + { + "epoch": 0.245, + "grad_norm": 0.8983980505009838, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 24500 + }, + { + "epoch": 0.24501, + "grad_norm": 0.8990993133307748, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 24501 + }, + { + "epoch": 0.24502, + "grad_norm": 0.9616035618335347, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 24502 + }, + { + "epoch": 0.24503, + "grad_norm": 1.0473821737012714, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 24503 + }, + { + "epoch": 0.24504, + "grad_norm": 1.0715748104037908, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 24504 + }, + { + "epoch": 0.24505, + "grad_norm": 0.9173272758342427, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 24505 + }, + { + "epoch": 0.24506, + "grad_norm": 0.7635617413598949, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 24506 + }, + { + "epoch": 0.24507, + "grad_norm": 0.7045735912434508, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 24507 + }, + { + "epoch": 0.24508, + "grad_norm": 0.6656034642751955, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 24508 + }, + { + "epoch": 0.24509, + "grad_norm": 0.7339683669241369, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 24509 + }, + { + "epoch": 0.2451, + "grad_norm": 0.8787924541871007, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 24510 + }, + { + "epoch": 0.24511, + "grad_norm": 1.0741159663223918, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24511 + }, + { + "epoch": 0.24512, + "grad_norm": 0.9404947670189978, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24512 + }, + { + "epoch": 0.24513, + "grad_norm": 0.806775192995787, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 24513 + }, + { + "epoch": 0.24514, + "grad_norm": 0.8281536041647255, + "learning_rate": 0.003, + "loss": 4.0888, + "step": 24514 + }, + { + "epoch": 0.24515, + "grad_norm": 0.9222970560580745, + "learning_rate": 0.003, + "loss": 4.053, + "step": 24515 + }, + { + "epoch": 0.24516, + "grad_norm": 1.0001790133935085, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 24516 + }, + { + "epoch": 0.24517, + "grad_norm": 1.0525706735221212, + "learning_rate": 0.003, + "loss": 4.047, + "step": 24517 + }, + { + "epoch": 0.24518, + "grad_norm": 1.0518489591809843, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 24518 + }, + { + "epoch": 0.24519, + "grad_norm": 0.9027628384600236, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24519 + }, + { + "epoch": 0.2452, + "grad_norm": 0.7341292918381099, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24520 + }, + { + "epoch": 0.24521, + "grad_norm": 0.7505505315678521, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 24521 + }, + { + "epoch": 0.24522, + "grad_norm": 0.7147919366249711, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 24522 + }, + { + "epoch": 0.24523, + "grad_norm": 0.7618300206719442, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 24523 + }, + { + "epoch": 0.24524, + "grad_norm": 0.843399850314302, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 24524 + }, + { + "epoch": 0.24525, + "grad_norm": 0.8962612866561526, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 24525 + }, + { + "epoch": 0.24526, + "grad_norm": 0.9710199731710112, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 24526 + }, + { + "epoch": 0.24527, + "grad_norm": 1.0536729698385872, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 24527 + }, + { + "epoch": 0.24528, + "grad_norm": 1.1065920884567686, + "learning_rate": 0.003, + "loss": 4.05, + "step": 24528 + }, + { + "epoch": 0.24529, + "grad_norm": 0.7931915600997107, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 24529 + }, + { + "epoch": 0.2453, + "grad_norm": 0.6642813299313023, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 24530 + }, + { + "epoch": 0.24531, + "grad_norm": 0.5928295581730393, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 24531 + }, + { + "epoch": 0.24532, + "grad_norm": 0.6105854159000165, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 24532 + }, + { + "epoch": 0.24533, + "grad_norm": 0.6929444411840987, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 24533 + }, + { + "epoch": 0.24534, + "grad_norm": 0.8307734920885856, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24534 + }, + { + "epoch": 0.24535, + "grad_norm": 0.8812980873115231, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 24535 + }, + { + "epoch": 0.24536, + "grad_norm": 0.8540772901309754, + "learning_rate": 0.003, + "loss": 4.044, + "step": 24536 + }, + { + "epoch": 0.24537, + "grad_norm": 0.8721503281246444, + "learning_rate": 0.003, + "loss": 4.043, + "step": 24537 + }, + { + "epoch": 0.24538, + "grad_norm": 0.8826222742602462, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 24538 + }, + { + "epoch": 0.24539, + "grad_norm": 1.027541132939409, + "learning_rate": 0.003, + "loss": 4.046, + "step": 24539 + }, + { + "epoch": 0.2454, + "grad_norm": 0.9479684540952781, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 24540 + }, + { + "epoch": 0.24541, + "grad_norm": 0.9098121022120571, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 24541 + }, + { + "epoch": 0.24542, + "grad_norm": 0.799271098334082, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 24542 + }, + { + "epoch": 0.24543, + "grad_norm": 0.7974412236664123, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 24543 + }, + { + "epoch": 0.24544, + "grad_norm": 0.8633252007284155, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 24544 + }, + { + "epoch": 0.24545, + "grad_norm": 1.0450055269426481, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 24545 + }, + { + "epoch": 0.24546, + "grad_norm": 1.3504095530058007, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 24546 + }, + { + "epoch": 0.24547, + "grad_norm": 0.7201918766578288, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24547 + }, + { + "epoch": 0.24548, + "grad_norm": 0.6665553792881939, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 24548 + }, + { + "epoch": 0.24549, + "grad_norm": 0.649404244455346, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 24549 + }, + { + "epoch": 0.2455, + "grad_norm": 0.711918767363004, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 24550 + }, + { + "epoch": 0.24551, + "grad_norm": 0.8861142713760095, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 24551 + }, + { + "epoch": 0.24552, + "grad_norm": 1.1463952628985503, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 24552 + }, + { + "epoch": 0.24553, + "grad_norm": 0.9979914529419902, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 24553 + }, + { + "epoch": 0.24554, + "grad_norm": 1.0355392003761392, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 24554 + }, + { + "epoch": 0.24555, + "grad_norm": 0.7490410049192778, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 24555 + }, + { + "epoch": 0.24556, + "grad_norm": 0.7940086360621044, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 24556 + }, + { + "epoch": 0.24557, + "grad_norm": 0.8355854056301416, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 24557 + }, + { + "epoch": 0.24558, + "grad_norm": 0.9660214099487647, + "learning_rate": 0.003, + "loss": 4.075, + "step": 24558 + }, + { + "epoch": 0.24559, + "grad_norm": 1.161799568779313, + "learning_rate": 0.003, + "loss": 4.064, + "step": 24559 + }, + { + "epoch": 0.2456, + "grad_norm": 0.9141415634201788, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 24560 + }, + { + "epoch": 0.24561, + "grad_norm": 0.9220859100779807, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 24561 + }, + { + "epoch": 0.24562, + "grad_norm": 0.8563433617162411, + "learning_rate": 0.003, + "loss": 4.043, + "step": 24562 + }, + { + "epoch": 0.24563, + "grad_norm": 0.8384508116151052, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 24563 + }, + { + "epoch": 0.24564, + "grad_norm": 0.7440743867370847, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 24564 + }, + { + "epoch": 0.24565, + "grad_norm": 0.6166764581732535, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 24565 + }, + { + "epoch": 0.24566, + "grad_norm": 0.622542271787728, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 24566 + }, + { + "epoch": 0.24567, + "grad_norm": 0.740494332514769, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 24567 + }, + { + "epoch": 0.24568, + "grad_norm": 0.8810675961657772, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 24568 + }, + { + "epoch": 0.24569, + "grad_norm": 1.123409218405106, + "learning_rate": 0.003, + "loss": 4.023, + "step": 24569 + }, + { + "epoch": 0.2457, + "grad_norm": 0.9942011367916302, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 24570 + }, + { + "epoch": 0.24571, + "grad_norm": 0.9873217859039962, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 24571 + }, + { + "epoch": 0.24572, + "grad_norm": 0.9443072355720511, + "learning_rate": 0.003, + "loss": 4.05, + "step": 24572 + }, + { + "epoch": 0.24573, + "grad_norm": 0.7949518462028978, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 24573 + }, + { + "epoch": 0.24574, + "grad_norm": 0.7871394044340357, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 24574 + }, + { + "epoch": 0.24575, + "grad_norm": 0.7171278673981191, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24575 + }, + { + "epoch": 0.24576, + "grad_norm": 0.683884836392166, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 24576 + }, + { + "epoch": 0.24577, + "grad_norm": 0.7374527050245074, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 24577 + }, + { + "epoch": 0.24578, + "grad_norm": 0.7169688555678069, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 24578 + }, + { + "epoch": 0.24579, + "grad_norm": 0.919359313311798, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 24579 + }, + { + "epoch": 0.2458, + "grad_norm": 1.0816742554865615, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 24580 + }, + { + "epoch": 0.24581, + "grad_norm": 0.9876970990361058, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 24581 + }, + { + "epoch": 0.24582, + "grad_norm": 0.9040966199309631, + "learning_rate": 0.003, + "loss": 4.0929, + "step": 24582 + }, + { + "epoch": 0.24583, + "grad_norm": 0.7528270493757065, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 24583 + }, + { + "epoch": 0.24584, + "grad_norm": 0.644371703879505, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 24584 + }, + { + "epoch": 0.24585, + "grad_norm": 0.6791416962956901, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 24585 + }, + { + "epoch": 0.24586, + "grad_norm": 0.7969920922288772, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 24586 + }, + { + "epoch": 0.24587, + "grad_norm": 0.8669266099281332, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 24587 + }, + { + "epoch": 0.24588, + "grad_norm": 0.9344526329921855, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 24588 + }, + { + "epoch": 0.24589, + "grad_norm": 0.9238748636611341, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 24589 + }, + { + "epoch": 0.2459, + "grad_norm": 0.8562565134627064, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24590 + }, + { + "epoch": 0.24591, + "grad_norm": 0.9157681943604944, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 24591 + }, + { + "epoch": 0.24592, + "grad_norm": 0.8768472479259104, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 24592 + }, + { + "epoch": 0.24593, + "grad_norm": 0.9366894146324585, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24593 + }, + { + "epoch": 0.24594, + "grad_norm": 1.029622894257853, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 24594 + }, + { + "epoch": 0.24595, + "grad_norm": 1.023180264617836, + "learning_rate": 0.003, + "loss": 4.051, + "step": 24595 + }, + { + "epoch": 0.24596, + "grad_norm": 1.041952609550889, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 24596 + }, + { + "epoch": 0.24597, + "grad_norm": 1.223953981909932, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 24597 + }, + { + "epoch": 0.24598, + "grad_norm": 1.024048157605506, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 24598 + }, + { + "epoch": 0.24599, + "grad_norm": 1.039935132693132, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 24599 + }, + { + "epoch": 0.246, + "grad_norm": 1.0069069146554877, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 24600 + }, + { + "epoch": 0.24601, + "grad_norm": 1.0035211573348073, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 24601 + }, + { + "epoch": 0.24602, + "grad_norm": 1.1169823236415817, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 24602 + }, + { + "epoch": 0.24603, + "grad_norm": 0.891371798503237, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 24603 + }, + { + "epoch": 0.24604, + "grad_norm": 0.9676870444959385, + "learning_rate": 0.003, + "loss": 4.054, + "step": 24604 + }, + { + "epoch": 0.24605, + "grad_norm": 1.0489158142775563, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 24605 + }, + { + "epoch": 0.24606, + "grad_norm": 0.8795753911851342, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 24606 + }, + { + "epoch": 0.24607, + "grad_norm": 0.9008059453842032, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 24607 + }, + { + "epoch": 0.24608, + "grad_norm": 0.8948294619900213, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 24608 + }, + { + "epoch": 0.24609, + "grad_norm": 0.8669148434389932, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24609 + }, + { + "epoch": 0.2461, + "grad_norm": 0.9476235126134983, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 24610 + }, + { + "epoch": 0.24611, + "grad_norm": 1.1555889591484974, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 24611 + }, + { + "epoch": 0.24612, + "grad_norm": 0.9734972167554912, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 24612 + }, + { + "epoch": 0.24613, + "grad_norm": 1.0066209721027795, + "learning_rate": 0.003, + "loss": 4.069, + "step": 24613 + }, + { + "epoch": 0.24614, + "grad_norm": 1.1991349465218428, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 24614 + }, + { + "epoch": 0.24615, + "grad_norm": 0.893054301207456, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 24615 + }, + { + "epoch": 0.24616, + "grad_norm": 0.7525212218009527, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 24616 + }, + { + "epoch": 0.24617, + "grad_norm": 0.6814761964706885, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 24617 + }, + { + "epoch": 0.24618, + "grad_norm": 0.6158485789533341, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 24618 + }, + { + "epoch": 0.24619, + "grad_norm": 0.599654424169733, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 24619 + }, + { + "epoch": 0.2462, + "grad_norm": 0.605089332202948, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 24620 + }, + { + "epoch": 0.24621, + "grad_norm": 0.6333693455845567, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 24621 + }, + { + "epoch": 0.24622, + "grad_norm": 0.6522380367977607, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 24622 + }, + { + "epoch": 0.24623, + "grad_norm": 0.6404569613178623, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 24623 + }, + { + "epoch": 0.24624, + "grad_norm": 0.7127617209939574, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 24624 + }, + { + "epoch": 0.24625, + "grad_norm": 0.7150750049984763, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 24625 + }, + { + "epoch": 0.24626, + "grad_norm": 0.7948715753286921, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 24626 + }, + { + "epoch": 0.24627, + "grad_norm": 0.8329950199868997, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 24627 + }, + { + "epoch": 0.24628, + "grad_norm": 1.0090902242277051, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 24628 + }, + { + "epoch": 0.24629, + "grad_norm": 1.3046360215069766, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 24629 + }, + { + "epoch": 0.2463, + "grad_norm": 0.6238796384952265, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 24630 + }, + { + "epoch": 0.24631, + "grad_norm": 0.8098156655686518, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 24631 + }, + { + "epoch": 0.24632, + "grad_norm": 1.0827874234534993, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 24632 + }, + { + "epoch": 0.24633, + "grad_norm": 1.1313240245989384, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 24633 + }, + { + "epoch": 0.24634, + "grad_norm": 0.9234789050291327, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 24634 + }, + { + "epoch": 0.24635, + "grad_norm": 0.7579391562768962, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 24635 + }, + { + "epoch": 0.24636, + "grad_norm": 0.6840384425367269, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 24636 + }, + { + "epoch": 0.24637, + "grad_norm": 0.666970778594563, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 24637 + }, + { + "epoch": 0.24638, + "grad_norm": 0.5875872466756134, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 24638 + }, + { + "epoch": 0.24639, + "grad_norm": 0.6030339869890238, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 24639 + }, + { + "epoch": 0.2464, + "grad_norm": 0.8130442360060062, + "learning_rate": 0.003, + "loss": 4.024, + "step": 24640 + }, + { + "epoch": 0.24641, + "grad_norm": 0.866987205018245, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24641 + }, + { + "epoch": 0.24642, + "grad_norm": 0.8730514946172345, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 24642 + }, + { + "epoch": 0.24643, + "grad_norm": 0.9495781595679572, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 24643 + }, + { + "epoch": 0.24644, + "grad_norm": 1.0683735204189886, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 24644 + }, + { + "epoch": 0.24645, + "grad_norm": 1.0181861336800289, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 24645 + }, + { + "epoch": 0.24646, + "grad_norm": 0.9076283002489532, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 24646 + }, + { + "epoch": 0.24647, + "grad_norm": 0.9023066045004511, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24647 + }, + { + "epoch": 0.24648, + "grad_norm": 0.9332708068104332, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 24648 + }, + { + "epoch": 0.24649, + "grad_norm": 1.2216648557084486, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 24649 + }, + { + "epoch": 0.2465, + "grad_norm": 0.8617397414962509, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 24650 + }, + { + "epoch": 0.24651, + "grad_norm": 0.7704009370317823, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 24651 + }, + { + "epoch": 0.24652, + "grad_norm": 0.7621235717071047, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 24652 + }, + { + "epoch": 0.24653, + "grad_norm": 0.6911446747333821, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 24653 + }, + { + "epoch": 0.24654, + "grad_norm": 0.6757519167272384, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 24654 + }, + { + "epoch": 0.24655, + "grad_norm": 0.7230822977944228, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 24655 + }, + { + "epoch": 0.24656, + "grad_norm": 0.8204899761186022, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 24656 + }, + { + "epoch": 0.24657, + "grad_norm": 0.8417548021534176, + "learning_rate": 0.003, + "loss": 4.034, + "step": 24657 + }, + { + "epoch": 0.24658, + "grad_norm": 0.6914000170242425, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 24658 + }, + { + "epoch": 0.24659, + "grad_norm": 0.7541513842650349, + "learning_rate": 0.003, + "loss": 4.039, + "step": 24659 + }, + { + "epoch": 0.2466, + "grad_norm": 0.9119403770842963, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 24660 + }, + { + "epoch": 0.24661, + "grad_norm": 1.0059259801339178, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 24661 + }, + { + "epoch": 0.24662, + "grad_norm": 1.144988246335087, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 24662 + }, + { + "epoch": 0.24663, + "grad_norm": 1.0126773677529235, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 24663 + }, + { + "epoch": 0.24664, + "grad_norm": 1.23661429732905, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 24664 + }, + { + "epoch": 0.24665, + "grad_norm": 0.8853960918435931, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 24665 + }, + { + "epoch": 0.24666, + "grad_norm": 0.8211520498673379, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 24666 + }, + { + "epoch": 0.24667, + "grad_norm": 0.8818568812447078, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 24667 + }, + { + "epoch": 0.24668, + "grad_norm": 1.029482335682524, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24668 + }, + { + "epoch": 0.24669, + "grad_norm": 1.2413643237833103, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 24669 + }, + { + "epoch": 0.2467, + "grad_norm": 0.9163924612219522, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 24670 + }, + { + "epoch": 0.24671, + "grad_norm": 0.8658217052942775, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 24671 + }, + { + "epoch": 0.24672, + "grad_norm": 0.9735428662871606, + "learning_rate": 0.003, + "loss": 4.0964, + "step": 24672 + }, + { + "epoch": 0.24673, + "grad_norm": 1.0538013419192105, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24673 + }, + { + "epoch": 0.24674, + "grad_norm": 0.9005057573009775, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 24674 + }, + { + "epoch": 0.24675, + "grad_norm": 0.9036759118640024, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 24675 + }, + { + "epoch": 0.24676, + "grad_norm": 0.9341753418882119, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 24676 + }, + { + "epoch": 0.24677, + "grad_norm": 0.9530372226742692, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 24677 + }, + { + "epoch": 0.24678, + "grad_norm": 1.0492651950375116, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 24678 + }, + { + "epoch": 0.24679, + "grad_norm": 0.9808535249134382, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24679 + }, + { + "epoch": 0.2468, + "grad_norm": 0.9043372436810647, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 24680 + }, + { + "epoch": 0.24681, + "grad_norm": 0.8333167760827939, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 24681 + }, + { + "epoch": 0.24682, + "grad_norm": 0.7305892410341606, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 24682 + }, + { + "epoch": 0.24683, + "grad_norm": 0.8182417796276454, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24683 + }, + { + "epoch": 0.24684, + "grad_norm": 0.8962203238233069, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 24684 + }, + { + "epoch": 0.24685, + "grad_norm": 1.022006486814936, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 24685 + }, + { + "epoch": 0.24686, + "grad_norm": 0.8235361509699674, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24686 + }, + { + "epoch": 0.24687, + "grad_norm": 0.8322128349856641, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 24687 + }, + { + "epoch": 0.24688, + "grad_norm": 0.947593731940025, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 24688 + }, + { + "epoch": 0.24689, + "grad_norm": 1.0744293197372679, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 24689 + }, + { + "epoch": 0.2469, + "grad_norm": 1.0719831977834464, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 24690 + }, + { + "epoch": 0.24691, + "grad_norm": 1.009321321922841, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 24691 + }, + { + "epoch": 0.24692, + "grad_norm": 1.0086298575503758, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 24692 + }, + { + "epoch": 0.24693, + "grad_norm": 0.8484700218957829, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24693 + }, + { + "epoch": 0.24694, + "grad_norm": 0.7011939122472073, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 24694 + }, + { + "epoch": 0.24695, + "grad_norm": 0.6973174542007287, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 24695 + }, + { + "epoch": 0.24696, + "grad_norm": 0.6460009765212417, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 24696 + }, + { + "epoch": 0.24697, + "grad_norm": 0.5822610636416075, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 24697 + }, + { + "epoch": 0.24698, + "grad_norm": 0.6295102264070797, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 24698 + }, + { + "epoch": 0.24699, + "grad_norm": 0.8215623491772215, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 24699 + }, + { + "epoch": 0.247, + "grad_norm": 1.1065741248396577, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 24700 + }, + { + "epoch": 0.24701, + "grad_norm": 1.0704589969820588, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 24701 + }, + { + "epoch": 0.24702, + "grad_norm": 0.96409858853508, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 24702 + }, + { + "epoch": 0.24703, + "grad_norm": 0.995248603870337, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 24703 + }, + { + "epoch": 0.24704, + "grad_norm": 1.0169583464904324, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 24704 + }, + { + "epoch": 0.24705, + "grad_norm": 0.92935327420754, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 24705 + }, + { + "epoch": 0.24706, + "grad_norm": 0.8762743078953602, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 24706 + }, + { + "epoch": 0.24707, + "grad_norm": 0.888311788992918, + "learning_rate": 0.003, + "loss": 4.0769, + "step": 24707 + }, + { + "epoch": 0.24708, + "grad_norm": 0.9920518993548603, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 24708 + }, + { + "epoch": 0.24709, + "grad_norm": 1.0064818585982882, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24709 + }, + { + "epoch": 0.2471, + "grad_norm": 0.9764175218133553, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24710 + }, + { + "epoch": 0.24711, + "grad_norm": 1.030980800256014, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24711 + }, + { + "epoch": 0.24712, + "grad_norm": 0.9028728185396866, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 24712 + }, + { + "epoch": 0.24713, + "grad_norm": 0.9469948178538513, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 24713 + }, + { + "epoch": 0.24714, + "grad_norm": 0.9417309166030591, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 24714 + }, + { + "epoch": 0.24715, + "grad_norm": 0.9080061121964882, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 24715 + }, + { + "epoch": 0.24716, + "grad_norm": 1.1144677976787054, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 24716 + }, + { + "epoch": 0.24717, + "grad_norm": 1.0757300325956651, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 24717 + }, + { + "epoch": 0.24718, + "grad_norm": 1.0115350298444612, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 24718 + }, + { + "epoch": 0.24719, + "grad_norm": 1.0042228857292683, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 24719 + }, + { + "epoch": 0.2472, + "grad_norm": 0.8617575876170146, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 24720 + }, + { + "epoch": 0.24721, + "grad_norm": 0.6371764931692104, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 24721 + }, + { + "epoch": 0.24722, + "grad_norm": 0.7827463665470048, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 24722 + }, + { + "epoch": 0.24723, + "grad_norm": 0.7877463881592692, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 24723 + }, + { + "epoch": 0.24724, + "grad_norm": 0.6977610093299308, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 24724 + }, + { + "epoch": 0.24725, + "grad_norm": 0.658963242077991, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 24725 + }, + { + "epoch": 0.24726, + "grad_norm": 0.7264138519782811, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 24726 + }, + { + "epoch": 0.24727, + "grad_norm": 0.7566545850125642, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 24727 + }, + { + "epoch": 0.24728, + "grad_norm": 0.6479300315513504, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 24728 + }, + { + "epoch": 0.24729, + "grad_norm": 0.6615274006260478, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 24729 + }, + { + "epoch": 0.2473, + "grad_norm": 0.6398172779962713, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 24730 + }, + { + "epoch": 0.24731, + "grad_norm": 0.6799161993424543, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 24731 + }, + { + "epoch": 0.24732, + "grad_norm": 0.9864726102491923, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 24732 + }, + { + "epoch": 0.24733, + "grad_norm": 1.4206291095977883, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 24733 + }, + { + "epoch": 0.24734, + "grad_norm": 0.5077966152350878, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 24734 + }, + { + "epoch": 0.24735, + "grad_norm": 0.84335173414029, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 24735 + }, + { + "epoch": 0.24736, + "grad_norm": 1.0037136452923288, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 24736 + }, + { + "epoch": 0.24737, + "grad_norm": 1.0332639285660383, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 24737 + }, + { + "epoch": 0.24738, + "grad_norm": 0.9890618706340799, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 24738 + }, + { + "epoch": 0.24739, + "grad_norm": 0.7160421752294072, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24739 + }, + { + "epoch": 0.2474, + "grad_norm": 0.6945554613381376, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 24740 + }, + { + "epoch": 0.24741, + "grad_norm": 0.7168899707682601, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 24741 + }, + { + "epoch": 0.24742, + "grad_norm": 0.6894617129340092, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 24742 + }, + { + "epoch": 0.24743, + "grad_norm": 0.6167657059677037, + "learning_rate": 0.003, + "loss": 4.028, + "step": 24743 + }, + { + "epoch": 0.24744, + "grad_norm": 0.6147783105928425, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 24744 + }, + { + "epoch": 0.24745, + "grad_norm": 0.5732554180329266, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 24745 + }, + { + "epoch": 0.24746, + "grad_norm": 0.5866926273942326, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24746 + }, + { + "epoch": 0.24747, + "grad_norm": 0.657803947560524, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 24747 + }, + { + "epoch": 0.24748, + "grad_norm": 0.7258347385420751, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 24748 + }, + { + "epoch": 0.24749, + "grad_norm": 0.8354675692037408, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24749 + }, + { + "epoch": 0.2475, + "grad_norm": 1.050702796588416, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 24750 + }, + { + "epoch": 0.24751, + "grad_norm": 1.2221512772746002, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 24751 + }, + { + "epoch": 0.24752, + "grad_norm": 0.8878833253257229, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 24752 + }, + { + "epoch": 0.24753, + "grad_norm": 0.891905601398202, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 24753 + }, + { + "epoch": 0.24754, + "grad_norm": 0.8954660906896276, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 24754 + }, + { + "epoch": 0.24755, + "grad_norm": 0.8081400097737341, + "learning_rate": 0.003, + "loss": 4.003, + "step": 24755 + }, + { + "epoch": 0.24756, + "grad_norm": 0.7745240128448215, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24756 + }, + { + "epoch": 0.24757, + "grad_norm": 0.8066309571613186, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 24757 + }, + { + "epoch": 0.24758, + "grad_norm": 0.8822188390264811, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 24758 + }, + { + "epoch": 0.24759, + "grad_norm": 1.066529063953866, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 24759 + }, + { + "epoch": 0.2476, + "grad_norm": 1.0892137216793414, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 24760 + }, + { + "epoch": 0.24761, + "grad_norm": 0.9048657406616295, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 24761 + }, + { + "epoch": 0.24762, + "grad_norm": 0.9287335087150327, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 24762 + }, + { + "epoch": 0.24763, + "grad_norm": 0.967006744373872, + "learning_rate": 0.003, + "loss": 4.075, + "step": 24763 + }, + { + "epoch": 0.24764, + "grad_norm": 0.9574266884551479, + "learning_rate": 0.003, + "loss": 4.076, + "step": 24764 + }, + { + "epoch": 0.24765, + "grad_norm": 0.9334459746376527, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 24765 + }, + { + "epoch": 0.24766, + "grad_norm": 0.9955984347985325, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 24766 + }, + { + "epoch": 0.24767, + "grad_norm": 1.3725984115284247, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 24767 + }, + { + "epoch": 0.24768, + "grad_norm": 0.8897627354191866, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 24768 + }, + { + "epoch": 0.24769, + "grad_norm": 0.8873010424819779, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 24769 + }, + { + "epoch": 0.2477, + "grad_norm": 0.9025217673476289, + "learning_rate": 0.003, + "loss": 4.073, + "step": 24770 + }, + { + "epoch": 0.24771, + "grad_norm": 0.9828900103483713, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 24771 + }, + { + "epoch": 0.24772, + "grad_norm": 1.1461005440890513, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 24772 + }, + { + "epoch": 0.24773, + "grad_norm": 0.8271918024365653, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 24773 + }, + { + "epoch": 0.24774, + "grad_norm": 0.8317913632171102, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24774 + }, + { + "epoch": 0.24775, + "grad_norm": 0.9515172200959463, + "learning_rate": 0.003, + "loss": 4.041, + "step": 24775 + }, + { + "epoch": 0.24776, + "grad_norm": 1.0600256384223756, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 24776 + }, + { + "epoch": 0.24777, + "grad_norm": 0.9347496944658488, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 24777 + }, + { + "epoch": 0.24778, + "grad_norm": 0.9360542863318813, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24778 + }, + { + "epoch": 0.24779, + "grad_norm": 0.8722014137978014, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 24779 + }, + { + "epoch": 0.2478, + "grad_norm": 0.9267173825630988, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 24780 + }, + { + "epoch": 0.24781, + "grad_norm": 1.0443956534226608, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 24781 + }, + { + "epoch": 0.24782, + "grad_norm": 1.0717018137122372, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 24782 + }, + { + "epoch": 0.24783, + "grad_norm": 0.9413876208645124, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 24783 + }, + { + "epoch": 0.24784, + "grad_norm": 0.9214263540762326, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24784 + }, + { + "epoch": 0.24785, + "grad_norm": 0.9538147781170828, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24785 + }, + { + "epoch": 0.24786, + "grad_norm": 1.0071186243550303, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 24786 + }, + { + "epoch": 0.24787, + "grad_norm": 0.9744871374873619, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 24787 + }, + { + "epoch": 0.24788, + "grad_norm": 0.783671659155178, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24788 + }, + { + "epoch": 0.24789, + "grad_norm": 0.7978079683295433, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24789 + }, + { + "epoch": 0.2479, + "grad_norm": 0.7212399446965075, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 24790 + }, + { + "epoch": 0.24791, + "grad_norm": 0.8788970269379971, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 24791 + }, + { + "epoch": 0.24792, + "grad_norm": 1.1371002890939632, + "learning_rate": 0.003, + "loss": 4.047, + "step": 24792 + }, + { + "epoch": 0.24793, + "grad_norm": 0.93647985161739, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 24793 + }, + { + "epoch": 0.24794, + "grad_norm": 0.8738315150522239, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24794 + }, + { + "epoch": 0.24795, + "grad_norm": 0.8016735992864765, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 24795 + }, + { + "epoch": 0.24796, + "grad_norm": 0.7688918003058197, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 24796 + }, + { + "epoch": 0.24797, + "grad_norm": 0.7537986058755989, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 24797 + }, + { + "epoch": 0.24798, + "grad_norm": 0.6696285599736962, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 24798 + }, + { + "epoch": 0.24799, + "grad_norm": 0.6615559594680294, + "learning_rate": 0.003, + "loss": 4.037, + "step": 24799 + }, + { + "epoch": 0.248, + "grad_norm": 0.6497014484506389, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 24800 + }, + { + "epoch": 0.24801, + "grad_norm": 0.5817485664092115, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 24801 + }, + { + "epoch": 0.24802, + "grad_norm": 0.6355465096765639, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 24802 + }, + { + "epoch": 0.24803, + "grad_norm": 0.73560123039149, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 24803 + }, + { + "epoch": 0.24804, + "grad_norm": 0.8284279396712518, + "learning_rate": 0.003, + "loss": 4.069, + "step": 24804 + }, + { + "epoch": 0.24805, + "grad_norm": 1.0745509499017416, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 24805 + }, + { + "epoch": 0.24806, + "grad_norm": 1.1999133801130832, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 24806 + }, + { + "epoch": 0.24807, + "grad_norm": 0.6550126467021705, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 24807 + }, + { + "epoch": 0.24808, + "grad_norm": 0.5559945393777279, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 24808 + }, + { + "epoch": 0.24809, + "grad_norm": 0.6468984513478265, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24809 + }, + { + "epoch": 0.2481, + "grad_norm": 0.6239437965266489, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 24810 + }, + { + "epoch": 0.24811, + "grad_norm": 0.6365183078204011, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 24811 + }, + { + "epoch": 0.24812, + "grad_norm": 0.8547635354364749, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24812 + }, + { + "epoch": 0.24813, + "grad_norm": 1.0382798659209185, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 24813 + }, + { + "epoch": 0.24814, + "grad_norm": 0.9577500950739808, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24814 + }, + { + "epoch": 0.24815, + "grad_norm": 0.9778281621053052, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 24815 + }, + { + "epoch": 0.24816, + "grad_norm": 1.203667926338086, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24816 + }, + { + "epoch": 0.24817, + "grad_norm": 1.0414492207553028, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 24817 + }, + { + "epoch": 0.24818, + "grad_norm": 0.9908089619771676, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 24818 + }, + { + "epoch": 0.24819, + "grad_norm": 0.9557612358848938, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 24819 + }, + { + "epoch": 0.2482, + "grad_norm": 0.9632243454343027, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 24820 + }, + { + "epoch": 0.24821, + "grad_norm": 0.9270575289010755, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 24821 + }, + { + "epoch": 0.24822, + "grad_norm": 1.1387627243666694, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 24822 + }, + { + "epoch": 0.24823, + "grad_norm": 1.0276329289393864, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 24823 + }, + { + "epoch": 0.24824, + "grad_norm": 0.9504317077327361, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 24824 + }, + { + "epoch": 0.24825, + "grad_norm": 0.983997143480041, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 24825 + }, + { + "epoch": 0.24826, + "grad_norm": 1.2518179359624375, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 24826 + }, + { + "epoch": 0.24827, + "grad_norm": 0.8497038585910796, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 24827 + }, + { + "epoch": 0.24828, + "grad_norm": 0.6919839942540301, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 24828 + }, + { + "epoch": 0.24829, + "grad_norm": 0.6125319858387864, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 24829 + }, + { + "epoch": 0.2483, + "grad_norm": 0.5794198150584007, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 24830 + }, + { + "epoch": 0.24831, + "grad_norm": 0.5728331087813103, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 24831 + }, + { + "epoch": 0.24832, + "grad_norm": 0.6231918215982206, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 24832 + }, + { + "epoch": 0.24833, + "grad_norm": 0.6255952264934582, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 24833 + }, + { + "epoch": 0.24834, + "grad_norm": 0.6504108732818489, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 24834 + }, + { + "epoch": 0.24835, + "grad_norm": 0.7072904200647722, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 24835 + }, + { + "epoch": 0.24836, + "grad_norm": 0.7735055408247175, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 24836 + }, + { + "epoch": 0.24837, + "grad_norm": 0.8877538511342092, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 24837 + }, + { + "epoch": 0.24838, + "grad_norm": 0.9839276414632111, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 24838 + }, + { + "epoch": 0.24839, + "grad_norm": 0.9879033699394432, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 24839 + }, + { + "epoch": 0.2484, + "grad_norm": 1.1456824476675491, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 24840 + }, + { + "epoch": 0.24841, + "grad_norm": 0.7949750677248135, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 24841 + }, + { + "epoch": 0.24842, + "grad_norm": 0.6789789405730079, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 24842 + }, + { + "epoch": 0.24843, + "grad_norm": 0.6868752078579549, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 24843 + }, + { + "epoch": 0.24844, + "grad_norm": 0.7227304920843479, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 24844 + }, + { + "epoch": 0.24845, + "grad_norm": 0.8076739281818901, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24845 + }, + { + "epoch": 0.24846, + "grad_norm": 0.9065040681895562, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 24846 + }, + { + "epoch": 0.24847, + "grad_norm": 1.0696085890558287, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24847 + }, + { + "epoch": 0.24848, + "grad_norm": 1.2031992950138768, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 24848 + }, + { + "epoch": 0.24849, + "grad_norm": 1.0716571661164402, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24849 + }, + { + "epoch": 0.2485, + "grad_norm": 1.0035113937137332, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 24850 + }, + { + "epoch": 0.24851, + "grad_norm": 1.0420199852033427, + "learning_rate": 0.003, + "loss": 4.045, + "step": 24851 + }, + { + "epoch": 0.24852, + "grad_norm": 1.0191196344070392, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 24852 + }, + { + "epoch": 0.24853, + "grad_norm": 0.9571857445906612, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 24853 + }, + { + "epoch": 0.24854, + "grad_norm": 0.9325267228249841, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 24854 + }, + { + "epoch": 0.24855, + "grad_norm": 1.1446378646908675, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 24855 + }, + { + "epoch": 0.24856, + "grad_norm": 0.9968365980358628, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 24856 + }, + { + "epoch": 0.24857, + "grad_norm": 1.0245139920884605, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 24857 + }, + { + "epoch": 0.24858, + "grad_norm": 1.023267065835406, + "learning_rate": 0.003, + "loss": 4.058, + "step": 24858 + }, + { + "epoch": 0.24859, + "grad_norm": 0.9163418620445073, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 24859 + }, + { + "epoch": 0.2486, + "grad_norm": 0.8688354713394478, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 24860 + }, + { + "epoch": 0.24861, + "grad_norm": 0.7638953468694308, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 24861 + }, + { + "epoch": 0.24862, + "grad_norm": 0.775892936801226, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 24862 + }, + { + "epoch": 0.24863, + "grad_norm": 0.9414704002418842, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 24863 + }, + { + "epoch": 0.24864, + "grad_norm": 1.1069737548040102, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 24864 + }, + { + "epoch": 0.24865, + "grad_norm": 0.9627236081944984, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 24865 + }, + { + "epoch": 0.24866, + "grad_norm": 1.1142266001833596, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 24866 + }, + { + "epoch": 0.24867, + "grad_norm": 0.877953098865027, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 24867 + }, + { + "epoch": 0.24868, + "grad_norm": 0.7929952625450133, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 24868 + }, + { + "epoch": 0.24869, + "grad_norm": 0.8337173579590729, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 24869 + }, + { + "epoch": 0.2487, + "grad_norm": 0.9784999540984296, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 24870 + }, + { + "epoch": 0.24871, + "grad_norm": 0.9147725816951395, + "learning_rate": 0.003, + "loss": 4.057, + "step": 24871 + }, + { + "epoch": 0.24872, + "grad_norm": 0.9431863496340517, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 24872 + }, + { + "epoch": 0.24873, + "grad_norm": 0.9740491400547014, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 24873 + }, + { + "epoch": 0.24874, + "grad_norm": 0.9598012206746511, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 24874 + }, + { + "epoch": 0.24875, + "grad_norm": 0.9626385728458898, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 24875 + }, + { + "epoch": 0.24876, + "grad_norm": 1.100486765283263, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 24876 + }, + { + "epoch": 0.24877, + "grad_norm": 0.8755189818545953, + "learning_rate": 0.003, + "loss": 4.0917, + "step": 24877 + }, + { + "epoch": 0.24878, + "grad_norm": 0.7676829437375242, + "learning_rate": 0.003, + "loss": 4.031, + "step": 24878 + }, + { + "epoch": 0.24879, + "grad_norm": 0.686228247796003, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 24879 + }, + { + "epoch": 0.2488, + "grad_norm": 0.6967292236037987, + "learning_rate": 0.003, + "loss": 4.038, + "step": 24880 + }, + { + "epoch": 0.24881, + "grad_norm": 0.8175813489031121, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 24881 + }, + { + "epoch": 0.24882, + "grad_norm": 0.886370468618304, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 24882 + }, + { + "epoch": 0.24883, + "grad_norm": 0.9965065779238221, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 24883 + }, + { + "epoch": 0.24884, + "grad_norm": 1.0001017372960763, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 24884 + }, + { + "epoch": 0.24885, + "grad_norm": 0.8079299682451483, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 24885 + }, + { + "epoch": 0.24886, + "grad_norm": 0.7920299518496763, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 24886 + }, + { + "epoch": 0.24887, + "grad_norm": 0.8279145887675554, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 24887 + }, + { + "epoch": 0.24888, + "grad_norm": 0.7469662010775341, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 24888 + }, + { + "epoch": 0.24889, + "grad_norm": 0.7367691468173052, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 24889 + }, + { + "epoch": 0.2489, + "grad_norm": 0.7782965206923067, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 24890 + }, + { + "epoch": 0.24891, + "grad_norm": 0.8188408684244228, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24891 + }, + { + "epoch": 0.24892, + "grad_norm": 0.8440261633842542, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 24892 + }, + { + "epoch": 0.24893, + "grad_norm": 0.9606538215243803, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 24893 + }, + { + "epoch": 0.24894, + "grad_norm": 1.2530527302284533, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 24894 + }, + { + "epoch": 0.24895, + "grad_norm": 0.8797449775830343, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 24895 + }, + { + "epoch": 0.24896, + "grad_norm": 0.9188052800326935, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 24896 + }, + { + "epoch": 0.24897, + "grad_norm": 1.0491535537198742, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 24897 + }, + { + "epoch": 0.24898, + "grad_norm": 0.8943513368138608, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 24898 + }, + { + "epoch": 0.24899, + "grad_norm": 0.851867599968842, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 24899 + }, + { + "epoch": 0.249, + "grad_norm": 0.9601212192643067, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 24900 + }, + { + "epoch": 0.24901, + "grad_norm": 1.179622299882782, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 24901 + }, + { + "epoch": 0.24902, + "grad_norm": 0.8488716073711199, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 24902 + }, + { + "epoch": 0.24903, + "grad_norm": 0.7308817866111754, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 24903 + }, + { + "epoch": 0.24904, + "grad_norm": 0.7244690098561029, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 24904 + }, + { + "epoch": 0.24905, + "grad_norm": 0.6584451941518012, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 24905 + }, + { + "epoch": 0.24906, + "grad_norm": 0.7333743058979599, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 24906 + }, + { + "epoch": 0.24907, + "grad_norm": 0.7966410189561395, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 24907 + }, + { + "epoch": 0.24908, + "grad_norm": 0.9334433371618684, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 24908 + }, + { + "epoch": 0.24909, + "grad_norm": 1.0749466387909368, + "learning_rate": 0.003, + "loss": 4.061, + "step": 24909 + }, + { + "epoch": 0.2491, + "grad_norm": 0.9487453627887742, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 24910 + }, + { + "epoch": 0.24911, + "grad_norm": 1.0007955506965904, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 24911 + }, + { + "epoch": 0.24912, + "grad_norm": 1.0039564686318823, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 24912 + }, + { + "epoch": 0.24913, + "grad_norm": 1.0003759211214167, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 24913 + }, + { + "epoch": 0.24914, + "grad_norm": 0.9306396598207398, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 24914 + }, + { + "epoch": 0.24915, + "grad_norm": 0.8012488769211433, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 24915 + }, + { + "epoch": 0.24916, + "grad_norm": 0.8033458625520037, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 24916 + }, + { + "epoch": 0.24917, + "grad_norm": 0.801851353295159, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 24917 + }, + { + "epoch": 0.24918, + "grad_norm": 0.8531648412002176, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 24918 + }, + { + "epoch": 0.24919, + "grad_norm": 0.8115686677098268, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 24919 + }, + { + "epoch": 0.2492, + "grad_norm": 0.7656263463347808, + "learning_rate": 0.003, + "loss": 4.045, + "step": 24920 + }, + { + "epoch": 0.24921, + "grad_norm": 0.8188487924561175, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 24921 + }, + { + "epoch": 0.24922, + "grad_norm": 0.8795499923126378, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 24922 + }, + { + "epoch": 0.24923, + "grad_norm": 0.7395555329643584, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 24923 + }, + { + "epoch": 0.24924, + "grad_norm": 0.7288140675284586, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 24924 + }, + { + "epoch": 0.24925, + "grad_norm": 0.8203781303967768, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 24925 + }, + { + "epoch": 0.24926, + "grad_norm": 0.813997783854369, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 24926 + }, + { + "epoch": 0.24927, + "grad_norm": 0.8009930195595387, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 24927 + }, + { + "epoch": 0.24928, + "grad_norm": 0.8451900865691463, + "learning_rate": 0.003, + "loss": 4.063, + "step": 24928 + }, + { + "epoch": 0.24929, + "grad_norm": 0.9571485694909434, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 24929 + }, + { + "epoch": 0.2493, + "grad_norm": 1.1632393280145823, + "learning_rate": 0.003, + "loss": 4.037, + "step": 24930 + }, + { + "epoch": 0.24931, + "grad_norm": 1.0313534830884388, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 24931 + }, + { + "epoch": 0.24932, + "grad_norm": 1.1209655208497427, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 24932 + }, + { + "epoch": 0.24933, + "grad_norm": 1.084133027216186, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 24933 + }, + { + "epoch": 0.24934, + "grad_norm": 0.932391633903948, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 24934 + }, + { + "epoch": 0.24935, + "grad_norm": 0.8286007566347191, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 24935 + }, + { + "epoch": 0.24936, + "grad_norm": 0.7784070356969961, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24936 + }, + { + "epoch": 0.24937, + "grad_norm": 0.7777675795002211, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 24937 + }, + { + "epoch": 0.24938, + "grad_norm": 0.8766752872675143, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24938 + }, + { + "epoch": 0.24939, + "grad_norm": 0.8273996662328338, + "learning_rate": 0.003, + "loss": 4.059, + "step": 24939 + }, + { + "epoch": 0.2494, + "grad_norm": 0.9403034246012082, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 24940 + }, + { + "epoch": 0.24941, + "grad_norm": 1.1423149896106994, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 24941 + }, + { + "epoch": 0.24942, + "grad_norm": 1.035884751440043, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 24942 + }, + { + "epoch": 0.24943, + "grad_norm": 1.2471782884521638, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 24943 + }, + { + "epoch": 0.24944, + "grad_norm": 0.9759278201259436, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 24944 + }, + { + "epoch": 0.24945, + "grad_norm": 1.0042091292036346, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 24945 + }, + { + "epoch": 0.24946, + "grad_norm": 0.9481287782686568, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 24946 + }, + { + "epoch": 0.24947, + "grad_norm": 0.9582763274073257, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 24947 + }, + { + "epoch": 0.24948, + "grad_norm": 0.9119848983728647, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 24948 + }, + { + "epoch": 0.24949, + "grad_norm": 0.8319973091960529, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 24949 + }, + { + "epoch": 0.2495, + "grad_norm": 0.843193795345759, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 24950 + }, + { + "epoch": 0.24951, + "grad_norm": 0.8504347772802173, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 24951 + }, + { + "epoch": 0.24952, + "grad_norm": 0.9108638648891836, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 24952 + }, + { + "epoch": 0.24953, + "grad_norm": 0.8601271859555615, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 24953 + }, + { + "epoch": 0.24954, + "grad_norm": 0.9081774866326066, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 24954 + }, + { + "epoch": 0.24955, + "grad_norm": 0.9733720661267132, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 24955 + }, + { + "epoch": 0.24956, + "grad_norm": 0.92237190093115, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 24956 + }, + { + "epoch": 0.24957, + "grad_norm": 0.81505986901497, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24957 + }, + { + "epoch": 0.24958, + "grad_norm": 0.774415666361524, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 24958 + }, + { + "epoch": 0.24959, + "grad_norm": 0.6938756128041287, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 24959 + }, + { + "epoch": 0.2496, + "grad_norm": 0.7412381256067151, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 24960 + }, + { + "epoch": 0.24961, + "grad_norm": 0.7520596050887787, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 24961 + }, + { + "epoch": 0.24962, + "grad_norm": 0.854951810178195, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 24962 + }, + { + "epoch": 0.24963, + "grad_norm": 1.0002367487907915, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 24963 + }, + { + "epoch": 0.24964, + "grad_norm": 1.1207272549093727, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 24964 + }, + { + "epoch": 0.24965, + "grad_norm": 0.729142390671697, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 24965 + }, + { + "epoch": 0.24966, + "grad_norm": 0.6233655498523967, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 24966 + }, + { + "epoch": 0.24967, + "grad_norm": 0.6350926012944166, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 24967 + }, + { + "epoch": 0.24968, + "grad_norm": 0.764886544532569, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 24968 + }, + { + "epoch": 0.24969, + "grad_norm": 0.8855675547198433, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 24969 + }, + { + "epoch": 0.2497, + "grad_norm": 0.9471713777068682, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 24970 + }, + { + "epoch": 0.24971, + "grad_norm": 0.880132568392059, + "learning_rate": 0.003, + "loss": 4.064, + "step": 24971 + }, + { + "epoch": 0.24972, + "grad_norm": 0.9267782852662799, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 24972 + }, + { + "epoch": 0.24973, + "grad_norm": 0.8460522414116444, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 24973 + }, + { + "epoch": 0.24974, + "grad_norm": 0.867101449897478, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 24974 + }, + { + "epoch": 0.24975, + "grad_norm": 0.9236297007084834, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 24975 + }, + { + "epoch": 0.24976, + "grad_norm": 0.9534470208657182, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 24976 + }, + { + "epoch": 0.24977, + "grad_norm": 0.9579467687298114, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 24977 + }, + { + "epoch": 0.24978, + "grad_norm": 1.0074596869477315, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 24978 + }, + { + "epoch": 0.24979, + "grad_norm": 1.1933870957283397, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 24979 + }, + { + "epoch": 0.2498, + "grad_norm": 0.9262414555758269, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 24980 + }, + { + "epoch": 0.24981, + "grad_norm": 1.0407926728673673, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 24981 + }, + { + "epoch": 0.24982, + "grad_norm": 1.0573723151059415, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 24982 + }, + { + "epoch": 0.24983, + "grad_norm": 0.998923768936595, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 24983 + }, + { + "epoch": 0.24984, + "grad_norm": 0.8679313654728963, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 24984 + }, + { + "epoch": 0.24985, + "grad_norm": 0.8384879860214401, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 24985 + }, + { + "epoch": 0.24986, + "grad_norm": 0.9633055182038077, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 24986 + }, + { + "epoch": 0.24987, + "grad_norm": 1.070799731316395, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 24987 + }, + { + "epoch": 0.24988, + "grad_norm": 0.9470749894070539, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 24988 + }, + { + "epoch": 0.24989, + "grad_norm": 0.9558436825349234, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 24989 + }, + { + "epoch": 0.2499, + "grad_norm": 0.9846096362443845, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 24990 + }, + { + "epoch": 0.24991, + "grad_norm": 1.053019455540664, + "learning_rate": 0.003, + "loss": 4.062, + "step": 24991 + }, + { + "epoch": 0.24992, + "grad_norm": 0.9438830834941232, + "learning_rate": 0.003, + "loss": 4.0908, + "step": 24992 + }, + { + "epoch": 0.24993, + "grad_norm": 0.8934186776989499, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 24993 + }, + { + "epoch": 0.24994, + "grad_norm": 0.9014277737825089, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 24994 + }, + { + "epoch": 0.24995, + "grad_norm": 0.8329123557886955, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 24995 + }, + { + "epoch": 0.24996, + "grad_norm": 0.8251514479363162, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 24996 + }, + { + "epoch": 0.24997, + "grad_norm": 0.7977269692120545, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 24997 + }, + { + "epoch": 0.24998, + "grad_norm": 0.6571353906713401, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 24998 + }, + { + "epoch": 0.24999, + "grad_norm": 0.7322523637202981, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 24999 + }, + { + "epoch": 0.25, + "grad_norm": 0.8412326268340425, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 25000 + }, + { + "epoch": 0.25001, + "grad_norm": 1.037524276671091, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 25001 + }, + { + "epoch": 0.25002, + "grad_norm": 1.0190557401502651, + "learning_rate": 0.003, + "loss": 4.035, + "step": 25002 + }, + { + "epoch": 0.25003, + "grad_norm": 1.083487939041908, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 25003 + }, + { + "epoch": 0.25004, + "grad_norm": 1.001186293158713, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 25004 + }, + { + "epoch": 0.25005, + "grad_norm": 0.6619402560882597, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 25005 + }, + { + "epoch": 0.25006, + "grad_norm": 0.7721198796401331, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25006 + }, + { + "epoch": 0.25007, + "grad_norm": 0.9574069605386972, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 25007 + }, + { + "epoch": 0.25008, + "grad_norm": 1.2899009248707805, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 25008 + }, + { + "epoch": 0.25009, + "grad_norm": 1.2224013035021652, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 25009 + }, + { + "epoch": 0.2501, + "grad_norm": 1.0757192290835778, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 25010 + }, + { + "epoch": 0.25011, + "grad_norm": 0.9376665664113467, + "learning_rate": 0.003, + "loss": 4.0914, + "step": 25011 + }, + { + "epoch": 0.25012, + "grad_norm": 0.8812841673479098, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 25012 + }, + { + "epoch": 0.25013, + "grad_norm": 0.9729970063704426, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 25013 + }, + { + "epoch": 0.25014, + "grad_norm": 1.072993005848804, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 25014 + }, + { + "epoch": 0.25015, + "grad_norm": 1.4911524103483254, + "learning_rate": 0.003, + "loss": 4.0984, + "step": 25015 + }, + { + "epoch": 0.25016, + "grad_norm": 1.4401482924523783, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 25016 + }, + { + "epoch": 0.25017, + "grad_norm": 1.3798354667013213, + "learning_rate": 0.003, + "loss": 4.1174, + "step": 25017 + }, + { + "epoch": 0.25018, + "grad_norm": 0.9626967197458514, + "learning_rate": 0.003, + "loss": 4.093, + "step": 25018 + }, + { + "epoch": 0.25019, + "grad_norm": 1.058305573669329, + "learning_rate": 0.003, + "loss": 4.1113, + "step": 25019 + }, + { + "epoch": 0.2502, + "grad_norm": 1.043219965259639, + "learning_rate": 0.003, + "loss": 4.0961, + "step": 25020 + }, + { + "epoch": 0.25021, + "grad_norm": 1.1496339136709215, + "learning_rate": 0.003, + "loss": 4.0999, + "step": 25021 + }, + { + "epoch": 0.25022, + "grad_norm": 1.2164552872614778, + "learning_rate": 0.003, + "loss": 4.0967, + "step": 25022 + }, + { + "epoch": 0.25023, + "grad_norm": 1.6243003789370114, + "learning_rate": 0.003, + "loss": 4.1022, + "step": 25023 + }, + { + "epoch": 0.25024, + "grad_norm": 1.124096707417229, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 25024 + }, + { + "epoch": 0.25025, + "grad_norm": 1.0419774078006838, + "learning_rate": 0.003, + "loss": 4.1052, + "step": 25025 + }, + { + "epoch": 0.25026, + "grad_norm": 0.9711366351677849, + "learning_rate": 0.003, + "loss": 4.1024, + "step": 25026 + }, + { + "epoch": 0.25027, + "grad_norm": 0.97675025403763, + "learning_rate": 0.003, + "loss": 4.0932, + "step": 25027 + }, + { + "epoch": 0.25028, + "grad_norm": 1.1109070886817412, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 25028 + }, + { + "epoch": 0.25029, + "grad_norm": 1.2172574584394815, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 25029 + }, + { + "epoch": 0.2503, + "grad_norm": 1.247965871887491, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 25030 + }, + { + "epoch": 0.25031, + "grad_norm": 0.7776165108217428, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 25031 + }, + { + "epoch": 0.25032, + "grad_norm": 0.8306721903717568, + "learning_rate": 0.003, + "loss": 4.084, + "step": 25032 + }, + { + "epoch": 0.25033, + "grad_norm": 0.9670834995339764, + "learning_rate": 0.003, + "loss": 4.0979, + "step": 25033 + }, + { + "epoch": 0.25034, + "grad_norm": 1.0347753903428625, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 25034 + }, + { + "epoch": 0.25035, + "grad_norm": 0.9160684364908147, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 25035 + }, + { + "epoch": 0.25036, + "grad_norm": 0.8458236427949395, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 25036 + }, + { + "epoch": 0.25037, + "grad_norm": 1.1818064024537793, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 25037 + }, + { + "epoch": 0.25038, + "grad_norm": 1.1883236506527866, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 25038 + }, + { + "epoch": 0.25039, + "grad_norm": 1.2020523876367668, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 25039 + }, + { + "epoch": 0.2504, + "grad_norm": 0.8837722727470024, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 25040 + }, + { + "epoch": 0.25041, + "grad_norm": 0.8522048980155027, + "learning_rate": 0.003, + "loss": 4.091, + "step": 25041 + }, + { + "epoch": 0.25042, + "grad_norm": 0.9399330735825984, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 25042 + }, + { + "epoch": 0.25043, + "grad_norm": 1.0378991269960471, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 25043 + }, + { + "epoch": 0.25044, + "grad_norm": 0.929956005496625, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 25044 + }, + { + "epoch": 0.25045, + "grad_norm": 1.020457057304478, + "learning_rate": 0.003, + "loss": 4.055, + "step": 25045 + }, + { + "epoch": 0.25046, + "grad_norm": 1.140047710480014, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 25046 + }, + { + "epoch": 0.25047, + "grad_norm": 0.7902108998865288, + "learning_rate": 0.003, + "loss": 4.08, + "step": 25047 + }, + { + "epoch": 0.25048, + "grad_norm": 0.8971655814094962, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 25048 + }, + { + "epoch": 0.25049, + "grad_norm": 0.8922678756905887, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 25049 + }, + { + "epoch": 0.2505, + "grad_norm": 1.1519497631746742, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 25050 + }, + { + "epoch": 0.25051, + "grad_norm": 1.1195376673999218, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 25051 + }, + { + "epoch": 0.25052, + "grad_norm": 0.97888067076068, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 25052 + }, + { + "epoch": 0.25053, + "grad_norm": 0.8241755508775127, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 25053 + }, + { + "epoch": 0.25054, + "grad_norm": 0.6713725274741734, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 25054 + }, + { + "epoch": 0.25055, + "grad_norm": 0.6150041292698852, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 25055 + }, + { + "epoch": 0.25056, + "grad_norm": 0.6315935068627877, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 25056 + }, + { + "epoch": 0.25057, + "grad_norm": 0.7602159538029633, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 25057 + }, + { + "epoch": 0.25058, + "grad_norm": 0.916700254370856, + "learning_rate": 0.003, + "loss": 4.06, + "step": 25058 + }, + { + "epoch": 0.25059, + "grad_norm": 1.058707832673356, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 25059 + }, + { + "epoch": 0.2506, + "grad_norm": 1.0954966816091278, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 25060 + }, + { + "epoch": 0.25061, + "grad_norm": 0.9691939733883124, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 25061 + }, + { + "epoch": 0.25062, + "grad_norm": 0.8092999766323778, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 25062 + }, + { + "epoch": 0.25063, + "grad_norm": 0.8246555639580762, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 25063 + }, + { + "epoch": 0.25064, + "grad_norm": 0.7628830041794898, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 25064 + }, + { + "epoch": 0.25065, + "grad_norm": 0.7272746422352339, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 25065 + }, + { + "epoch": 0.25066, + "grad_norm": 0.6345537117820802, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25066 + }, + { + "epoch": 0.25067, + "grad_norm": 0.4675959715614357, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 25067 + }, + { + "epoch": 0.25068, + "grad_norm": 0.4863201850031912, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 25068 + }, + { + "epoch": 0.25069, + "grad_norm": 0.4617046091041555, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 25069 + }, + { + "epoch": 0.2507, + "grad_norm": 0.43825143815971773, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 25070 + }, + { + "epoch": 0.25071, + "grad_norm": 0.525905116417564, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 25071 + }, + { + "epoch": 0.25072, + "grad_norm": 0.668319837458617, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25072 + }, + { + "epoch": 0.25073, + "grad_norm": 0.8422337250179343, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 25073 + }, + { + "epoch": 0.25074, + "grad_norm": 1.1816687225665465, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 25074 + }, + { + "epoch": 0.25075, + "grad_norm": 1.0020566381361342, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 25075 + }, + { + "epoch": 0.25076, + "grad_norm": 0.986729759270076, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 25076 + }, + { + "epoch": 0.25077, + "grad_norm": 1.00469383067617, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 25077 + }, + { + "epoch": 0.25078, + "grad_norm": 0.8837208467119414, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 25078 + }, + { + "epoch": 0.25079, + "grad_norm": 0.6824917933565879, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 25079 + }, + { + "epoch": 0.2508, + "grad_norm": 0.6198671270040899, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 25080 + }, + { + "epoch": 0.25081, + "grad_norm": 0.5443134229543437, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 25081 + }, + { + "epoch": 0.25082, + "grad_norm": 0.6004036647609302, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 25082 + }, + { + "epoch": 0.25083, + "grad_norm": 0.6508500130264369, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 25083 + }, + { + "epoch": 0.25084, + "grad_norm": 0.9005061338615322, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 25084 + }, + { + "epoch": 0.25085, + "grad_norm": 1.2109685128305554, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 25085 + }, + { + "epoch": 0.25086, + "grad_norm": 0.8615367253459664, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 25086 + }, + { + "epoch": 0.25087, + "grad_norm": 0.61490159543475, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 25087 + }, + { + "epoch": 0.25088, + "grad_norm": 0.5645078039744594, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25088 + }, + { + "epoch": 0.25089, + "grad_norm": 0.6806136886781947, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25089 + }, + { + "epoch": 0.2509, + "grad_norm": 0.8254573353259544, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 25090 + }, + { + "epoch": 0.25091, + "grad_norm": 0.9523313090582275, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 25091 + }, + { + "epoch": 0.25092, + "grad_norm": 1.0607135733791946, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 25092 + }, + { + "epoch": 0.25093, + "grad_norm": 0.9336317380898301, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 25093 + }, + { + "epoch": 0.25094, + "grad_norm": 0.9383989739073828, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 25094 + }, + { + "epoch": 0.25095, + "grad_norm": 0.976214928244242, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25095 + }, + { + "epoch": 0.25096, + "grad_norm": 1.038526541165959, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 25096 + }, + { + "epoch": 0.25097, + "grad_norm": 1.129597483555832, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 25097 + }, + { + "epoch": 0.25098, + "grad_norm": 1.019362127675871, + "learning_rate": 0.003, + "loss": 4.047, + "step": 25098 + }, + { + "epoch": 0.25099, + "grad_norm": 0.8608281225298912, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 25099 + }, + { + "epoch": 0.251, + "grad_norm": 0.8012608490443469, + "learning_rate": 0.003, + "loss": 4.053, + "step": 25100 + }, + { + "epoch": 0.25101, + "grad_norm": 0.6842043070653451, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 25101 + }, + { + "epoch": 0.25102, + "grad_norm": 0.5740110129881074, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 25102 + }, + { + "epoch": 0.25103, + "grad_norm": 0.5147349471610092, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 25103 + }, + { + "epoch": 0.25104, + "grad_norm": 0.4756094369133143, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 25104 + }, + { + "epoch": 0.25105, + "grad_norm": 0.46513665124776143, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 25105 + }, + { + "epoch": 0.25106, + "grad_norm": 0.4329026291624255, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 25106 + }, + { + "epoch": 0.25107, + "grad_norm": 0.4567673634992878, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 25107 + }, + { + "epoch": 0.25108, + "grad_norm": 0.4936691979667193, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 25108 + }, + { + "epoch": 0.25109, + "grad_norm": 0.5902121576068349, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 25109 + }, + { + "epoch": 0.2511, + "grad_norm": 0.8069945609455702, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 25110 + }, + { + "epoch": 0.25111, + "grad_norm": 1.136310175425401, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 25111 + }, + { + "epoch": 0.25112, + "grad_norm": 1.1967387967538097, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 25112 + }, + { + "epoch": 0.25113, + "grad_norm": 0.7619060803032691, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 25113 + }, + { + "epoch": 0.25114, + "grad_norm": 0.8071813860116033, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 25114 + }, + { + "epoch": 0.25115, + "grad_norm": 0.9339821731483683, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 25115 + }, + { + "epoch": 0.25116, + "grad_norm": 0.8957828945875281, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25116 + }, + { + "epoch": 0.25117, + "grad_norm": 0.8617330471773642, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25117 + }, + { + "epoch": 0.25118, + "grad_norm": 0.9180024376747362, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 25118 + }, + { + "epoch": 0.25119, + "grad_norm": 0.8684379685402638, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 25119 + }, + { + "epoch": 0.2512, + "grad_norm": 0.9286283278573202, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 25120 + }, + { + "epoch": 0.25121, + "grad_norm": 0.9401947200061271, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 25121 + }, + { + "epoch": 0.25122, + "grad_norm": 0.8834515685729514, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 25122 + }, + { + "epoch": 0.25123, + "grad_norm": 0.7750276533630232, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25123 + }, + { + "epoch": 0.25124, + "grad_norm": 0.8631769841365771, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 25124 + }, + { + "epoch": 0.25125, + "grad_norm": 1.1906761217962791, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 25125 + }, + { + "epoch": 0.25126, + "grad_norm": 1.0720401482892024, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 25126 + }, + { + "epoch": 0.25127, + "grad_norm": 0.8767545569957969, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 25127 + }, + { + "epoch": 0.25128, + "grad_norm": 0.7863285916794128, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 25128 + }, + { + "epoch": 0.25129, + "grad_norm": 0.7340058131336088, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 25129 + }, + { + "epoch": 0.2513, + "grad_norm": 0.6875929322762984, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 25130 + }, + { + "epoch": 0.25131, + "grad_norm": 0.599577439650636, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 25131 + }, + { + "epoch": 0.25132, + "grad_norm": 0.690679847366617, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 25132 + }, + { + "epoch": 0.25133, + "grad_norm": 0.6529766175097188, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 25133 + }, + { + "epoch": 0.25134, + "grad_norm": 0.5916883964116354, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 25134 + }, + { + "epoch": 0.25135, + "grad_norm": 0.57167109764271, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 25135 + }, + { + "epoch": 0.25136, + "grad_norm": 0.668114027669172, + "learning_rate": 0.003, + "loss": 3.9963, + "step": 25136 + }, + { + "epoch": 0.25137, + "grad_norm": 0.8738805137224449, + "learning_rate": 0.003, + "loss": 4.047, + "step": 25137 + }, + { + "epoch": 0.25138, + "grad_norm": 1.1286847828099993, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 25138 + }, + { + "epoch": 0.25139, + "grad_norm": 1.0587493220005495, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 25139 + }, + { + "epoch": 0.2514, + "grad_norm": 0.9975101547437617, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 25140 + }, + { + "epoch": 0.25141, + "grad_norm": 0.9880287927954604, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 25141 + }, + { + "epoch": 0.25142, + "grad_norm": 0.9229511689269122, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 25142 + }, + { + "epoch": 0.25143, + "grad_norm": 0.8843976777708843, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25143 + }, + { + "epoch": 0.25144, + "grad_norm": 0.962221493395311, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25144 + }, + { + "epoch": 0.25145, + "grad_norm": 0.9698641348802396, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 25145 + }, + { + "epoch": 0.25146, + "grad_norm": 1.0569166111746384, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 25146 + }, + { + "epoch": 0.25147, + "grad_norm": 1.2123511031266783, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 25147 + }, + { + "epoch": 0.25148, + "grad_norm": 0.9203625784837228, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 25148 + }, + { + "epoch": 0.25149, + "grad_norm": 0.8211277267162767, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 25149 + }, + { + "epoch": 0.2515, + "grad_norm": 0.9898899201348818, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 25150 + }, + { + "epoch": 0.25151, + "grad_norm": 1.1705211043169956, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 25151 + }, + { + "epoch": 0.25152, + "grad_norm": 0.9526808882868566, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 25152 + }, + { + "epoch": 0.25153, + "grad_norm": 0.8389608061512351, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 25153 + }, + { + "epoch": 0.25154, + "grad_norm": 0.7533982375758314, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 25154 + }, + { + "epoch": 0.25155, + "grad_norm": 0.8544760858647533, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 25155 + }, + { + "epoch": 0.25156, + "grad_norm": 0.8828070352273748, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 25156 + }, + { + "epoch": 0.25157, + "grad_norm": 0.9604626305653975, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 25157 + }, + { + "epoch": 0.25158, + "grad_norm": 1.048648378825132, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 25158 + }, + { + "epoch": 0.25159, + "grad_norm": 1.0833908511434693, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 25159 + }, + { + "epoch": 0.2516, + "grad_norm": 0.8408758870744419, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25160 + }, + { + "epoch": 0.25161, + "grad_norm": 0.7556336457631874, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 25161 + }, + { + "epoch": 0.25162, + "grad_norm": 0.7293796072226307, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 25162 + }, + { + "epoch": 0.25163, + "grad_norm": 0.709286265984986, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 25163 + }, + { + "epoch": 0.25164, + "grad_norm": 0.6638362614172476, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 25164 + }, + { + "epoch": 0.25165, + "grad_norm": 0.6338064291897981, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 25165 + }, + { + "epoch": 0.25166, + "grad_norm": 0.6816138402393093, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 25166 + }, + { + "epoch": 0.25167, + "grad_norm": 0.795382936425935, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 25167 + }, + { + "epoch": 0.25168, + "grad_norm": 0.9295426899641294, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 25168 + }, + { + "epoch": 0.25169, + "grad_norm": 0.9547852532356386, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 25169 + }, + { + "epoch": 0.2517, + "grad_norm": 0.9005290815211721, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 25170 + }, + { + "epoch": 0.25171, + "grad_norm": 0.8003418463630907, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 25171 + }, + { + "epoch": 0.25172, + "grad_norm": 0.7098177762206411, + "learning_rate": 0.003, + "loss": 4.036, + "step": 25172 + }, + { + "epoch": 0.25173, + "grad_norm": 0.7087210930518817, + "learning_rate": 0.003, + "loss": 3.987, + "step": 25173 + }, + { + "epoch": 0.25174, + "grad_norm": 0.6279601183392163, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 25174 + }, + { + "epoch": 0.25175, + "grad_norm": 0.688130799164182, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 25175 + }, + { + "epoch": 0.25176, + "grad_norm": 0.7765466121227397, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 25176 + }, + { + "epoch": 0.25177, + "grad_norm": 0.940906905031278, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 25177 + }, + { + "epoch": 0.25178, + "grad_norm": 1.075773281279824, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 25178 + }, + { + "epoch": 0.25179, + "grad_norm": 0.9841992918270726, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 25179 + }, + { + "epoch": 0.2518, + "grad_norm": 0.9369489061134327, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 25180 + }, + { + "epoch": 0.25181, + "grad_norm": 0.8248480057966866, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 25181 + }, + { + "epoch": 0.25182, + "grad_norm": 0.9892215434358935, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 25182 + }, + { + "epoch": 0.25183, + "grad_norm": 0.9207774582261831, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 25183 + }, + { + "epoch": 0.25184, + "grad_norm": 0.8886669570125391, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 25184 + }, + { + "epoch": 0.25185, + "grad_norm": 0.7821355678708054, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 25185 + }, + { + "epoch": 0.25186, + "grad_norm": 0.8901906377624766, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 25186 + }, + { + "epoch": 0.25187, + "grad_norm": 0.9900567139605738, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 25187 + }, + { + "epoch": 0.25188, + "grad_norm": 1.1305716629622564, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 25188 + }, + { + "epoch": 0.25189, + "grad_norm": 0.7542767343839198, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 25189 + }, + { + "epoch": 0.2519, + "grad_norm": 0.6027405628789158, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 25190 + }, + { + "epoch": 0.25191, + "grad_norm": 0.7246862571682013, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 25191 + }, + { + "epoch": 0.25192, + "grad_norm": 0.774040854479532, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 25192 + }, + { + "epoch": 0.25193, + "grad_norm": 0.6341528986659734, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 25193 + }, + { + "epoch": 0.25194, + "grad_norm": 0.5694056197889509, + "learning_rate": 0.003, + "loss": 4.01, + "step": 25194 + }, + { + "epoch": 0.25195, + "grad_norm": 0.7739948032594215, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 25195 + }, + { + "epoch": 0.25196, + "grad_norm": 0.9624965736244836, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25196 + }, + { + "epoch": 0.25197, + "grad_norm": 1.070892106503341, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 25197 + }, + { + "epoch": 0.25198, + "grad_norm": 0.9605315431678978, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 25198 + }, + { + "epoch": 0.25199, + "grad_norm": 0.9788966527264475, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 25199 + }, + { + "epoch": 0.252, + "grad_norm": 0.9424917208332682, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25200 + }, + { + "epoch": 0.25201, + "grad_norm": 1.1661054463651646, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 25201 + }, + { + "epoch": 0.25202, + "grad_norm": 1.0190964944092762, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25202 + }, + { + "epoch": 0.25203, + "grad_norm": 1.1096687047797764, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 25203 + }, + { + "epoch": 0.25204, + "grad_norm": 1.0279324933045393, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 25204 + }, + { + "epoch": 0.25205, + "grad_norm": 1.1370091227500148, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 25205 + }, + { + "epoch": 0.25206, + "grad_norm": 0.9181255689912888, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 25206 + }, + { + "epoch": 0.25207, + "grad_norm": 0.7704206828748278, + "learning_rate": 0.003, + "loss": 4.042, + "step": 25207 + }, + { + "epoch": 0.25208, + "grad_norm": 0.745834151731976, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 25208 + }, + { + "epoch": 0.25209, + "grad_norm": 0.6856649792357714, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 25209 + }, + { + "epoch": 0.2521, + "grad_norm": 0.7622430125826458, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 25210 + }, + { + "epoch": 0.25211, + "grad_norm": 0.7986538714644699, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 25211 + }, + { + "epoch": 0.25212, + "grad_norm": 0.8734018047260165, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 25212 + }, + { + "epoch": 0.25213, + "grad_norm": 0.8326016255756234, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 25213 + }, + { + "epoch": 0.25214, + "grad_norm": 0.7443003911106852, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 25214 + }, + { + "epoch": 0.25215, + "grad_norm": 0.7526355470218531, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 25215 + }, + { + "epoch": 0.25216, + "grad_norm": 0.9472294575923542, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 25216 + }, + { + "epoch": 0.25217, + "grad_norm": 1.2402170598118385, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 25217 + }, + { + "epoch": 0.25218, + "grad_norm": 0.9745632980354524, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 25218 + }, + { + "epoch": 0.25219, + "grad_norm": 0.973079921672595, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 25219 + }, + { + "epoch": 0.2522, + "grad_norm": 1.0026004606339565, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25220 + }, + { + "epoch": 0.25221, + "grad_norm": 0.9005587341136152, + "learning_rate": 0.003, + "loss": 4.0934, + "step": 25221 + }, + { + "epoch": 0.25222, + "grad_norm": 0.8087794163926014, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 25222 + }, + { + "epoch": 0.25223, + "grad_norm": 0.7121581125382069, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 25223 + }, + { + "epoch": 0.25224, + "grad_norm": 0.7424726687515298, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 25224 + }, + { + "epoch": 0.25225, + "grad_norm": 0.7476766293586138, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 25225 + }, + { + "epoch": 0.25226, + "grad_norm": 0.7346687257354678, + "learning_rate": 0.003, + "loss": 4.083, + "step": 25226 + }, + { + "epoch": 0.25227, + "grad_norm": 0.6869175734922605, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25227 + }, + { + "epoch": 0.25228, + "grad_norm": 0.7498749359421119, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 25228 + }, + { + "epoch": 0.25229, + "grad_norm": 0.8699068122680356, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 25229 + }, + { + "epoch": 0.2523, + "grad_norm": 0.9198411467379168, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 25230 + }, + { + "epoch": 0.25231, + "grad_norm": 0.9380321912887235, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 25231 + }, + { + "epoch": 0.25232, + "grad_norm": 0.9905981867738339, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 25232 + }, + { + "epoch": 0.25233, + "grad_norm": 0.9430014506728069, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 25233 + }, + { + "epoch": 0.25234, + "grad_norm": 1.0044826588230238, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 25234 + }, + { + "epoch": 0.25235, + "grad_norm": 1.1855414762160477, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 25235 + }, + { + "epoch": 0.25236, + "grad_norm": 0.8237914038283526, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 25236 + }, + { + "epoch": 0.25237, + "grad_norm": 0.7198964430985642, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 25237 + }, + { + "epoch": 0.25238, + "grad_norm": 0.7088122263581368, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 25238 + }, + { + "epoch": 0.25239, + "grad_norm": 0.7691120843572459, + "learning_rate": 0.003, + "loss": 4.035, + "step": 25239 + }, + { + "epoch": 0.2524, + "grad_norm": 0.7349382923158768, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 25240 + }, + { + "epoch": 0.25241, + "grad_norm": 0.7921997771722659, + "learning_rate": 0.003, + "loss": 4.052, + "step": 25241 + }, + { + "epoch": 0.25242, + "grad_norm": 0.9768713384762246, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 25242 + }, + { + "epoch": 0.25243, + "grad_norm": 1.0542123636991634, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25243 + }, + { + "epoch": 0.25244, + "grad_norm": 0.8276986442114519, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 25244 + }, + { + "epoch": 0.25245, + "grad_norm": 0.7357455725766722, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 25245 + }, + { + "epoch": 0.25246, + "grad_norm": 1.0319044598724785, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 25246 + }, + { + "epoch": 0.25247, + "grad_norm": 1.3120279592577566, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 25247 + }, + { + "epoch": 0.25248, + "grad_norm": 0.8286114938976565, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 25248 + }, + { + "epoch": 0.25249, + "grad_norm": 0.7851173587255428, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 25249 + }, + { + "epoch": 0.2525, + "grad_norm": 0.7484835280417966, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 25250 + }, + { + "epoch": 0.25251, + "grad_norm": 0.7262841719914204, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 25251 + }, + { + "epoch": 0.25252, + "grad_norm": 0.7762935232725945, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 25252 + }, + { + "epoch": 0.25253, + "grad_norm": 0.7348960292908512, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 25253 + }, + { + "epoch": 0.25254, + "grad_norm": 0.7490491515241705, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 25254 + }, + { + "epoch": 0.25255, + "grad_norm": 0.8354672467712169, + "learning_rate": 0.003, + "loss": 4.057, + "step": 25255 + }, + { + "epoch": 0.25256, + "grad_norm": 0.926177167624343, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 25256 + }, + { + "epoch": 0.25257, + "grad_norm": 1.0427794923489049, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25257 + }, + { + "epoch": 0.25258, + "grad_norm": 1.0955999190626222, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 25258 + }, + { + "epoch": 0.25259, + "grad_norm": 1.056473983272084, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 25259 + }, + { + "epoch": 0.2526, + "grad_norm": 0.9659000814153147, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 25260 + }, + { + "epoch": 0.25261, + "grad_norm": 0.9590073496711436, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 25261 + }, + { + "epoch": 0.25262, + "grad_norm": 1.1558839791320852, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 25262 + }, + { + "epoch": 0.25263, + "grad_norm": 0.8344369479069749, + "learning_rate": 0.003, + "loss": 4.036, + "step": 25263 + }, + { + "epoch": 0.25264, + "grad_norm": 0.8111044497591182, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 25264 + }, + { + "epoch": 0.25265, + "grad_norm": 0.7271217320257766, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 25265 + }, + { + "epoch": 0.25266, + "grad_norm": 0.6665717487847707, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 25266 + }, + { + "epoch": 0.25267, + "grad_norm": 0.5663656116799429, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 25267 + }, + { + "epoch": 0.25268, + "grad_norm": 0.5214214111924499, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 25268 + }, + { + "epoch": 0.25269, + "grad_norm": 0.6292278456194152, + "learning_rate": 0.003, + "loss": 4.002, + "step": 25269 + }, + { + "epoch": 0.2527, + "grad_norm": 0.7029327778166253, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 25270 + }, + { + "epoch": 0.25271, + "grad_norm": 0.8759019153673309, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 25271 + }, + { + "epoch": 0.25272, + "grad_norm": 0.8955640173246128, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 25272 + }, + { + "epoch": 0.25273, + "grad_norm": 0.7466769901288722, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 25273 + }, + { + "epoch": 0.25274, + "grad_norm": 0.6332287408372114, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 25274 + }, + { + "epoch": 0.25275, + "grad_norm": 0.6729359609891398, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 25275 + }, + { + "epoch": 0.25276, + "grad_norm": 0.736250896404633, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 25276 + }, + { + "epoch": 0.25277, + "grad_norm": 0.7645156763425622, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 25277 + }, + { + "epoch": 0.25278, + "grad_norm": 0.9239753453357612, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 25278 + }, + { + "epoch": 0.25279, + "grad_norm": 1.0369218580532171, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 25279 + }, + { + "epoch": 0.2528, + "grad_norm": 0.960665030868886, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 25280 + }, + { + "epoch": 0.25281, + "grad_norm": 1.1587882851744427, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 25281 + }, + { + "epoch": 0.25282, + "grad_norm": 0.9250410365536718, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 25282 + }, + { + "epoch": 0.25283, + "grad_norm": 0.9849085885811593, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 25283 + }, + { + "epoch": 0.25284, + "grad_norm": 1.0664066574748838, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 25284 + }, + { + "epoch": 0.25285, + "grad_norm": 0.8908381876198105, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 25285 + }, + { + "epoch": 0.25286, + "grad_norm": 1.000445410786719, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 25286 + }, + { + "epoch": 0.25287, + "grad_norm": 1.0783585706918735, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 25287 + }, + { + "epoch": 0.25288, + "grad_norm": 0.9430737984026302, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 25288 + }, + { + "epoch": 0.25289, + "grad_norm": 0.9242728937612926, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 25289 + }, + { + "epoch": 0.2529, + "grad_norm": 0.9618947346540768, + "learning_rate": 0.003, + "loss": 4.041, + "step": 25290 + }, + { + "epoch": 0.25291, + "grad_norm": 0.9284490082835613, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 25291 + }, + { + "epoch": 0.25292, + "grad_norm": 0.8769662225821231, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 25292 + }, + { + "epoch": 0.25293, + "grad_norm": 0.9586373807822492, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 25293 + }, + { + "epoch": 0.25294, + "grad_norm": 1.069876489815011, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 25294 + }, + { + "epoch": 0.25295, + "grad_norm": 0.9192923928513109, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25295 + }, + { + "epoch": 0.25296, + "grad_norm": 1.0978734419387173, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 25296 + }, + { + "epoch": 0.25297, + "grad_norm": 0.976272785047859, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 25297 + }, + { + "epoch": 0.25298, + "grad_norm": 0.8816945945241275, + "learning_rate": 0.003, + "loss": 4.066, + "step": 25298 + }, + { + "epoch": 0.25299, + "grad_norm": 0.7207166363875429, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 25299 + }, + { + "epoch": 0.253, + "grad_norm": 0.7201205185320428, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 25300 + }, + { + "epoch": 0.25301, + "grad_norm": 0.8543453280327397, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 25301 + }, + { + "epoch": 0.25302, + "grad_norm": 1.0716620064262306, + "learning_rate": 0.003, + "loss": 4.046, + "step": 25302 + }, + { + "epoch": 0.25303, + "grad_norm": 1.0881797746825694, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 25303 + }, + { + "epoch": 0.25304, + "grad_norm": 1.016852006375203, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 25304 + }, + { + "epoch": 0.25305, + "grad_norm": 0.9554927945062803, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 25305 + }, + { + "epoch": 0.25306, + "grad_norm": 0.9277551869535792, + "learning_rate": 0.003, + "loss": 4.049, + "step": 25306 + }, + { + "epoch": 0.25307, + "grad_norm": 0.9986351820325645, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 25307 + }, + { + "epoch": 0.25308, + "grad_norm": 1.0005087099042784, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 25308 + }, + { + "epoch": 0.25309, + "grad_norm": 0.9566213582654867, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 25309 + }, + { + "epoch": 0.2531, + "grad_norm": 0.9462389729872062, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 25310 + }, + { + "epoch": 0.25311, + "grad_norm": 0.9786778133935703, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 25311 + }, + { + "epoch": 0.25312, + "grad_norm": 0.9349738469532464, + "learning_rate": 0.003, + "loss": 4.074, + "step": 25312 + }, + { + "epoch": 0.25313, + "grad_norm": 0.9532965691201983, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 25313 + }, + { + "epoch": 0.25314, + "grad_norm": 0.9982918258218613, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25314 + }, + { + "epoch": 0.25315, + "grad_norm": 0.8885490950199534, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 25315 + }, + { + "epoch": 0.25316, + "grad_norm": 0.7721583075658163, + "learning_rate": 0.003, + "loss": 4.023, + "step": 25316 + }, + { + "epoch": 0.25317, + "grad_norm": 0.7743708964220125, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 25317 + }, + { + "epoch": 0.25318, + "grad_norm": 0.7591344200289556, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 25318 + }, + { + "epoch": 0.25319, + "grad_norm": 0.727765352689977, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 25319 + }, + { + "epoch": 0.2532, + "grad_norm": 0.7709962453743868, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25320 + }, + { + "epoch": 0.25321, + "grad_norm": 0.705722334589492, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 25321 + }, + { + "epoch": 0.25322, + "grad_norm": 0.7525885466576987, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 25322 + }, + { + "epoch": 0.25323, + "grad_norm": 0.8126192126234123, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 25323 + }, + { + "epoch": 0.25324, + "grad_norm": 0.8560447124533925, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 25324 + }, + { + "epoch": 0.25325, + "grad_norm": 0.9334315537878937, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 25325 + }, + { + "epoch": 0.25326, + "grad_norm": 1.0088873037776613, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25326 + }, + { + "epoch": 0.25327, + "grad_norm": 1.1520395561855754, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25327 + }, + { + "epoch": 0.25328, + "grad_norm": 0.8636976251836621, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25328 + }, + { + "epoch": 0.25329, + "grad_norm": 0.8233150442572819, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 25329 + }, + { + "epoch": 0.2533, + "grad_norm": 0.740016900916187, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 25330 + }, + { + "epoch": 0.25331, + "grad_norm": 0.6811805208570688, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 25331 + }, + { + "epoch": 0.25332, + "grad_norm": 0.662662156005869, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 25332 + }, + { + "epoch": 0.25333, + "grad_norm": 0.7645817960044332, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 25333 + }, + { + "epoch": 0.25334, + "grad_norm": 1.0647025891409587, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 25334 + }, + { + "epoch": 0.25335, + "grad_norm": 1.1499368124227125, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 25335 + }, + { + "epoch": 0.25336, + "grad_norm": 0.9691997843489087, + "learning_rate": 0.003, + "loss": 4.074, + "step": 25336 + }, + { + "epoch": 0.25337, + "grad_norm": 0.9956269773790227, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 25337 + }, + { + "epoch": 0.25338, + "grad_norm": 1.0395578016260243, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 25338 + }, + { + "epoch": 0.25339, + "grad_norm": 1.1080047341468648, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 25339 + }, + { + "epoch": 0.2534, + "grad_norm": 1.0891566670494628, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 25340 + }, + { + "epoch": 0.25341, + "grad_norm": 0.9992001174644397, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 25341 + }, + { + "epoch": 0.25342, + "grad_norm": 0.9654661123101281, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 25342 + }, + { + "epoch": 0.25343, + "grad_norm": 0.9812674394225781, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25343 + }, + { + "epoch": 0.25344, + "grad_norm": 0.9329278657193633, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 25344 + }, + { + "epoch": 0.25345, + "grad_norm": 0.9958000555101592, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 25345 + }, + { + "epoch": 0.25346, + "grad_norm": 1.0684253808163764, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25346 + }, + { + "epoch": 0.25347, + "grad_norm": 0.9881712765715379, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 25347 + }, + { + "epoch": 0.25348, + "grad_norm": 0.9211958370620181, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 25348 + }, + { + "epoch": 0.25349, + "grad_norm": 0.800051863518404, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 25349 + }, + { + "epoch": 0.2535, + "grad_norm": 0.6863668037561003, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 25350 + }, + { + "epoch": 0.25351, + "grad_norm": 0.6540307341820532, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 25351 + }, + { + "epoch": 0.25352, + "grad_norm": 0.6795148378455266, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 25352 + }, + { + "epoch": 0.25353, + "grad_norm": 0.8136563327333226, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 25353 + }, + { + "epoch": 0.25354, + "grad_norm": 0.997991280001786, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 25354 + }, + { + "epoch": 0.25355, + "grad_norm": 1.1504047054129567, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 25355 + }, + { + "epoch": 0.25356, + "grad_norm": 0.8354597398779309, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 25356 + }, + { + "epoch": 0.25357, + "grad_norm": 0.7497340470905738, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 25357 + }, + { + "epoch": 0.25358, + "grad_norm": 0.7779159811438947, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 25358 + }, + { + "epoch": 0.25359, + "grad_norm": 0.7357366437028393, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 25359 + }, + { + "epoch": 0.2536, + "grad_norm": 0.8257347824140242, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 25360 + }, + { + "epoch": 0.25361, + "grad_norm": 0.8791147133718787, + "learning_rate": 0.003, + "loss": 4.075, + "step": 25361 + }, + { + "epoch": 0.25362, + "grad_norm": 0.9820244568066587, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25362 + }, + { + "epoch": 0.25363, + "grad_norm": 1.008120974601097, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 25363 + }, + { + "epoch": 0.25364, + "grad_norm": 0.9968645577938047, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25364 + }, + { + "epoch": 0.25365, + "grad_norm": 1.3774432360776234, + "learning_rate": 0.003, + "loss": 4.1042, + "step": 25365 + }, + { + "epoch": 0.25366, + "grad_norm": 0.7743160983542406, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 25366 + }, + { + "epoch": 0.25367, + "grad_norm": 0.6521957190836891, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 25367 + }, + { + "epoch": 0.25368, + "grad_norm": 0.6310637605850519, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25368 + }, + { + "epoch": 0.25369, + "grad_norm": 0.6617832700730458, + "learning_rate": 0.003, + "loss": 4.043, + "step": 25369 + }, + { + "epoch": 0.2537, + "grad_norm": 0.7164069115245838, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25370 + }, + { + "epoch": 0.25371, + "grad_norm": 0.7580742664238952, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 25371 + }, + { + "epoch": 0.25372, + "grad_norm": 0.7807874231529279, + "learning_rate": 0.003, + "loss": 4.033, + "step": 25372 + }, + { + "epoch": 0.25373, + "grad_norm": 0.6564404852899143, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 25373 + }, + { + "epoch": 0.25374, + "grad_norm": 0.5935282523274951, + "learning_rate": 0.003, + "loss": 4.028, + "step": 25374 + }, + { + "epoch": 0.25375, + "grad_norm": 0.5296634657853462, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 25375 + }, + { + "epoch": 0.25376, + "grad_norm": 0.4181778099612975, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 25376 + }, + { + "epoch": 0.25377, + "grad_norm": 0.47798468276113126, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 25377 + }, + { + "epoch": 0.25378, + "grad_norm": 0.596500985522097, + "learning_rate": 0.003, + "loss": 3.9937, + "step": 25378 + }, + { + "epoch": 0.25379, + "grad_norm": 0.7719746235945136, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 25379 + }, + { + "epoch": 0.2538, + "grad_norm": 0.9590825149948662, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 25380 + }, + { + "epoch": 0.25381, + "grad_norm": 1.1441179454035353, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 25381 + }, + { + "epoch": 0.25382, + "grad_norm": 0.851145891503752, + "learning_rate": 0.003, + "loss": 4.059, + "step": 25382 + }, + { + "epoch": 0.25383, + "grad_norm": 0.9736854894282182, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 25383 + }, + { + "epoch": 0.25384, + "grad_norm": 1.08488241572489, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 25384 + }, + { + "epoch": 0.25385, + "grad_norm": 1.08421579681445, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 25385 + }, + { + "epoch": 0.25386, + "grad_norm": 0.960931951607441, + "learning_rate": 0.003, + "loss": 4.032, + "step": 25386 + }, + { + "epoch": 0.25387, + "grad_norm": 0.8716795048806129, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25387 + }, + { + "epoch": 0.25388, + "grad_norm": 0.743683455320296, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 25388 + }, + { + "epoch": 0.25389, + "grad_norm": 0.7409421438781537, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 25389 + }, + { + "epoch": 0.2539, + "grad_norm": 0.9583556615793433, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 25390 + }, + { + "epoch": 0.25391, + "grad_norm": 1.2822763993639898, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 25391 + }, + { + "epoch": 0.25392, + "grad_norm": 0.7852785808092915, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 25392 + }, + { + "epoch": 0.25393, + "grad_norm": 0.8054222509597576, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 25393 + }, + { + "epoch": 0.25394, + "grad_norm": 0.8448170134102573, + "learning_rate": 0.003, + "loss": 4.0991, + "step": 25394 + }, + { + "epoch": 0.25395, + "grad_norm": 0.9647944876800983, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 25395 + }, + { + "epoch": 0.25396, + "grad_norm": 1.008748791335382, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 25396 + }, + { + "epoch": 0.25397, + "grad_norm": 1.0277717931923147, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 25397 + }, + { + "epoch": 0.25398, + "grad_norm": 0.9367218610733006, + "learning_rate": 0.003, + "loss": 4.024, + "step": 25398 + }, + { + "epoch": 0.25399, + "grad_norm": 0.8636153444094693, + "learning_rate": 0.003, + "loss": 4.038, + "step": 25399 + }, + { + "epoch": 0.254, + "grad_norm": 0.8795463288207355, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 25400 + }, + { + "epoch": 0.25401, + "grad_norm": 1.038900095744999, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 25401 + }, + { + "epoch": 0.25402, + "grad_norm": 1.153109891819328, + "learning_rate": 0.003, + "loss": 4.04, + "step": 25402 + }, + { + "epoch": 0.25403, + "grad_norm": 0.7834441092563924, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25403 + }, + { + "epoch": 0.25404, + "grad_norm": 0.5804236645021761, + "learning_rate": 0.003, + "loss": 4.054, + "step": 25404 + }, + { + "epoch": 0.25405, + "grad_norm": 0.6443431267252544, + "learning_rate": 0.003, + "loss": 4.064, + "step": 25405 + }, + { + "epoch": 0.25406, + "grad_norm": 0.6917601310589838, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 25406 + }, + { + "epoch": 0.25407, + "grad_norm": 0.7705419030514664, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 25407 + }, + { + "epoch": 0.25408, + "grad_norm": 0.844362485479889, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25408 + }, + { + "epoch": 0.25409, + "grad_norm": 0.8736617083147991, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 25409 + }, + { + "epoch": 0.2541, + "grad_norm": 0.9456108059698425, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 25410 + }, + { + "epoch": 0.25411, + "grad_norm": 1.0031540315671683, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 25411 + }, + { + "epoch": 0.25412, + "grad_norm": 0.9115030835895792, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 25412 + }, + { + "epoch": 0.25413, + "grad_norm": 0.7643016281522143, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 25413 + }, + { + "epoch": 0.25414, + "grad_norm": 0.7753846567339371, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 25414 + }, + { + "epoch": 0.25415, + "grad_norm": 0.816961213431605, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25415 + }, + { + "epoch": 0.25416, + "grad_norm": 0.7984824395942095, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 25416 + }, + { + "epoch": 0.25417, + "grad_norm": 0.9236465661385209, + "learning_rate": 0.003, + "loss": 4.046, + "step": 25417 + }, + { + "epoch": 0.25418, + "grad_norm": 0.9805961052156241, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 25418 + }, + { + "epoch": 0.25419, + "grad_norm": 1.0609700217988367, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 25419 + }, + { + "epoch": 0.2542, + "grad_norm": 1.014883826356174, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25420 + }, + { + "epoch": 0.25421, + "grad_norm": 0.9724732555754642, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 25421 + }, + { + "epoch": 0.25422, + "grad_norm": 1.0059934252737834, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 25422 + }, + { + "epoch": 0.25423, + "grad_norm": 0.998373456982844, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 25423 + }, + { + "epoch": 0.25424, + "grad_norm": 1.0188897861571056, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 25424 + }, + { + "epoch": 0.25425, + "grad_norm": 1.0102722734940304, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 25425 + }, + { + "epoch": 0.25426, + "grad_norm": 0.9847216496965288, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 25426 + }, + { + "epoch": 0.25427, + "grad_norm": 0.8519119893093335, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 25427 + }, + { + "epoch": 0.25428, + "grad_norm": 0.6378448683837147, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 25428 + }, + { + "epoch": 0.25429, + "grad_norm": 0.5954905714975156, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 25429 + }, + { + "epoch": 0.2543, + "grad_norm": 0.6600043923253341, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 25430 + }, + { + "epoch": 0.25431, + "grad_norm": 0.7272253830209318, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25431 + }, + { + "epoch": 0.25432, + "grad_norm": 0.7771316581669774, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 25432 + }, + { + "epoch": 0.25433, + "grad_norm": 0.8608374103288706, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 25433 + }, + { + "epoch": 0.25434, + "grad_norm": 0.9706761600012382, + "learning_rate": 0.003, + "loss": 4.065, + "step": 25434 + }, + { + "epoch": 0.25435, + "grad_norm": 1.3222323352698893, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 25435 + }, + { + "epoch": 0.25436, + "grad_norm": 0.795646782107252, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 25436 + }, + { + "epoch": 0.25437, + "grad_norm": 0.7422078669450148, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 25437 + }, + { + "epoch": 0.25438, + "grad_norm": 0.7205753569530813, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 25438 + }, + { + "epoch": 0.25439, + "grad_norm": 0.8374703056208461, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 25439 + }, + { + "epoch": 0.2544, + "grad_norm": 0.9473015924535735, + "learning_rate": 0.003, + "loss": 4.0889, + "step": 25440 + }, + { + "epoch": 0.25441, + "grad_norm": 0.9610956779741119, + "learning_rate": 0.003, + "loss": 4.026, + "step": 25441 + }, + { + "epoch": 0.25442, + "grad_norm": 1.0998434799535048, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 25442 + }, + { + "epoch": 0.25443, + "grad_norm": 0.9517483964201439, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 25443 + }, + { + "epoch": 0.25444, + "grad_norm": 1.0881923435275649, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 25444 + }, + { + "epoch": 0.25445, + "grad_norm": 0.9854540428577792, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 25445 + }, + { + "epoch": 0.25446, + "grad_norm": 0.9937334275077021, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 25446 + }, + { + "epoch": 0.25447, + "grad_norm": 0.8991009518576873, + "learning_rate": 0.003, + "loss": 4.088, + "step": 25447 + }, + { + "epoch": 0.25448, + "grad_norm": 0.7835774811713323, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 25448 + }, + { + "epoch": 0.25449, + "grad_norm": 0.7177785680100887, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 25449 + }, + { + "epoch": 0.2545, + "grad_norm": 0.7240991220140802, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 25450 + }, + { + "epoch": 0.25451, + "grad_norm": 0.8143129993012853, + "learning_rate": 0.003, + "loss": 4.041, + "step": 25451 + }, + { + "epoch": 0.25452, + "grad_norm": 0.932923088842068, + "learning_rate": 0.003, + "loss": 4.028, + "step": 25452 + }, + { + "epoch": 0.25453, + "grad_norm": 1.0511225170303924, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 25453 + }, + { + "epoch": 0.25454, + "grad_norm": 0.9486055805646416, + "learning_rate": 0.003, + "loss": 4.06, + "step": 25454 + }, + { + "epoch": 0.25455, + "grad_norm": 0.8992113606784659, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 25455 + }, + { + "epoch": 0.25456, + "grad_norm": 0.8141542958239314, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 25456 + }, + { + "epoch": 0.25457, + "grad_norm": 0.8003685390606392, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 25457 + }, + { + "epoch": 0.25458, + "grad_norm": 0.7702494461996949, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25458 + }, + { + "epoch": 0.25459, + "grad_norm": 0.7926421691522504, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25459 + }, + { + "epoch": 0.2546, + "grad_norm": 0.9799385857050648, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 25460 + }, + { + "epoch": 0.25461, + "grad_norm": 0.9615310596977051, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 25461 + }, + { + "epoch": 0.25462, + "grad_norm": 0.9028601664226293, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 25462 + }, + { + "epoch": 0.25463, + "grad_norm": 0.9020841310022902, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 25463 + }, + { + "epoch": 0.25464, + "grad_norm": 0.861457753056404, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 25464 + }, + { + "epoch": 0.25465, + "grad_norm": 0.8009973446596149, + "learning_rate": 0.003, + "loss": 4.047, + "step": 25465 + }, + { + "epoch": 0.25466, + "grad_norm": 0.858171763018721, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 25466 + }, + { + "epoch": 0.25467, + "grad_norm": 0.9410375851232814, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 25467 + }, + { + "epoch": 0.25468, + "grad_norm": 0.8383089541302179, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 25468 + }, + { + "epoch": 0.25469, + "grad_norm": 0.8337644242039746, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 25469 + }, + { + "epoch": 0.2547, + "grad_norm": 0.8698234142245929, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 25470 + }, + { + "epoch": 0.25471, + "grad_norm": 0.9416410545568833, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 25471 + }, + { + "epoch": 0.25472, + "grad_norm": 0.9582760517410952, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 25472 + }, + { + "epoch": 0.25473, + "grad_norm": 0.9543998899328856, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 25473 + }, + { + "epoch": 0.25474, + "grad_norm": 0.890724598577648, + "learning_rate": 0.003, + "loss": 4.043, + "step": 25474 + }, + { + "epoch": 0.25475, + "grad_norm": 0.9239564384184911, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 25475 + }, + { + "epoch": 0.25476, + "grad_norm": 0.9457922367645536, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 25476 + }, + { + "epoch": 0.25477, + "grad_norm": 0.8573318977003976, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 25477 + }, + { + "epoch": 0.25478, + "grad_norm": 0.895295965815012, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 25478 + }, + { + "epoch": 0.25479, + "grad_norm": 0.9168892873538358, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 25479 + }, + { + "epoch": 0.2548, + "grad_norm": 0.8741428763811664, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 25480 + }, + { + "epoch": 0.25481, + "grad_norm": 0.9105460421050953, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 25481 + }, + { + "epoch": 0.25482, + "grad_norm": 0.9370675321003953, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 25482 + }, + { + "epoch": 0.25483, + "grad_norm": 1.0718822402509025, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 25483 + }, + { + "epoch": 0.25484, + "grad_norm": 1.0001632500340663, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 25484 + }, + { + "epoch": 0.25485, + "grad_norm": 1.1392574341173547, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 25485 + }, + { + "epoch": 0.25486, + "grad_norm": 1.013848565441649, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 25486 + }, + { + "epoch": 0.25487, + "grad_norm": 0.9946501617885597, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 25487 + }, + { + "epoch": 0.25488, + "grad_norm": 0.8692316044232674, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 25488 + }, + { + "epoch": 0.25489, + "grad_norm": 0.8405934452494283, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 25489 + }, + { + "epoch": 0.2549, + "grad_norm": 0.8184790010403689, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 25490 + }, + { + "epoch": 0.25491, + "grad_norm": 0.7197990368625105, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 25491 + }, + { + "epoch": 0.25492, + "grad_norm": 0.782838330427358, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 25492 + }, + { + "epoch": 0.25493, + "grad_norm": 0.7672056907291688, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 25493 + }, + { + "epoch": 0.25494, + "grad_norm": 0.7505411870142598, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 25494 + }, + { + "epoch": 0.25495, + "grad_norm": 0.8081563798574534, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 25495 + }, + { + "epoch": 0.25496, + "grad_norm": 1.0025012764633239, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 25496 + }, + { + "epoch": 0.25497, + "grad_norm": 1.1301600497198363, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 25497 + }, + { + "epoch": 0.25498, + "grad_norm": 0.7981922005489023, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 25498 + }, + { + "epoch": 0.25499, + "grad_norm": 0.9103924489681806, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 25499 + }, + { + "epoch": 0.255, + "grad_norm": 0.9494091859031275, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25500 + }, + { + "epoch": 0.25501, + "grad_norm": 1.0208856096818113, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 25501 + }, + { + "epoch": 0.25502, + "grad_norm": 1.3111924181603687, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 25502 + }, + { + "epoch": 0.25503, + "grad_norm": 0.7788057062135986, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 25503 + }, + { + "epoch": 0.25504, + "grad_norm": 0.7520958041327547, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 25504 + }, + { + "epoch": 0.25505, + "grad_norm": 0.7187610784459232, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25505 + }, + { + "epoch": 0.25506, + "grad_norm": 0.5748661321152742, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 25506 + }, + { + "epoch": 0.25507, + "grad_norm": 0.5997867976081815, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 25507 + }, + { + "epoch": 0.25508, + "grad_norm": 0.5955428365180547, + "learning_rate": 0.003, + "loss": 4.0836, + "step": 25508 + }, + { + "epoch": 0.25509, + "grad_norm": 0.8084719075610944, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 25509 + }, + { + "epoch": 0.2551, + "grad_norm": 1.0142895679559947, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 25510 + }, + { + "epoch": 0.25511, + "grad_norm": 1.2282173894268338, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 25511 + }, + { + "epoch": 0.25512, + "grad_norm": 0.683460215566792, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 25512 + }, + { + "epoch": 0.25513, + "grad_norm": 0.6797596531837293, + "learning_rate": 0.003, + "loss": 4.04, + "step": 25513 + }, + { + "epoch": 0.25514, + "grad_norm": 0.6641751647633197, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 25514 + }, + { + "epoch": 0.25515, + "grad_norm": 0.657087574467802, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 25515 + }, + { + "epoch": 0.25516, + "grad_norm": 0.6702335307387821, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 25516 + }, + { + "epoch": 0.25517, + "grad_norm": 0.6197197046005322, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 25517 + }, + { + "epoch": 0.25518, + "grad_norm": 0.7558327995708012, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 25518 + }, + { + "epoch": 0.25519, + "grad_norm": 0.8089473984054202, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 25519 + }, + { + "epoch": 0.2552, + "grad_norm": 0.7265745355368336, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 25520 + }, + { + "epoch": 0.25521, + "grad_norm": 0.5823227255198964, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 25521 + }, + { + "epoch": 0.25522, + "grad_norm": 0.6230522270827914, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 25522 + }, + { + "epoch": 0.25523, + "grad_norm": 0.5992914012060027, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 25523 + }, + { + "epoch": 0.25524, + "grad_norm": 0.742508757382326, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 25524 + }, + { + "epoch": 0.25525, + "grad_norm": 1.0403867381216383, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 25525 + }, + { + "epoch": 0.25526, + "grad_norm": 1.4542059590245984, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 25526 + }, + { + "epoch": 0.25527, + "grad_norm": 0.7504983488349494, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 25527 + }, + { + "epoch": 0.25528, + "grad_norm": 0.7723766434979357, + "learning_rate": 0.003, + "loss": 4.024, + "step": 25528 + }, + { + "epoch": 0.25529, + "grad_norm": 0.9118160035062748, + "learning_rate": 0.003, + "loss": 4.038, + "step": 25529 + }, + { + "epoch": 0.2553, + "grad_norm": 0.9095808486226669, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 25530 + }, + { + "epoch": 0.25531, + "grad_norm": 0.8449916001815673, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 25531 + }, + { + "epoch": 0.25532, + "grad_norm": 0.8702440244971412, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 25532 + }, + { + "epoch": 0.25533, + "grad_norm": 0.9794938453352677, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 25533 + }, + { + "epoch": 0.25534, + "grad_norm": 1.2167464027111463, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 25534 + }, + { + "epoch": 0.25535, + "grad_norm": 0.8682439736283258, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 25535 + }, + { + "epoch": 0.25536, + "grad_norm": 0.7341110227597509, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 25536 + }, + { + "epoch": 0.25537, + "grad_norm": 0.780687291149449, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 25537 + }, + { + "epoch": 0.25538, + "grad_norm": 0.8103130226149866, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 25538 + }, + { + "epoch": 0.25539, + "grad_norm": 0.801259339981181, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 25539 + }, + { + "epoch": 0.2554, + "grad_norm": 0.8013416542998175, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 25540 + }, + { + "epoch": 0.25541, + "grad_norm": 1.0429397349442209, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 25541 + }, + { + "epoch": 0.25542, + "grad_norm": 1.1249394254592362, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 25542 + }, + { + "epoch": 0.25543, + "grad_norm": 1.0430346449018129, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 25543 + }, + { + "epoch": 0.25544, + "grad_norm": 1.0034310086022964, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 25544 + }, + { + "epoch": 0.25545, + "grad_norm": 0.9712440289340865, + "learning_rate": 0.003, + "loss": 4.052, + "step": 25545 + }, + { + "epoch": 0.25546, + "grad_norm": 1.213194252001686, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 25546 + }, + { + "epoch": 0.25547, + "grad_norm": 0.9168267254222705, + "learning_rate": 0.003, + "loss": 4.0905, + "step": 25547 + }, + { + "epoch": 0.25548, + "grad_norm": 0.9423992685584162, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 25548 + }, + { + "epoch": 0.25549, + "grad_norm": 1.0913704277648193, + "learning_rate": 0.003, + "loss": 4.0993, + "step": 25549 + }, + { + "epoch": 0.2555, + "grad_norm": 0.8593106401338043, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 25550 + }, + { + "epoch": 0.25551, + "grad_norm": 0.8969179599582954, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 25551 + }, + { + "epoch": 0.25552, + "grad_norm": 0.9422136983288252, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 25552 + }, + { + "epoch": 0.25553, + "grad_norm": 1.2035228954571051, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 25553 + }, + { + "epoch": 0.25554, + "grad_norm": 0.8811183019321478, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 25554 + }, + { + "epoch": 0.25555, + "grad_norm": 0.8789305952076087, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 25555 + }, + { + "epoch": 0.25556, + "grad_norm": 0.8863340009715519, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 25556 + }, + { + "epoch": 0.25557, + "grad_norm": 0.9700979930725055, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 25557 + }, + { + "epoch": 0.25558, + "grad_norm": 1.0179645129320452, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 25558 + }, + { + "epoch": 0.25559, + "grad_norm": 1.0769272307893212, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 25559 + }, + { + "epoch": 0.2556, + "grad_norm": 0.9473458840685861, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 25560 + }, + { + "epoch": 0.25561, + "grad_norm": 0.9506894946488, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 25561 + }, + { + "epoch": 0.25562, + "grad_norm": 1.008624553357757, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 25562 + }, + { + "epoch": 0.25563, + "grad_norm": 1.015411260879144, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 25563 + }, + { + "epoch": 0.25564, + "grad_norm": 0.9091591793628141, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 25564 + }, + { + "epoch": 0.25565, + "grad_norm": 0.7957076530872577, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 25565 + }, + { + "epoch": 0.25566, + "grad_norm": 0.8147177574731048, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 25566 + }, + { + "epoch": 0.25567, + "grad_norm": 0.890155925257158, + "learning_rate": 0.003, + "loss": 4.077, + "step": 25567 + }, + { + "epoch": 0.25568, + "grad_norm": 0.9524484198845756, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 25568 + }, + { + "epoch": 0.25569, + "grad_norm": 0.9976570676585215, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 25569 + }, + { + "epoch": 0.2557, + "grad_norm": 0.9443232980798448, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 25570 + }, + { + "epoch": 0.25571, + "grad_norm": 0.8625075738759506, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 25571 + }, + { + "epoch": 0.25572, + "grad_norm": 0.7747316821490035, + "learning_rate": 0.003, + "loss": 4.038, + "step": 25572 + }, + { + "epoch": 0.25573, + "grad_norm": 0.6705995262255924, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 25573 + }, + { + "epoch": 0.25574, + "grad_norm": 0.7759838653009686, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 25574 + }, + { + "epoch": 0.25575, + "grad_norm": 0.6786614748639236, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 25575 + }, + { + "epoch": 0.25576, + "grad_norm": 0.6569226479781927, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 25576 + }, + { + "epoch": 0.25577, + "grad_norm": 0.6123073315146863, + "learning_rate": 0.003, + "loss": 3.9865, + "step": 25577 + }, + { + "epoch": 0.25578, + "grad_norm": 0.555942781431035, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 25578 + }, + { + "epoch": 0.25579, + "grad_norm": 0.508188277926097, + "learning_rate": 0.003, + "loss": 4.058, + "step": 25579 + }, + { + "epoch": 0.2558, + "grad_norm": 0.48736665887242026, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 25580 + }, + { + "epoch": 0.25581, + "grad_norm": 0.5591794872385124, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 25581 + }, + { + "epoch": 0.25582, + "grad_norm": 0.6799994399804233, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 25582 + }, + { + "epoch": 0.25583, + "grad_norm": 0.7925941771204883, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 25583 + }, + { + "epoch": 0.25584, + "grad_norm": 1.0116346878747515, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25584 + }, + { + "epoch": 0.25585, + "grad_norm": 1.2898610556182433, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 25585 + }, + { + "epoch": 0.25586, + "grad_norm": 0.6588163015969251, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 25586 + }, + { + "epoch": 0.25587, + "grad_norm": 0.6672315270036102, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 25587 + }, + { + "epoch": 0.25588, + "grad_norm": 0.700809253249972, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 25588 + }, + { + "epoch": 0.25589, + "grad_norm": 0.6146810028191083, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 25589 + }, + { + "epoch": 0.2559, + "grad_norm": 0.6556997210054634, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 25590 + }, + { + "epoch": 0.25591, + "grad_norm": 0.8232126219912062, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 25591 + }, + { + "epoch": 0.25592, + "grad_norm": 0.8550132623495436, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 25592 + }, + { + "epoch": 0.25593, + "grad_norm": 0.9510529383893469, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 25593 + }, + { + "epoch": 0.25594, + "grad_norm": 1.4121719430186077, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 25594 + }, + { + "epoch": 0.25595, + "grad_norm": 0.7312173130655611, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 25595 + }, + { + "epoch": 0.25596, + "grad_norm": 0.7739422926134939, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 25596 + }, + { + "epoch": 0.25597, + "grad_norm": 0.9058690802755301, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 25597 + }, + { + "epoch": 0.25598, + "grad_norm": 0.8462997574632294, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 25598 + }, + { + "epoch": 0.25599, + "grad_norm": 0.909678267602146, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 25599 + }, + { + "epoch": 0.256, + "grad_norm": 0.8912180986731276, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 25600 + }, + { + "epoch": 0.25601, + "grad_norm": 0.8666255804984621, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 25601 + }, + { + "epoch": 0.25602, + "grad_norm": 0.9898990004298943, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 25602 + }, + { + "epoch": 0.25603, + "grad_norm": 1.3179788081266002, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25603 + }, + { + "epoch": 0.25604, + "grad_norm": 0.8472022132025522, + "learning_rate": 0.003, + "loss": 4.052, + "step": 25604 + }, + { + "epoch": 0.25605, + "grad_norm": 0.8768467138574475, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 25605 + }, + { + "epoch": 0.25606, + "grad_norm": 0.9307311815448447, + "learning_rate": 0.003, + "loss": 4.0896, + "step": 25606 + }, + { + "epoch": 0.25607, + "grad_norm": 1.028048741438936, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 25607 + }, + { + "epoch": 0.25608, + "grad_norm": 0.959890155600721, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 25608 + }, + { + "epoch": 0.25609, + "grad_norm": 1.0530517493433318, + "learning_rate": 0.003, + "loss": 4.029, + "step": 25609 + }, + { + "epoch": 0.2561, + "grad_norm": 0.9781700754553776, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 25610 + }, + { + "epoch": 0.25611, + "grad_norm": 0.9615322913474433, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 25611 + }, + { + "epoch": 0.25612, + "grad_norm": 0.9705404058828069, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 25612 + }, + { + "epoch": 0.25613, + "grad_norm": 1.0440101934038097, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 25613 + }, + { + "epoch": 0.25614, + "grad_norm": 1.1459438755959968, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 25614 + }, + { + "epoch": 0.25615, + "grad_norm": 1.055613045407354, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 25615 + }, + { + "epoch": 0.25616, + "grad_norm": 0.8461364468505412, + "learning_rate": 0.003, + "loss": 4.0894, + "step": 25616 + }, + { + "epoch": 0.25617, + "grad_norm": 0.8194803481270396, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 25617 + }, + { + "epoch": 0.25618, + "grad_norm": 0.7574136598273855, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 25618 + }, + { + "epoch": 0.25619, + "grad_norm": 0.6849649655458872, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 25619 + }, + { + "epoch": 0.2562, + "grad_norm": 0.6740934658316818, + "learning_rate": 0.003, + "loss": 4.076, + "step": 25620 + }, + { + "epoch": 0.25621, + "grad_norm": 0.7352822610255584, + "learning_rate": 0.003, + "loss": 4.068, + "step": 25621 + }, + { + "epoch": 0.25622, + "grad_norm": 0.8764602573807927, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 25622 + }, + { + "epoch": 0.25623, + "grad_norm": 0.8715898480093845, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 25623 + }, + { + "epoch": 0.25624, + "grad_norm": 0.8413433696271648, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 25624 + }, + { + "epoch": 0.25625, + "grad_norm": 0.9523683103187759, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 25625 + }, + { + "epoch": 0.25626, + "grad_norm": 1.2790198080640687, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 25626 + }, + { + "epoch": 0.25627, + "grad_norm": 0.8367658837111188, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 25627 + }, + { + "epoch": 0.25628, + "grad_norm": 0.7835435226818986, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 25628 + }, + { + "epoch": 0.25629, + "grad_norm": 0.815543660024562, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 25629 + }, + { + "epoch": 0.2563, + "grad_norm": 0.8565677365740393, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 25630 + }, + { + "epoch": 0.25631, + "grad_norm": 0.9228440531461264, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 25631 + }, + { + "epoch": 0.25632, + "grad_norm": 1.0810341151556777, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 25632 + }, + { + "epoch": 0.25633, + "grad_norm": 1.0614724139330207, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 25633 + }, + { + "epoch": 0.25634, + "grad_norm": 0.9351934832162114, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 25634 + }, + { + "epoch": 0.25635, + "grad_norm": 0.9388407753147652, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 25635 + }, + { + "epoch": 0.25636, + "grad_norm": 0.962198303489308, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 25636 + }, + { + "epoch": 0.25637, + "grad_norm": 1.0712447127686475, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 25637 + }, + { + "epoch": 0.25638, + "grad_norm": 0.9816949890034204, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 25638 + }, + { + "epoch": 0.25639, + "grad_norm": 1.0146084575011993, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 25639 + }, + { + "epoch": 0.2564, + "grad_norm": 0.9697545953193931, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 25640 + }, + { + "epoch": 0.25641, + "grad_norm": 0.904696813168741, + "learning_rate": 0.003, + "loss": 4.072, + "step": 25641 + }, + { + "epoch": 0.25642, + "grad_norm": 0.8029834457338992, + "learning_rate": 0.003, + "loss": 4.063, + "step": 25642 + }, + { + "epoch": 0.25643, + "grad_norm": 0.7341474364686016, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 25643 + }, + { + "epoch": 0.25644, + "grad_norm": 0.7317341320643208, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 25644 + }, + { + "epoch": 0.25645, + "grad_norm": 0.7290685796342252, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 25645 + }, + { + "epoch": 0.25646, + "grad_norm": 0.8389223548237573, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 25646 + }, + { + "epoch": 0.25647, + "grad_norm": 0.7323136602680181, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 25647 + }, + { + "epoch": 0.25648, + "grad_norm": 0.7006458012311327, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 25648 + }, + { + "epoch": 0.25649, + "grad_norm": 0.8796614539421426, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 25649 + }, + { + "epoch": 0.2565, + "grad_norm": 1.134289283000181, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 25650 + }, + { + "epoch": 0.25651, + "grad_norm": 1.0994671854732656, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25651 + }, + { + "epoch": 0.25652, + "grad_norm": 1.0696433314732081, + "learning_rate": 0.003, + "loss": 4.026, + "step": 25652 + }, + { + "epoch": 0.25653, + "grad_norm": 0.9782450421160436, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 25653 + }, + { + "epoch": 0.25654, + "grad_norm": 0.9176937429881871, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 25654 + }, + { + "epoch": 0.25655, + "grad_norm": 1.0226859288837291, + "learning_rate": 0.003, + "loss": 4.025, + "step": 25655 + }, + { + "epoch": 0.25656, + "grad_norm": 1.1466472485479016, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 25656 + }, + { + "epoch": 0.25657, + "grad_norm": 0.8646470179603246, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 25657 + }, + { + "epoch": 0.25658, + "grad_norm": 0.8301829953055189, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 25658 + }, + { + "epoch": 0.25659, + "grad_norm": 0.8899418748121447, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 25659 + }, + { + "epoch": 0.2566, + "grad_norm": 0.7554487148446792, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 25660 + }, + { + "epoch": 0.25661, + "grad_norm": 0.7664215667047021, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 25661 + }, + { + "epoch": 0.25662, + "grad_norm": 0.7743180600402911, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 25662 + }, + { + "epoch": 0.25663, + "grad_norm": 0.7839466630831146, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 25663 + }, + { + "epoch": 0.25664, + "grad_norm": 0.8355764237406392, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 25664 + }, + { + "epoch": 0.25665, + "grad_norm": 0.9217610469090546, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 25665 + }, + { + "epoch": 0.25666, + "grad_norm": 1.0406837259996145, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 25666 + }, + { + "epoch": 0.25667, + "grad_norm": 1.0871064122799725, + "learning_rate": 0.003, + "loss": 4.037, + "step": 25667 + }, + { + "epoch": 0.25668, + "grad_norm": 0.8762911761659423, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 25668 + }, + { + "epoch": 0.25669, + "grad_norm": 0.7654352563074435, + "learning_rate": 0.003, + "loss": 4.069, + "step": 25669 + }, + { + "epoch": 0.2567, + "grad_norm": 0.6871249401806157, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 25670 + }, + { + "epoch": 0.25671, + "grad_norm": 0.7061119219021542, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 25671 + }, + { + "epoch": 0.25672, + "grad_norm": 0.7579857118313267, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 25672 + }, + { + "epoch": 0.25673, + "grad_norm": 0.8100201997864876, + "learning_rate": 0.003, + "loss": 4.064, + "step": 25673 + }, + { + "epoch": 0.25674, + "grad_norm": 0.9258903405609418, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 25674 + }, + { + "epoch": 0.25675, + "grad_norm": 0.986459741839688, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 25675 + }, + { + "epoch": 0.25676, + "grad_norm": 0.9253357279033798, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 25676 + }, + { + "epoch": 0.25677, + "grad_norm": 0.8626855461393848, + "learning_rate": 0.003, + "loss": 4.0875, + "step": 25677 + }, + { + "epoch": 0.25678, + "grad_norm": 0.7917328886376643, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 25678 + }, + { + "epoch": 0.25679, + "grad_norm": 0.8560363159827711, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 25679 + }, + { + "epoch": 0.2568, + "grad_norm": 0.8596436207762913, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 25680 + }, + { + "epoch": 0.25681, + "grad_norm": 0.8531007742661948, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 25681 + }, + { + "epoch": 0.25682, + "grad_norm": 0.9904724442011617, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 25682 + }, + { + "epoch": 0.25683, + "grad_norm": 1.0273013989368105, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 25683 + }, + { + "epoch": 0.25684, + "grad_norm": 0.8995409725632936, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 25684 + }, + { + "epoch": 0.25685, + "grad_norm": 0.7661760095224859, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 25685 + }, + { + "epoch": 0.25686, + "grad_norm": 0.6275484598990031, + "learning_rate": 0.003, + "loss": 4.052, + "step": 25686 + }, + { + "epoch": 0.25687, + "grad_norm": 0.7235673533566921, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 25687 + }, + { + "epoch": 0.25688, + "grad_norm": 0.8798494700149826, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 25688 + }, + { + "epoch": 0.25689, + "grad_norm": 1.0494151545547452, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 25689 + }, + { + "epoch": 0.2569, + "grad_norm": 1.1671667975695061, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 25690 + }, + { + "epoch": 0.25691, + "grad_norm": 0.7408143499619712, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 25691 + }, + { + "epoch": 0.25692, + "grad_norm": 0.7026773157590855, + "learning_rate": 0.003, + "loss": 4.028, + "step": 25692 + }, + { + "epoch": 0.25693, + "grad_norm": 0.8980310242223455, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 25693 + }, + { + "epoch": 0.25694, + "grad_norm": 1.0824228126626987, + "learning_rate": 0.003, + "loss": 4.035, + "step": 25694 + }, + { + "epoch": 0.25695, + "grad_norm": 0.924279239306092, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 25695 + }, + { + "epoch": 0.25696, + "grad_norm": 0.753752479617001, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 25696 + }, + { + "epoch": 0.25697, + "grad_norm": 0.6580218777582888, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 25697 + }, + { + "epoch": 0.25698, + "grad_norm": 0.6875077978791423, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 25698 + }, + { + "epoch": 0.25699, + "grad_norm": 0.6162836382824428, + "learning_rate": 0.003, + "loss": 4.045, + "step": 25699 + }, + { + "epoch": 0.257, + "grad_norm": 0.5708048517756945, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 25700 + }, + { + "epoch": 0.25701, + "grad_norm": 0.5898817318191806, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 25701 + }, + { + "epoch": 0.25702, + "grad_norm": 0.6010757892884534, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 25702 + }, + { + "epoch": 0.25703, + "grad_norm": 0.6421809623132237, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 25703 + }, + { + "epoch": 0.25704, + "grad_norm": 0.6782850962867656, + "learning_rate": 0.003, + "loss": 4.027, + "step": 25704 + }, + { + "epoch": 0.25705, + "grad_norm": 0.6860537944354691, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 25705 + }, + { + "epoch": 0.25706, + "grad_norm": 0.8158097591612413, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 25706 + }, + { + "epoch": 0.25707, + "grad_norm": 1.1273595771592388, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 25707 + }, + { + "epoch": 0.25708, + "grad_norm": 1.1207301596591779, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 25708 + }, + { + "epoch": 0.25709, + "grad_norm": 1.058964295915854, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 25709 + }, + { + "epoch": 0.2571, + "grad_norm": 0.9343641816461868, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 25710 + }, + { + "epoch": 0.25711, + "grad_norm": 0.9226828660493398, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 25711 + }, + { + "epoch": 0.25712, + "grad_norm": 1.0302871551029846, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 25712 + }, + { + "epoch": 0.25713, + "grad_norm": 0.9643125705497843, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 25713 + }, + { + "epoch": 0.25714, + "grad_norm": 1.0836966712936023, + "learning_rate": 0.003, + "loss": 4.0885, + "step": 25714 + }, + { + "epoch": 0.25715, + "grad_norm": 0.9690842198080638, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 25715 + }, + { + "epoch": 0.25716, + "grad_norm": 1.2217463202640368, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25716 + }, + { + "epoch": 0.25717, + "grad_norm": 1.115843673163455, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 25717 + }, + { + "epoch": 0.25718, + "grad_norm": 0.9141255348458905, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 25718 + }, + { + "epoch": 0.25719, + "grad_norm": 0.7814168289509926, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 25719 + }, + { + "epoch": 0.2572, + "grad_norm": 0.8702282404854796, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 25720 + }, + { + "epoch": 0.25721, + "grad_norm": 0.8872346559261546, + "learning_rate": 0.003, + "loss": 4.062, + "step": 25721 + }, + { + "epoch": 0.25722, + "grad_norm": 0.8800332032413002, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 25722 + }, + { + "epoch": 0.25723, + "grad_norm": 0.9926870576404875, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 25723 + }, + { + "epoch": 0.25724, + "grad_norm": 0.9740608546546412, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 25724 + }, + { + "epoch": 0.25725, + "grad_norm": 0.9598478971687701, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 25725 + }, + { + "epoch": 0.25726, + "grad_norm": 1.1782344971632273, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 25726 + }, + { + "epoch": 0.25727, + "grad_norm": 0.8986815958279982, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 25727 + }, + { + "epoch": 0.25728, + "grad_norm": 0.8982273220523551, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 25728 + }, + { + "epoch": 0.25729, + "grad_norm": 0.9391246262802219, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 25729 + }, + { + "epoch": 0.2573, + "grad_norm": 0.8725527593120934, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 25730 + }, + { + "epoch": 0.25731, + "grad_norm": 0.9242694734094682, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 25731 + }, + { + "epoch": 0.25732, + "grad_norm": 1.058437329058568, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 25732 + }, + { + "epoch": 0.25733, + "grad_norm": 1.0376223346138786, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 25733 + }, + { + "epoch": 0.25734, + "grad_norm": 0.8543840355948839, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 25734 + }, + { + "epoch": 0.25735, + "grad_norm": 0.7257909123798285, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 25735 + }, + { + "epoch": 0.25736, + "grad_norm": 0.7997216759158365, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 25736 + }, + { + "epoch": 0.25737, + "grad_norm": 0.9325336733075685, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 25737 + }, + { + "epoch": 0.25738, + "grad_norm": 0.980629589777745, + "learning_rate": 0.003, + "loss": 3.996, + "step": 25738 + }, + { + "epoch": 0.25739, + "grad_norm": 0.9973327509693599, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 25739 + }, + { + "epoch": 0.2574, + "grad_norm": 0.9908197866583291, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 25740 + }, + { + "epoch": 0.25741, + "grad_norm": 0.9549293432185175, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 25741 + }, + { + "epoch": 0.25742, + "grad_norm": 1.0225836593225683, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 25742 + }, + { + "epoch": 0.25743, + "grad_norm": 1.1774744232298078, + "learning_rate": 0.003, + "loss": 4.03, + "step": 25743 + }, + { + "epoch": 0.25744, + "grad_norm": 0.8781227743866601, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 25744 + }, + { + "epoch": 0.25745, + "grad_norm": 0.6880503915672852, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 25745 + }, + { + "epoch": 0.25746, + "grad_norm": 0.6580475514599924, + "learning_rate": 0.003, + "loss": 4.056, + "step": 25746 + }, + { + "epoch": 0.25747, + "grad_norm": 0.6756631140243022, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 25747 + }, + { + "epoch": 0.25748, + "grad_norm": 0.7968680113685896, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 25748 + }, + { + "epoch": 0.25749, + "grad_norm": 0.8426893521184446, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 25749 + }, + { + "epoch": 0.2575, + "grad_norm": 0.795421932609609, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 25750 + }, + { + "epoch": 0.25751, + "grad_norm": 0.769803752182991, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 25751 + }, + { + "epoch": 0.25752, + "grad_norm": 0.8231547235797607, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 25752 + }, + { + "epoch": 0.25753, + "grad_norm": 0.7707796307353454, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 25753 + }, + { + "epoch": 0.25754, + "grad_norm": 0.7720654428495404, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 25754 + }, + { + "epoch": 0.25755, + "grad_norm": 0.8182178589963739, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 25755 + }, + { + "epoch": 0.25756, + "grad_norm": 0.7238280484282918, + "learning_rate": 0.003, + "loss": 4.05, + "step": 25756 + }, + { + "epoch": 0.25757, + "grad_norm": 0.6772948109937411, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 25757 + }, + { + "epoch": 0.25758, + "grad_norm": 0.7678719313834955, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 25758 + }, + { + "epoch": 0.25759, + "grad_norm": 0.9344363985664037, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 25759 + }, + { + "epoch": 0.2576, + "grad_norm": 1.154656001649035, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 25760 + }, + { + "epoch": 0.25761, + "grad_norm": 0.7646643440630714, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 25761 + }, + { + "epoch": 0.25762, + "grad_norm": 0.6659631380371734, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 25762 + }, + { + "epoch": 0.25763, + "grad_norm": 0.7176069338262329, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 25763 + }, + { + "epoch": 0.25764, + "grad_norm": 0.7030459219151513, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 25764 + }, + { + "epoch": 0.25765, + "grad_norm": 0.9200535271219515, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 25765 + }, + { + "epoch": 0.25766, + "grad_norm": 1.2085399986251164, + "learning_rate": 0.003, + "loss": 4.054, + "step": 25766 + }, + { + "epoch": 0.25767, + "grad_norm": 0.9070711000996539, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 25767 + }, + { + "epoch": 0.25768, + "grad_norm": 0.9551722803549869, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 25768 + }, + { + "epoch": 0.25769, + "grad_norm": 1.0473348674472607, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 25769 + }, + { + "epoch": 0.2577, + "grad_norm": 0.9034143418257037, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 25770 + }, + { + "epoch": 0.25771, + "grad_norm": 0.9576962150044424, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 25771 + }, + { + "epoch": 0.25772, + "grad_norm": 0.8674523085836866, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 25772 + }, + { + "epoch": 0.25773, + "grad_norm": 0.8693565533491876, + "learning_rate": 0.003, + "loss": 4.045, + "step": 25773 + }, + { + "epoch": 0.25774, + "grad_norm": 0.9475099252831082, + "learning_rate": 0.003, + "loss": 4.047, + "step": 25774 + }, + { + "epoch": 0.25775, + "grad_norm": 1.0200939862649179, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 25775 + }, + { + "epoch": 0.25776, + "grad_norm": 0.9954123108957649, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 25776 + }, + { + "epoch": 0.25777, + "grad_norm": 0.897136140992167, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 25777 + }, + { + "epoch": 0.25778, + "grad_norm": 0.8759373911773831, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 25778 + }, + { + "epoch": 0.25779, + "grad_norm": 0.8813415194178811, + "learning_rate": 0.003, + "loss": 4.069, + "step": 25779 + }, + { + "epoch": 0.2578, + "grad_norm": 0.9494621897883795, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 25780 + }, + { + "epoch": 0.25781, + "grad_norm": 1.0759895692576578, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 25781 + }, + { + "epoch": 0.25782, + "grad_norm": 1.0767392451194029, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25782 + }, + { + "epoch": 0.25783, + "grad_norm": 0.99383370567182, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 25783 + }, + { + "epoch": 0.25784, + "grad_norm": 1.103720313872647, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 25784 + }, + { + "epoch": 0.25785, + "grad_norm": 1.0266387934492864, + "learning_rate": 0.003, + "loss": 4.064, + "step": 25785 + }, + { + "epoch": 0.25786, + "grad_norm": 0.925162090251453, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 25786 + }, + { + "epoch": 0.25787, + "grad_norm": 0.7942677904027319, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 25787 + }, + { + "epoch": 0.25788, + "grad_norm": 0.8909008635908302, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 25788 + }, + { + "epoch": 0.25789, + "grad_norm": 0.9280757163826667, + "learning_rate": 0.003, + "loss": 4.028, + "step": 25789 + }, + { + "epoch": 0.2579, + "grad_norm": 0.7453131310347371, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 25790 + }, + { + "epoch": 0.25791, + "grad_norm": 0.7665388665222183, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 25791 + }, + { + "epoch": 0.25792, + "grad_norm": 0.7868354289513428, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 25792 + }, + { + "epoch": 0.25793, + "grad_norm": 0.6918202009526567, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 25793 + }, + { + "epoch": 0.25794, + "grad_norm": 0.7564816172491939, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 25794 + }, + { + "epoch": 0.25795, + "grad_norm": 1.0367794617615937, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 25795 + }, + { + "epoch": 0.25796, + "grad_norm": 1.1524336035187397, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 25796 + }, + { + "epoch": 0.25797, + "grad_norm": 0.7277175467416978, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 25797 + }, + { + "epoch": 0.25798, + "grad_norm": 0.7094363324290363, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 25798 + }, + { + "epoch": 0.25799, + "grad_norm": 0.6298351131852827, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 25799 + }, + { + "epoch": 0.258, + "grad_norm": 0.6481622423664767, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 25800 + }, + { + "epoch": 0.25801, + "grad_norm": 0.5779186357375955, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 25801 + }, + { + "epoch": 0.25802, + "grad_norm": 0.5033090271479658, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 25802 + }, + { + "epoch": 0.25803, + "grad_norm": 0.5136676267862368, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 25803 + }, + { + "epoch": 0.25804, + "grad_norm": 0.5945167478390637, + "learning_rate": 0.003, + "loss": 4.04, + "step": 25804 + }, + { + "epoch": 0.25805, + "grad_norm": 0.651430175051021, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 25805 + }, + { + "epoch": 0.25806, + "grad_norm": 0.8020861174781325, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 25806 + }, + { + "epoch": 0.25807, + "grad_norm": 1.0394046100369074, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 25807 + }, + { + "epoch": 0.25808, + "grad_norm": 1.1820717808178223, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 25808 + }, + { + "epoch": 0.25809, + "grad_norm": 0.9585555198895713, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 25809 + }, + { + "epoch": 0.2581, + "grad_norm": 1.0524183857996274, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 25810 + }, + { + "epoch": 0.25811, + "grad_norm": 0.9594371755258403, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 25811 + }, + { + "epoch": 0.25812, + "grad_norm": 0.8475604564211586, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 25812 + }, + { + "epoch": 0.25813, + "grad_norm": 0.8442216772149274, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 25813 + }, + { + "epoch": 0.25814, + "grad_norm": 1.0356576458822522, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 25814 + }, + { + "epoch": 0.25815, + "grad_norm": 1.2569038573350582, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 25815 + }, + { + "epoch": 0.25816, + "grad_norm": 0.6622131297530263, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 25816 + }, + { + "epoch": 0.25817, + "grad_norm": 0.6320970930100901, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 25817 + }, + { + "epoch": 0.25818, + "grad_norm": 0.67163893254025, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 25818 + }, + { + "epoch": 0.25819, + "grad_norm": 0.6654415833982926, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 25819 + }, + { + "epoch": 0.2582, + "grad_norm": 0.8112589598221958, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25820 + }, + { + "epoch": 0.25821, + "grad_norm": 0.9196013931368918, + "learning_rate": 0.003, + "loss": 3.9979, + "step": 25821 + }, + { + "epoch": 0.25822, + "grad_norm": 1.0425288442262701, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 25822 + }, + { + "epoch": 0.25823, + "grad_norm": 0.8425794297654894, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 25823 + }, + { + "epoch": 0.25824, + "grad_norm": 0.7968565549634317, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 25824 + }, + { + "epoch": 0.25825, + "grad_norm": 0.8450979405567284, + "learning_rate": 0.003, + "loss": 4.046, + "step": 25825 + }, + { + "epoch": 0.25826, + "grad_norm": 0.8863244259587216, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 25826 + }, + { + "epoch": 0.25827, + "grad_norm": 1.0289805153391658, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 25827 + }, + { + "epoch": 0.25828, + "grad_norm": 1.1437242005962354, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 25828 + }, + { + "epoch": 0.25829, + "grad_norm": 0.8939463470666488, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 25829 + }, + { + "epoch": 0.2583, + "grad_norm": 0.8803308972100106, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 25830 + }, + { + "epoch": 0.25831, + "grad_norm": 0.7933552530329523, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 25831 + }, + { + "epoch": 0.25832, + "grad_norm": 0.9356777367398023, + "learning_rate": 0.003, + "loss": 4.049, + "step": 25832 + }, + { + "epoch": 0.25833, + "grad_norm": 0.9516656042341312, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 25833 + }, + { + "epoch": 0.25834, + "grad_norm": 0.9193313586915596, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 25834 + }, + { + "epoch": 0.25835, + "grad_norm": 0.9634382042760945, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 25835 + }, + { + "epoch": 0.25836, + "grad_norm": 0.9335316656342327, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 25836 + }, + { + "epoch": 0.25837, + "grad_norm": 0.8970002937832366, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 25837 + }, + { + "epoch": 0.25838, + "grad_norm": 0.9901839838475006, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 25838 + }, + { + "epoch": 0.25839, + "grad_norm": 1.2037838117017072, + "learning_rate": 0.003, + "loss": 4.0906, + "step": 25839 + }, + { + "epoch": 0.2584, + "grad_norm": 0.8802229977940653, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 25840 + }, + { + "epoch": 0.25841, + "grad_norm": 0.882073749070165, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 25841 + }, + { + "epoch": 0.25842, + "grad_norm": 1.0082580681189421, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 25842 + }, + { + "epoch": 0.25843, + "grad_norm": 0.874982647275984, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 25843 + }, + { + "epoch": 0.25844, + "grad_norm": 0.8621735796180681, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 25844 + }, + { + "epoch": 0.25845, + "grad_norm": 0.8932734465674992, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 25845 + }, + { + "epoch": 0.25846, + "grad_norm": 1.0257582556690623, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 25846 + }, + { + "epoch": 0.25847, + "grad_norm": 1.2066263872531944, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 25847 + }, + { + "epoch": 0.25848, + "grad_norm": 0.8650901957642076, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 25848 + }, + { + "epoch": 0.25849, + "grad_norm": 0.8667146177648227, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 25849 + }, + { + "epoch": 0.2585, + "grad_norm": 0.9662269336997142, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 25850 + }, + { + "epoch": 0.25851, + "grad_norm": 1.2057527719941996, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 25851 + }, + { + "epoch": 0.25852, + "grad_norm": 0.8808538520293002, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 25852 + }, + { + "epoch": 0.25853, + "grad_norm": 0.8309043526801995, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 25853 + }, + { + "epoch": 0.25854, + "grad_norm": 0.9413281953063685, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 25854 + }, + { + "epoch": 0.25855, + "grad_norm": 0.9191799449942011, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 25855 + }, + { + "epoch": 0.25856, + "grad_norm": 0.9198332894436225, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 25856 + }, + { + "epoch": 0.25857, + "grad_norm": 0.9436250340763263, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 25857 + }, + { + "epoch": 0.25858, + "grad_norm": 1.001050141894837, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 25858 + }, + { + "epoch": 0.25859, + "grad_norm": 0.9844950994709294, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 25859 + }, + { + "epoch": 0.2586, + "grad_norm": 0.8786801301635868, + "learning_rate": 0.003, + "loss": 4.055, + "step": 25860 + }, + { + "epoch": 0.25861, + "grad_norm": 0.7180851277303991, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 25861 + }, + { + "epoch": 0.25862, + "grad_norm": 0.6390804457022464, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 25862 + }, + { + "epoch": 0.25863, + "grad_norm": 0.5646052416997042, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 25863 + }, + { + "epoch": 0.25864, + "grad_norm": 0.5339932331561656, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 25864 + }, + { + "epoch": 0.25865, + "grad_norm": 0.5683587474693323, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 25865 + }, + { + "epoch": 0.25866, + "grad_norm": 0.5984531195999825, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 25866 + }, + { + "epoch": 0.25867, + "grad_norm": 0.5521101672393206, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 25867 + }, + { + "epoch": 0.25868, + "grad_norm": 0.6007304567688321, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 25868 + }, + { + "epoch": 0.25869, + "grad_norm": 0.6559460457821347, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 25869 + }, + { + "epoch": 0.2587, + "grad_norm": 0.866258462499432, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 25870 + }, + { + "epoch": 0.25871, + "grad_norm": 1.2472532541149866, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 25871 + }, + { + "epoch": 0.25872, + "grad_norm": 0.8446041709917281, + "learning_rate": 0.003, + "loss": 4.04, + "step": 25872 + }, + { + "epoch": 0.25873, + "grad_norm": 0.6451099728966582, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 25873 + }, + { + "epoch": 0.25874, + "grad_norm": 0.621428619108411, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 25874 + }, + { + "epoch": 0.25875, + "grad_norm": 0.6419477296677296, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 25875 + }, + { + "epoch": 0.25876, + "grad_norm": 0.7309942038750866, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 25876 + }, + { + "epoch": 0.25877, + "grad_norm": 0.9223989918023651, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 25877 + }, + { + "epoch": 0.25878, + "grad_norm": 1.1371450193314694, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 25878 + }, + { + "epoch": 0.25879, + "grad_norm": 0.9127424359215026, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 25879 + }, + { + "epoch": 0.2588, + "grad_norm": 0.898105141764287, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 25880 + }, + { + "epoch": 0.25881, + "grad_norm": 0.8870293237976957, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 25881 + }, + { + "epoch": 0.25882, + "grad_norm": 0.8571869344215011, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 25882 + }, + { + "epoch": 0.25883, + "grad_norm": 0.8373273506868274, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 25883 + }, + { + "epoch": 0.25884, + "grad_norm": 0.8422612089063182, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 25884 + }, + { + "epoch": 0.25885, + "grad_norm": 0.9146282554344322, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 25885 + }, + { + "epoch": 0.25886, + "grad_norm": 0.7739463592685634, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25886 + }, + { + "epoch": 0.25887, + "grad_norm": 0.7562467325592032, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 25887 + }, + { + "epoch": 0.25888, + "grad_norm": 0.9399885995128254, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 25888 + }, + { + "epoch": 0.25889, + "grad_norm": 1.1139726599018331, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 25889 + }, + { + "epoch": 0.2589, + "grad_norm": 0.956585460755578, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 25890 + }, + { + "epoch": 0.25891, + "grad_norm": 1.048416275823911, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 25891 + }, + { + "epoch": 0.25892, + "grad_norm": 1.3350114298512115, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 25892 + }, + { + "epoch": 0.25893, + "grad_norm": 0.8264402436976467, + "learning_rate": 0.003, + "loss": 4.067, + "step": 25893 + }, + { + "epoch": 0.25894, + "grad_norm": 0.8854505957219764, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 25894 + }, + { + "epoch": 0.25895, + "grad_norm": 0.958151037817533, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 25895 + }, + { + "epoch": 0.25896, + "grad_norm": 1.068271233532324, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 25896 + }, + { + "epoch": 0.25897, + "grad_norm": 0.7958327895186796, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 25897 + }, + { + "epoch": 0.25898, + "grad_norm": 0.8441338680320615, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 25898 + }, + { + "epoch": 0.25899, + "grad_norm": 0.7808939378255455, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 25899 + }, + { + "epoch": 0.259, + "grad_norm": 0.7333410233655898, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 25900 + }, + { + "epoch": 0.25901, + "grad_norm": 0.7421629819934148, + "learning_rate": 0.003, + "loss": 4.073, + "step": 25901 + }, + { + "epoch": 0.25902, + "grad_norm": 0.769639933952084, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 25902 + }, + { + "epoch": 0.25903, + "grad_norm": 0.7747913705562472, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 25903 + }, + { + "epoch": 0.25904, + "grad_norm": 1.0656976329485086, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 25904 + }, + { + "epoch": 0.25905, + "grad_norm": 1.2563630347617631, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 25905 + }, + { + "epoch": 0.25906, + "grad_norm": 0.8854321809695271, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 25906 + }, + { + "epoch": 0.25907, + "grad_norm": 0.8059072358321804, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 25907 + }, + { + "epoch": 0.25908, + "grad_norm": 0.8138147215454772, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 25908 + }, + { + "epoch": 0.25909, + "grad_norm": 0.7817918998199807, + "learning_rate": 0.003, + "loss": 4.061, + "step": 25909 + }, + { + "epoch": 0.2591, + "grad_norm": 0.736903925131948, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 25910 + }, + { + "epoch": 0.25911, + "grad_norm": 0.7987525143998383, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 25911 + }, + { + "epoch": 0.25912, + "grad_norm": 0.835157078093261, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 25912 + }, + { + "epoch": 0.25913, + "grad_norm": 0.8077293035560028, + "learning_rate": 0.003, + "loss": 4.026, + "step": 25913 + }, + { + "epoch": 0.25914, + "grad_norm": 0.7429075380762054, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 25914 + }, + { + "epoch": 0.25915, + "grad_norm": 0.6890816593703376, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 25915 + }, + { + "epoch": 0.25916, + "grad_norm": 0.7534742554035114, + "learning_rate": 0.003, + "loss": 4.059, + "step": 25916 + }, + { + "epoch": 0.25917, + "grad_norm": 0.9248898216498914, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 25917 + }, + { + "epoch": 0.25918, + "grad_norm": 1.225960246634099, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 25918 + }, + { + "epoch": 0.25919, + "grad_norm": 1.107042752603581, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 25919 + }, + { + "epoch": 0.2592, + "grad_norm": 0.8927881538546854, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 25920 + }, + { + "epoch": 0.25921, + "grad_norm": 0.9997249781762862, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 25921 + }, + { + "epoch": 0.25922, + "grad_norm": 1.118074978306857, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 25922 + }, + { + "epoch": 0.25923, + "grad_norm": 0.9360974885277822, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 25923 + }, + { + "epoch": 0.25924, + "grad_norm": 0.9429028883868438, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 25924 + }, + { + "epoch": 0.25925, + "grad_norm": 0.8601733760442539, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 25925 + }, + { + "epoch": 0.25926, + "grad_norm": 0.8769605528666334, + "learning_rate": 0.003, + "loss": 4.054, + "step": 25926 + }, + { + "epoch": 0.25927, + "grad_norm": 0.8481890702461181, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 25927 + }, + { + "epoch": 0.25928, + "grad_norm": 0.8661471111995374, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 25928 + }, + { + "epoch": 0.25929, + "grad_norm": 0.7393976948562561, + "learning_rate": 0.003, + "loss": 4.048, + "step": 25929 + }, + { + "epoch": 0.2593, + "grad_norm": 0.7644322353547663, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 25930 + }, + { + "epoch": 0.25931, + "grad_norm": 0.7570464830441276, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 25931 + }, + { + "epoch": 0.25932, + "grad_norm": 0.7584386440374044, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 25932 + }, + { + "epoch": 0.25933, + "grad_norm": 0.8084914853375832, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 25933 + }, + { + "epoch": 0.25934, + "grad_norm": 0.9295254742130984, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 25934 + }, + { + "epoch": 0.25935, + "grad_norm": 0.9964237938787681, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 25935 + }, + { + "epoch": 0.25936, + "grad_norm": 0.9839443778873309, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 25936 + }, + { + "epoch": 0.25937, + "grad_norm": 0.9517870221082645, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 25937 + }, + { + "epoch": 0.25938, + "grad_norm": 0.8718998656602899, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 25938 + }, + { + "epoch": 0.25939, + "grad_norm": 0.8505759838773718, + "learning_rate": 0.003, + "loss": 4.053, + "step": 25939 + }, + { + "epoch": 0.2594, + "grad_norm": 0.7463630794894784, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 25940 + }, + { + "epoch": 0.25941, + "grad_norm": 0.626229569035502, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 25941 + }, + { + "epoch": 0.25942, + "grad_norm": 0.49408710913084736, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 25942 + }, + { + "epoch": 0.25943, + "grad_norm": 0.5257735751132183, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 25943 + }, + { + "epoch": 0.25944, + "grad_norm": 0.5924549057047243, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 25944 + }, + { + "epoch": 0.25945, + "grad_norm": 0.6983411196812239, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 25945 + }, + { + "epoch": 0.25946, + "grad_norm": 0.888046310634318, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 25946 + }, + { + "epoch": 0.25947, + "grad_norm": 1.1124840738111583, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 25947 + }, + { + "epoch": 0.25948, + "grad_norm": 1.228698154302832, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 25948 + }, + { + "epoch": 0.25949, + "grad_norm": 0.9747231765418726, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 25949 + }, + { + "epoch": 0.2595, + "grad_norm": 0.9428329202593738, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 25950 + }, + { + "epoch": 0.25951, + "grad_norm": 1.181130542815705, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 25951 + }, + { + "epoch": 0.25952, + "grad_norm": 1.029283320988199, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 25952 + }, + { + "epoch": 0.25953, + "grad_norm": 0.9231522500756801, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 25953 + }, + { + "epoch": 0.25954, + "grad_norm": 0.9145587183891746, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 25954 + }, + { + "epoch": 0.25955, + "grad_norm": 0.8742515667345413, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 25955 + }, + { + "epoch": 0.25956, + "grad_norm": 0.8784317182165235, + "learning_rate": 0.003, + "loss": 4.05, + "step": 25956 + }, + { + "epoch": 0.25957, + "grad_norm": 0.7918525254031997, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 25957 + }, + { + "epoch": 0.25958, + "grad_norm": 0.77294678671553, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 25958 + }, + { + "epoch": 0.25959, + "grad_norm": 0.7908012745806853, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 25959 + }, + { + "epoch": 0.2596, + "grad_norm": 0.7282710268878122, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 25960 + }, + { + "epoch": 0.25961, + "grad_norm": 0.7346603375071983, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 25961 + }, + { + "epoch": 0.25962, + "grad_norm": 0.7603862860837758, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 25962 + }, + { + "epoch": 0.25963, + "grad_norm": 0.7799600352324344, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 25963 + }, + { + "epoch": 0.25964, + "grad_norm": 0.9075938870325732, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 25964 + }, + { + "epoch": 0.25965, + "grad_norm": 1.129640597125766, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 25965 + }, + { + "epoch": 0.25966, + "grad_norm": 1.2533015232673987, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 25966 + }, + { + "epoch": 0.25967, + "grad_norm": 0.7429426303594748, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 25967 + }, + { + "epoch": 0.25968, + "grad_norm": 0.6540424254883969, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 25968 + }, + { + "epoch": 0.25969, + "grad_norm": 0.647684024516191, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 25969 + }, + { + "epoch": 0.2597, + "grad_norm": 0.6269519995778846, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 25970 + }, + { + "epoch": 0.25971, + "grad_norm": 0.5695166804661257, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 25971 + }, + { + "epoch": 0.25972, + "grad_norm": 0.6019006743048572, + "learning_rate": 0.003, + "loss": 4.0003, + "step": 25972 + }, + { + "epoch": 0.25973, + "grad_norm": 0.6422169016009158, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 25973 + }, + { + "epoch": 0.25974, + "grad_norm": 0.739067701987218, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 25974 + }, + { + "epoch": 0.25975, + "grad_norm": 1.0781997776379828, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 25975 + }, + { + "epoch": 0.25976, + "grad_norm": 1.3148599174145454, + "learning_rate": 0.003, + "loss": 4.078, + "step": 25976 + }, + { + "epoch": 0.25977, + "grad_norm": 0.7925418266882831, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 25977 + }, + { + "epoch": 0.25978, + "grad_norm": 0.7807032651954832, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 25978 + }, + { + "epoch": 0.25979, + "grad_norm": 0.7000639361401566, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 25979 + }, + { + "epoch": 0.2598, + "grad_norm": 0.64633918234036, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 25980 + }, + { + "epoch": 0.25981, + "grad_norm": 0.7300634029945527, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 25981 + }, + { + "epoch": 0.25982, + "grad_norm": 0.7933567370638227, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 25982 + }, + { + "epoch": 0.25983, + "grad_norm": 0.8403981009703373, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 25983 + }, + { + "epoch": 0.25984, + "grad_norm": 0.8285355489614983, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 25984 + }, + { + "epoch": 0.25985, + "grad_norm": 0.7728346434405934, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 25985 + }, + { + "epoch": 0.25986, + "grad_norm": 0.8797368426171809, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 25986 + }, + { + "epoch": 0.25987, + "grad_norm": 1.3317383596723136, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 25987 + }, + { + "epoch": 0.25988, + "grad_norm": 0.7740636664422671, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 25988 + }, + { + "epoch": 0.25989, + "grad_norm": 0.6182379072113148, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 25989 + }, + { + "epoch": 0.2599, + "grad_norm": 0.712512953438655, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 25990 + }, + { + "epoch": 0.25991, + "grad_norm": 0.7915284614381275, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 25991 + }, + { + "epoch": 0.25992, + "grad_norm": 0.88120967077237, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 25992 + }, + { + "epoch": 0.25993, + "grad_norm": 1.013443159211642, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 25993 + }, + { + "epoch": 0.25994, + "grad_norm": 1.0538616180157208, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 25994 + }, + { + "epoch": 0.25995, + "grad_norm": 0.8287732721296354, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 25995 + }, + { + "epoch": 0.25996, + "grad_norm": 0.753926989635847, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 25996 + }, + { + "epoch": 0.25997, + "grad_norm": 0.700423687167093, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 25997 + }, + { + "epoch": 0.25998, + "grad_norm": 0.7266771733902084, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 25998 + }, + { + "epoch": 0.25999, + "grad_norm": 0.871108245073119, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 25999 + }, + { + "epoch": 0.26, + "grad_norm": 1.2009645309298072, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 26000 + }, + { + "epoch": 0.26001, + "grad_norm": 1.2097418553810149, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 26001 + }, + { + "epoch": 0.26002, + "grad_norm": 0.8352746471524711, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 26002 + }, + { + "epoch": 0.26003, + "grad_norm": 0.7338772790170216, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 26003 + }, + { + "epoch": 0.26004, + "grad_norm": 0.7757394096883197, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 26004 + }, + { + "epoch": 0.26005, + "grad_norm": 0.8829887573540036, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 26005 + }, + { + "epoch": 0.26006, + "grad_norm": 0.9329015266747533, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 26006 + }, + { + "epoch": 0.26007, + "grad_norm": 0.965185850757229, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 26007 + }, + { + "epoch": 0.26008, + "grad_norm": 1.0040216341454218, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 26008 + }, + { + "epoch": 0.26009, + "grad_norm": 0.9504447047903598, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 26009 + }, + { + "epoch": 0.2601, + "grad_norm": 0.8408871255891638, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26010 + }, + { + "epoch": 0.26011, + "grad_norm": 0.9130050950447212, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 26011 + }, + { + "epoch": 0.26012, + "grad_norm": 1.1612430050413889, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 26012 + }, + { + "epoch": 0.26013, + "grad_norm": 1.273213791480937, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 26013 + }, + { + "epoch": 0.26014, + "grad_norm": 0.7340124097636823, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26014 + }, + { + "epoch": 0.26015, + "grad_norm": 0.697058503789482, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 26015 + }, + { + "epoch": 0.26016, + "grad_norm": 0.7251512152129509, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 26016 + }, + { + "epoch": 0.26017, + "grad_norm": 0.8297913356683271, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 26017 + }, + { + "epoch": 0.26018, + "grad_norm": 0.8372902861293796, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26018 + }, + { + "epoch": 0.26019, + "grad_norm": 0.7562682980642035, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 26019 + }, + { + "epoch": 0.2602, + "grad_norm": 0.8254067989960779, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 26020 + }, + { + "epoch": 0.26021, + "grad_norm": 0.9526137880010397, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 26021 + }, + { + "epoch": 0.26022, + "grad_norm": 0.9736319958034201, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 26022 + }, + { + "epoch": 0.26023, + "grad_norm": 0.9702953713394484, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 26023 + }, + { + "epoch": 0.26024, + "grad_norm": 1.098826588956019, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 26024 + }, + { + "epoch": 0.26025, + "grad_norm": 0.8332824081363539, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 26025 + }, + { + "epoch": 0.26026, + "grad_norm": 0.843185822311754, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 26026 + }, + { + "epoch": 0.26027, + "grad_norm": 1.0066888019556928, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 26027 + }, + { + "epoch": 0.26028, + "grad_norm": 1.2365332437837289, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 26028 + }, + { + "epoch": 0.26029, + "grad_norm": 0.8474961503811502, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 26029 + }, + { + "epoch": 0.2603, + "grad_norm": 0.8377934579253402, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 26030 + }, + { + "epoch": 0.26031, + "grad_norm": 0.9188469687326098, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 26031 + }, + { + "epoch": 0.26032, + "grad_norm": 0.9687397684167955, + "learning_rate": 0.003, + "loss": 4.073, + "step": 26032 + }, + { + "epoch": 0.26033, + "grad_norm": 1.0865205811001855, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 26033 + }, + { + "epoch": 0.26034, + "grad_norm": 0.9555354968937448, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 26034 + }, + { + "epoch": 0.26035, + "grad_norm": 0.9097132353144967, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 26035 + }, + { + "epoch": 0.26036, + "grad_norm": 0.9316146616774199, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 26036 + }, + { + "epoch": 0.26037, + "grad_norm": 0.926009909966239, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 26037 + }, + { + "epoch": 0.26038, + "grad_norm": 0.9256877583114714, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 26038 + }, + { + "epoch": 0.26039, + "grad_norm": 0.9784970497027995, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 26039 + }, + { + "epoch": 0.2604, + "grad_norm": 1.0993479034786777, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 26040 + }, + { + "epoch": 0.26041, + "grad_norm": 0.985593785836954, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 26041 + }, + { + "epoch": 0.26042, + "grad_norm": 1.0330897354353592, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26042 + }, + { + "epoch": 0.26043, + "grad_norm": 0.8675892979754002, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 26043 + }, + { + "epoch": 0.26044, + "grad_norm": 0.8615616200991423, + "learning_rate": 0.003, + "loss": 4.081, + "step": 26044 + }, + { + "epoch": 0.26045, + "grad_norm": 0.835683431740189, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 26045 + }, + { + "epoch": 0.26046, + "grad_norm": 1.036041550582916, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 26046 + }, + { + "epoch": 0.26047, + "grad_norm": 1.1675796906912497, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 26047 + }, + { + "epoch": 0.26048, + "grad_norm": 0.8528364773244653, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26048 + }, + { + "epoch": 0.26049, + "grad_norm": 0.6994713501558297, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 26049 + }, + { + "epoch": 0.2605, + "grad_norm": 0.77891095079453, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 26050 + }, + { + "epoch": 0.26051, + "grad_norm": 0.7596221570654217, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26051 + }, + { + "epoch": 0.26052, + "grad_norm": 0.8673844761868408, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 26052 + }, + { + "epoch": 0.26053, + "grad_norm": 0.962025463777746, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26053 + }, + { + "epoch": 0.26054, + "grad_norm": 0.9701191897599432, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 26054 + }, + { + "epoch": 0.26055, + "grad_norm": 1.187393061210973, + "learning_rate": 0.003, + "loss": 4.072, + "step": 26055 + }, + { + "epoch": 0.26056, + "grad_norm": 1.003114948348615, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 26056 + }, + { + "epoch": 0.26057, + "grad_norm": 0.8812584057119388, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 26057 + }, + { + "epoch": 0.26058, + "grad_norm": 0.9409559852394075, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 26058 + }, + { + "epoch": 0.26059, + "grad_norm": 1.0709931655234577, + "learning_rate": 0.003, + "loss": 4.0935, + "step": 26059 + }, + { + "epoch": 0.2606, + "grad_norm": 0.9630774659088162, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 26060 + }, + { + "epoch": 0.26061, + "grad_norm": 1.016432703663005, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 26061 + }, + { + "epoch": 0.26062, + "grad_norm": 1.0353506780361699, + "learning_rate": 0.003, + "loss": 4.022, + "step": 26062 + }, + { + "epoch": 0.26063, + "grad_norm": 0.8411446179884322, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 26063 + }, + { + "epoch": 0.26064, + "grad_norm": 0.7113642924166592, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 26064 + }, + { + "epoch": 0.26065, + "grad_norm": 0.5496002794049029, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 26065 + }, + { + "epoch": 0.26066, + "grad_norm": 0.5486693501352954, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 26066 + }, + { + "epoch": 0.26067, + "grad_norm": 0.5370418031320295, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 26067 + }, + { + "epoch": 0.26068, + "grad_norm": 0.46506180992826696, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 26068 + }, + { + "epoch": 0.26069, + "grad_norm": 0.4760793267323303, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 26069 + }, + { + "epoch": 0.2607, + "grad_norm": 0.5179597923720156, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 26070 + }, + { + "epoch": 0.26071, + "grad_norm": 0.5236738804807307, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26071 + }, + { + "epoch": 0.26072, + "grad_norm": 0.5466667465663434, + "learning_rate": 0.003, + "loss": 3.9953, + "step": 26072 + }, + { + "epoch": 0.26073, + "grad_norm": 0.6466389647249953, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 26073 + }, + { + "epoch": 0.26074, + "grad_norm": 0.7534005417879434, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 26074 + }, + { + "epoch": 0.26075, + "grad_norm": 0.8601144925728131, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 26075 + }, + { + "epoch": 0.26076, + "grad_norm": 0.9531081274450106, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 26076 + }, + { + "epoch": 0.26077, + "grad_norm": 0.7739487192822412, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 26077 + }, + { + "epoch": 0.26078, + "grad_norm": 0.7751696289825718, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 26078 + }, + { + "epoch": 0.26079, + "grad_norm": 0.9032741244597159, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 26079 + }, + { + "epoch": 0.2608, + "grad_norm": 1.1918145156003386, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 26080 + }, + { + "epoch": 0.26081, + "grad_norm": 1.0750482025945935, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 26081 + }, + { + "epoch": 0.26082, + "grad_norm": 0.9504724947176573, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 26082 + }, + { + "epoch": 0.26083, + "grad_norm": 0.9287246561891422, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 26083 + }, + { + "epoch": 0.26084, + "grad_norm": 0.9045879816655447, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 26084 + }, + { + "epoch": 0.26085, + "grad_norm": 0.9018666372160579, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 26085 + }, + { + "epoch": 0.26086, + "grad_norm": 0.795614001086891, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 26086 + }, + { + "epoch": 0.26087, + "grad_norm": 0.8571378805568793, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 26087 + }, + { + "epoch": 0.26088, + "grad_norm": 0.9868071123234646, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 26088 + }, + { + "epoch": 0.26089, + "grad_norm": 1.0702366253554527, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 26089 + }, + { + "epoch": 0.2609, + "grad_norm": 0.8855751449590396, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 26090 + }, + { + "epoch": 0.26091, + "grad_norm": 0.8961938798809969, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26091 + }, + { + "epoch": 0.26092, + "grad_norm": 0.8155684210360971, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 26092 + }, + { + "epoch": 0.26093, + "grad_norm": 0.8211536793960583, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 26093 + }, + { + "epoch": 0.26094, + "grad_norm": 0.7358550554289822, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 26094 + }, + { + "epoch": 0.26095, + "grad_norm": 0.6850162783000483, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 26095 + }, + { + "epoch": 0.26096, + "grad_norm": 0.8040050321004595, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 26096 + }, + { + "epoch": 0.26097, + "grad_norm": 1.0008675953964583, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 26097 + }, + { + "epoch": 0.26098, + "grad_norm": 1.2297217774394595, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 26098 + }, + { + "epoch": 0.26099, + "grad_norm": 0.9678521083940245, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 26099 + }, + { + "epoch": 0.261, + "grad_norm": 1.0434990907975106, + "learning_rate": 0.003, + "loss": 4.042, + "step": 26100 + }, + { + "epoch": 0.26101, + "grad_norm": 1.0267214274996164, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 26101 + }, + { + "epoch": 0.26102, + "grad_norm": 0.9678018786810181, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 26102 + }, + { + "epoch": 0.26103, + "grad_norm": 0.9302899086120066, + "learning_rate": 0.003, + "loss": 4.083, + "step": 26103 + }, + { + "epoch": 0.26104, + "grad_norm": 0.8949080051980559, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 26104 + }, + { + "epoch": 0.26105, + "grad_norm": 0.8318899747996377, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 26105 + }, + { + "epoch": 0.26106, + "grad_norm": 0.8818053063582686, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 26106 + }, + { + "epoch": 0.26107, + "grad_norm": 0.8848169930994063, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 26107 + }, + { + "epoch": 0.26108, + "grad_norm": 0.9860821520833363, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 26108 + }, + { + "epoch": 0.26109, + "grad_norm": 1.0504841548627208, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 26109 + }, + { + "epoch": 0.2611, + "grad_norm": 1.0465797551518605, + "learning_rate": 0.003, + "loss": 4.0954, + "step": 26110 + }, + { + "epoch": 0.26111, + "grad_norm": 1.0489188083218413, + "learning_rate": 0.003, + "loss": 4.036, + "step": 26111 + }, + { + "epoch": 0.26112, + "grad_norm": 1.028654500705115, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26112 + }, + { + "epoch": 0.26113, + "grad_norm": 1.122170536375792, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26113 + }, + { + "epoch": 0.26114, + "grad_norm": 0.7577094639098627, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 26114 + }, + { + "epoch": 0.26115, + "grad_norm": 0.7327647971810204, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 26115 + }, + { + "epoch": 0.26116, + "grad_norm": 0.6981746510064534, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 26116 + }, + { + "epoch": 0.26117, + "grad_norm": 0.7053277219712358, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 26117 + }, + { + "epoch": 0.26118, + "grad_norm": 0.6520060481545832, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 26118 + }, + { + "epoch": 0.26119, + "grad_norm": 0.6403097855909565, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 26119 + }, + { + "epoch": 0.2612, + "grad_norm": 0.6293790481219621, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26120 + }, + { + "epoch": 0.26121, + "grad_norm": 0.6739906758605874, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 26121 + }, + { + "epoch": 0.26122, + "grad_norm": 0.6769967545240873, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 26122 + }, + { + "epoch": 0.26123, + "grad_norm": 0.6271804544581302, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 26123 + }, + { + "epoch": 0.26124, + "grad_norm": 0.6248084816110033, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 26124 + }, + { + "epoch": 0.26125, + "grad_norm": 0.7015609503084236, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 26125 + }, + { + "epoch": 0.26126, + "grad_norm": 0.7173271461637023, + "learning_rate": 0.003, + "loss": 4.038, + "step": 26126 + }, + { + "epoch": 0.26127, + "grad_norm": 0.76880893259204, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 26127 + }, + { + "epoch": 0.26128, + "grad_norm": 0.8546971782805463, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 26128 + }, + { + "epoch": 0.26129, + "grad_norm": 0.9262169223679229, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 26129 + }, + { + "epoch": 0.2613, + "grad_norm": 1.209562202995274, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 26130 + }, + { + "epoch": 0.26131, + "grad_norm": 0.9837295238600589, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 26131 + }, + { + "epoch": 0.26132, + "grad_norm": 1.2164612596155162, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26132 + }, + { + "epoch": 0.26133, + "grad_norm": 0.9052805546299899, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 26133 + }, + { + "epoch": 0.26134, + "grad_norm": 0.7958188646132287, + "learning_rate": 0.003, + "loss": 4.049, + "step": 26134 + }, + { + "epoch": 0.26135, + "grad_norm": 0.7228169526352054, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 26135 + }, + { + "epoch": 0.26136, + "grad_norm": 0.822663743890147, + "learning_rate": 0.003, + "loss": 4.044, + "step": 26136 + }, + { + "epoch": 0.26137, + "grad_norm": 0.8900026489173738, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 26137 + }, + { + "epoch": 0.26138, + "grad_norm": 0.9498592760010331, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 26138 + }, + { + "epoch": 0.26139, + "grad_norm": 1.0160936820117044, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 26139 + }, + { + "epoch": 0.2614, + "grad_norm": 1.0441574991131934, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 26140 + }, + { + "epoch": 0.26141, + "grad_norm": 1.0778389621924822, + "learning_rate": 0.003, + "loss": 4.0881, + "step": 26141 + }, + { + "epoch": 0.26142, + "grad_norm": 0.9421381994374114, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 26142 + }, + { + "epoch": 0.26143, + "grad_norm": 0.8478442866010983, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 26143 + }, + { + "epoch": 0.26144, + "grad_norm": 0.9981132603959519, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 26144 + }, + { + "epoch": 0.26145, + "grad_norm": 1.4113166116982694, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 26145 + }, + { + "epoch": 0.26146, + "grad_norm": 0.7299746261314736, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 26146 + }, + { + "epoch": 0.26147, + "grad_norm": 0.7337085940973237, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 26147 + }, + { + "epoch": 0.26148, + "grad_norm": 0.7967423526101783, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26148 + }, + { + "epoch": 0.26149, + "grad_norm": 0.9571080840385005, + "learning_rate": 0.003, + "loss": 4.0877, + "step": 26149 + }, + { + "epoch": 0.2615, + "grad_norm": 1.2442136447377572, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 26150 + }, + { + "epoch": 0.26151, + "grad_norm": 0.7451181392919868, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 26151 + }, + { + "epoch": 0.26152, + "grad_norm": 0.7971577003875165, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 26152 + }, + { + "epoch": 0.26153, + "grad_norm": 0.9217824954397043, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 26153 + }, + { + "epoch": 0.26154, + "grad_norm": 1.0535173348144642, + "learning_rate": 0.003, + "loss": 4.056, + "step": 26154 + }, + { + "epoch": 0.26155, + "grad_norm": 0.9875766000485343, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 26155 + }, + { + "epoch": 0.26156, + "grad_norm": 0.870727216162887, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 26156 + }, + { + "epoch": 0.26157, + "grad_norm": 0.8974823005065079, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 26157 + }, + { + "epoch": 0.26158, + "grad_norm": 0.8342047578780025, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 26158 + }, + { + "epoch": 0.26159, + "grad_norm": 0.871464791418511, + "learning_rate": 0.003, + "loss": 4.1044, + "step": 26159 + }, + { + "epoch": 0.2616, + "grad_norm": 0.9328411279255361, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 26160 + }, + { + "epoch": 0.26161, + "grad_norm": 0.9795277450175764, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 26161 + }, + { + "epoch": 0.26162, + "grad_norm": 1.0699961638310336, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 26162 + }, + { + "epoch": 0.26163, + "grad_norm": 1.1135678867890897, + "learning_rate": 0.003, + "loss": 4.049, + "step": 26163 + }, + { + "epoch": 0.26164, + "grad_norm": 0.9913040261567545, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26164 + }, + { + "epoch": 0.26165, + "grad_norm": 1.0709051919317556, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 26165 + }, + { + "epoch": 0.26166, + "grad_norm": 0.9786837065347199, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26166 + }, + { + "epoch": 0.26167, + "grad_norm": 0.7946176374456817, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 26167 + }, + { + "epoch": 0.26168, + "grad_norm": 0.7487414672438271, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 26168 + }, + { + "epoch": 0.26169, + "grad_norm": 0.622999121167594, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 26169 + }, + { + "epoch": 0.2617, + "grad_norm": 0.6523425293751464, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 26170 + }, + { + "epoch": 0.26171, + "grad_norm": 0.6278395896515819, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 26171 + }, + { + "epoch": 0.26172, + "grad_norm": 0.6654667915237673, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 26172 + }, + { + "epoch": 0.26173, + "grad_norm": 0.6911176692910286, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 26173 + }, + { + "epoch": 0.26174, + "grad_norm": 0.7927240238585901, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 26174 + }, + { + "epoch": 0.26175, + "grad_norm": 0.8665871166802704, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 26175 + }, + { + "epoch": 0.26176, + "grad_norm": 0.8554582668413623, + "learning_rate": 0.003, + "loss": 4.055, + "step": 26176 + }, + { + "epoch": 0.26177, + "grad_norm": 0.8548705452997132, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 26177 + }, + { + "epoch": 0.26178, + "grad_norm": 0.811262256766966, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 26178 + }, + { + "epoch": 0.26179, + "grad_norm": 0.8672019491786611, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 26179 + }, + { + "epoch": 0.2618, + "grad_norm": 1.0226958696872295, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 26180 + }, + { + "epoch": 0.26181, + "grad_norm": 1.1288692742829252, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26181 + }, + { + "epoch": 0.26182, + "grad_norm": 0.8351683694338491, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 26182 + }, + { + "epoch": 0.26183, + "grad_norm": 0.8036886678071687, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 26183 + }, + { + "epoch": 0.26184, + "grad_norm": 0.7596885242566551, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 26184 + }, + { + "epoch": 0.26185, + "grad_norm": 0.7761230407309635, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 26185 + }, + { + "epoch": 0.26186, + "grad_norm": 0.8463799400896802, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 26186 + }, + { + "epoch": 0.26187, + "grad_norm": 0.8452550112625654, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 26187 + }, + { + "epoch": 0.26188, + "grad_norm": 0.8819258247251434, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 26188 + }, + { + "epoch": 0.26189, + "grad_norm": 1.0704953500861512, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 26189 + }, + { + "epoch": 0.2619, + "grad_norm": 1.2149785995897582, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 26190 + }, + { + "epoch": 0.26191, + "grad_norm": 0.7742188476812769, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 26191 + }, + { + "epoch": 0.26192, + "grad_norm": 0.7510838877588842, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26192 + }, + { + "epoch": 0.26193, + "grad_norm": 0.8361943717285526, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 26193 + }, + { + "epoch": 0.26194, + "grad_norm": 0.9338079918428045, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 26194 + }, + { + "epoch": 0.26195, + "grad_norm": 0.8596427759634822, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 26195 + }, + { + "epoch": 0.26196, + "grad_norm": 0.9016987025684433, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 26196 + }, + { + "epoch": 0.26197, + "grad_norm": 0.967231069974892, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 26197 + }, + { + "epoch": 0.26198, + "grad_norm": 0.9288960270670875, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 26198 + }, + { + "epoch": 0.26199, + "grad_norm": 1.0255603408997926, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 26199 + }, + { + "epoch": 0.262, + "grad_norm": 1.0640304321704845, + "learning_rate": 0.003, + "loss": 4.081, + "step": 26200 + }, + { + "epoch": 0.26201, + "grad_norm": 0.8324910995753343, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 26201 + }, + { + "epoch": 0.26202, + "grad_norm": 0.893605807412069, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 26202 + }, + { + "epoch": 0.26203, + "grad_norm": 1.05870122480406, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 26203 + }, + { + "epoch": 0.26204, + "grad_norm": 0.9771788798866383, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 26204 + }, + { + "epoch": 0.26205, + "grad_norm": 0.8401702345381304, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 26205 + }, + { + "epoch": 0.26206, + "grad_norm": 0.710617038260591, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 26206 + }, + { + "epoch": 0.26207, + "grad_norm": 0.6238455966692761, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 26207 + }, + { + "epoch": 0.26208, + "grad_norm": 0.6600554828945892, + "learning_rate": 0.003, + "loss": 4.038, + "step": 26208 + }, + { + "epoch": 0.26209, + "grad_norm": 0.7162131750535364, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 26209 + }, + { + "epoch": 0.2621, + "grad_norm": 0.69481936448279, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 26210 + }, + { + "epoch": 0.26211, + "grad_norm": 0.6786574098826635, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 26211 + }, + { + "epoch": 0.26212, + "grad_norm": 0.7168254288215564, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 26212 + }, + { + "epoch": 0.26213, + "grad_norm": 0.6548952978420925, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 26213 + }, + { + "epoch": 0.26214, + "grad_norm": 0.6887107235912903, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 26214 + }, + { + "epoch": 0.26215, + "grad_norm": 0.7664994610839558, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 26215 + }, + { + "epoch": 0.26216, + "grad_norm": 0.8945874303937961, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 26216 + }, + { + "epoch": 0.26217, + "grad_norm": 1.1132401905276665, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 26217 + }, + { + "epoch": 0.26218, + "grad_norm": 1.0629038939921511, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 26218 + }, + { + "epoch": 0.26219, + "grad_norm": 1.0615366999980875, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26219 + }, + { + "epoch": 0.2622, + "grad_norm": 1.008818814954345, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 26220 + }, + { + "epoch": 0.26221, + "grad_norm": 0.9650317343948729, + "learning_rate": 0.003, + "loss": 4.063, + "step": 26221 + }, + { + "epoch": 0.26222, + "grad_norm": 1.1103801658025791, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 26222 + }, + { + "epoch": 0.26223, + "grad_norm": 1.0145793555295273, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 26223 + }, + { + "epoch": 0.26224, + "grad_norm": 0.9732576958500327, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 26224 + }, + { + "epoch": 0.26225, + "grad_norm": 0.9401817363836049, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 26225 + }, + { + "epoch": 0.26226, + "grad_norm": 0.8462411326042419, + "learning_rate": 0.003, + "loss": 4.072, + "step": 26226 + }, + { + "epoch": 0.26227, + "grad_norm": 0.8273540810131589, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 26227 + }, + { + "epoch": 0.26228, + "grad_norm": 0.7856260091465705, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 26228 + }, + { + "epoch": 0.26229, + "grad_norm": 0.8135587875147215, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 26229 + }, + { + "epoch": 0.2623, + "grad_norm": 1.0124533895174466, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26230 + }, + { + "epoch": 0.26231, + "grad_norm": 1.3424260946475752, + "learning_rate": 0.003, + "loss": 4.081, + "step": 26231 + }, + { + "epoch": 0.26232, + "grad_norm": 0.8872451637430662, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 26232 + }, + { + "epoch": 0.26233, + "grad_norm": 0.7754398705892791, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26233 + }, + { + "epoch": 0.26234, + "grad_norm": 0.6529116539139486, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 26234 + }, + { + "epoch": 0.26235, + "grad_norm": 0.6422858716029861, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 26235 + }, + { + "epoch": 0.26236, + "grad_norm": 0.6175926059224309, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 26236 + }, + { + "epoch": 0.26237, + "grad_norm": 0.6940176293482264, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 26237 + }, + { + "epoch": 0.26238, + "grad_norm": 0.7484724898267537, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 26238 + }, + { + "epoch": 0.26239, + "grad_norm": 0.7748681911845664, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 26239 + }, + { + "epoch": 0.2624, + "grad_norm": 0.8185752623992257, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 26240 + }, + { + "epoch": 0.26241, + "grad_norm": 0.7735752237594188, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 26241 + }, + { + "epoch": 0.26242, + "grad_norm": 0.7142137024420664, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 26242 + }, + { + "epoch": 0.26243, + "grad_norm": 0.6691683193115119, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 26243 + }, + { + "epoch": 0.26244, + "grad_norm": 0.7120853425076942, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 26244 + }, + { + "epoch": 0.26245, + "grad_norm": 0.7004128980488831, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 26245 + }, + { + "epoch": 0.26246, + "grad_norm": 0.7831553544282576, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 26246 + }, + { + "epoch": 0.26247, + "grad_norm": 1.1322478964914258, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 26247 + }, + { + "epoch": 0.26248, + "grad_norm": 1.1843122835277173, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 26248 + }, + { + "epoch": 0.26249, + "grad_norm": 0.8871184361582932, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 26249 + }, + { + "epoch": 0.2625, + "grad_norm": 0.8031743251717228, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 26250 + }, + { + "epoch": 0.26251, + "grad_norm": 0.7044472498809049, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 26251 + }, + { + "epoch": 0.26252, + "grad_norm": 0.7487721964501101, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 26252 + }, + { + "epoch": 0.26253, + "grad_norm": 0.8621985479181946, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 26253 + }, + { + "epoch": 0.26254, + "grad_norm": 1.061993957823432, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 26254 + }, + { + "epoch": 0.26255, + "grad_norm": 1.1407790435576035, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 26255 + }, + { + "epoch": 0.26256, + "grad_norm": 0.8616823012900878, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 26256 + }, + { + "epoch": 0.26257, + "grad_norm": 0.9025329451937258, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 26257 + }, + { + "epoch": 0.26258, + "grad_norm": 1.0079900688881447, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 26258 + }, + { + "epoch": 0.26259, + "grad_norm": 1.1295140187489214, + "learning_rate": 0.003, + "loss": 4.04, + "step": 26259 + }, + { + "epoch": 0.2626, + "grad_norm": 0.790986819904, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 26260 + }, + { + "epoch": 0.26261, + "grad_norm": 0.7521691366434736, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26261 + }, + { + "epoch": 0.26262, + "grad_norm": 0.7301074122856195, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 26262 + }, + { + "epoch": 0.26263, + "grad_norm": 0.7896891443906977, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 26263 + }, + { + "epoch": 0.26264, + "grad_norm": 0.9550362655461249, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 26264 + }, + { + "epoch": 0.26265, + "grad_norm": 1.197437017414975, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26265 + }, + { + "epoch": 0.26266, + "grad_norm": 1.207926531338263, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 26266 + }, + { + "epoch": 0.26267, + "grad_norm": 0.9167232558741086, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 26267 + }, + { + "epoch": 0.26268, + "grad_norm": 0.9163525081509245, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 26268 + }, + { + "epoch": 0.26269, + "grad_norm": 1.0679378402947535, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 26269 + }, + { + "epoch": 0.2627, + "grad_norm": 0.9775004088351289, + "learning_rate": 0.003, + "loss": 4.041, + "step": 26270 + }, + { + "epoch": 0.26271, + "grad_norm": 0.8798112366115134, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 26271 + }, + { + "epoch": 0.26272, + "grad_norm": 0.8468671773409967, + "learning_rate": 0.003, + "loss": 4.048, + "step": 26272 + }, + { + "epoch": 0.26273, + "grad_norm": 0.8489561677812455, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 26273 + }, + { + "epoch": 0.26274, + "grad_norm": 0.8134026922098382, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 26274 + }, + { + "epoch": 0.26275, + "grad_norm": 0.920998239063929, + "learning_rate": 0.003, + "loss": 4.017, + "step": 26275 + }, + { + "epoch": 0.26276, + "grad_norm": 0.9647789386408926, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 26276 + }, + { + "epoch": 0.26277, + "grad_norm": 0.9093502748505667, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 26277 + }, + { + "epoch": 0.26278, + "grad_norm": 0.8114768874614045, + "learning_rate": 0.003, + "loss": 4.01, + "step": 26278 + }, + { + "epoch": 0.26279, + "grad_norm": 0.7694505761861851, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 26279 + }, + { + "epoch": 0.2628, + "grad_norm": 0.8628714682379884, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 26280 + }, + { + "epoch": 0.26281, + "grad_norm": 0.7811901650739501, + "learning_rate": 0.003, + "loss": 4.043, + "step": 26281 + }, + { + "epoch": 0.26282, + "grad_norm": 0.9238183684618442, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 26282 + }, + { + "epoch": 0.26283, + "grad_norm": 1.0915570172819549, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 26283 + }, + { + "epoch": 0.26284, + "grad_norm": 1.212763631289959, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 26284 + }, + { + "epoch": 0.26285, + "grad_norm": 0.74442577299926, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 26285 + }, + { + "epoch": 0.26286, + "grad_norm": 0.6725979852285053, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 26286 + }, + { + "epoch": 0.26287, + "grad_norm": 0.6577899737244505, + "learning_rate": 0.003, + "loss": 4.0911, + "step": 26287 + }, + { + "epoch": 0.26288, + "grad_norm": 0.6457362773646927, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26288 + }, + { + "epoch": 0.26289, + "grad_norm": 0.8286101723706436, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 26289 + }, + { + "epoch": 0.2629, + "grad_norm": 0.8679813901989959, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 26290 + }, + { + "epoch": 0.26291, + "grad_norm": 0.7165778630461782, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 26291 + }, + { + "epoch": 0.26292, + "grad_norm": 0.6984354100131633, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 26292 + }, + { + "epoch": 0.26293, + "grad_norm": 0.8256529659186522, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 26293 + }, + { + "epoch": 0.26294, + "grad_norm": 1.0901196600608516, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 26294 + }, + { + "epoch": 0.26295, + "grad_norm": 1.2113541274854491, + "learning_rate": 0.003, + "loss": 4.047, + "step": 26295 + }, + { + "epoch": 0.26296, + "grad_norm": 0.7920236954832262, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 26296 + }, + { + "epoch": 0.26297, + "grad_norm": 0.6582118160018114, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 26297 + }, + { + "epoch": 0.26298, + "grad_norm": 0.5970007620304251, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 26298 + }, + { + "epoch": 0.26299, + "grad_norm": 0.6278891863428021, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 26299 + }, + { + "epoch": 0.263, + "grad_norm": 0.6279120283927472, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 26300 + }, + { + "epoch": 0.26301, + "grad_norm": 0.6091578889114159, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26301 + }, + { + "epoch": 0.26302, + "grad_norm": 0.6728227960727188, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 26302 + }, + { + "epoch": 0.26303, + "grad_norm": 0.7225328872988693, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 26303 + }, + { + "epoch": 0.26304, + "grad_norm": 0.7150045251330915, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26304 + }, + { + "epoch": 0.26305, + "grad_norm": 0.8965134883394871, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 26305 + }, + { + "epoch": 0.26306, + "grad_norm": 1.2208677908935968, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 26306 + }, + { + "epoch": 0.26307, + "grad_norm": 0.8682731274917596, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 26307 + }, + { + "epoch": 0.26308, + "grad_norm": 0.8922736464182072, + "learning_rate": 0.003, + "loss": 4.059, + "step": 26308 + }, + { + "epoch": 0.26309, + "grad_norm": 1.0760558985185473, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 26309 + }, + { + "epoch": 0.2631, + "grad_norm": 1.0676096075131747, + "learning_rate": 0.003, + "loss": 4.032, + "step": 26310 + }, + { + "epoch": 0.26311, + "grad_norm": 1.100043151002186, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 26311 + }, + { + "epoch": 0.26312, + "grad_norm": 1.0106332974486276, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 26312 + }, + { + "epoch": 0.26313, + "grad_norm": 1.0293597174838511, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26313 + }, + { + "epoch": 0.26314, + "grad_norm": 0.9618948467981816, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 26314 + }, + { + "epoch": 0.26315, + "grad_norm": 1.0351998663337603, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 26315 + }, + { + "epoch": 0.26316, + "grad_norm": 0.9432089852548535, + "learning_rate": 0.003, + "loss": 4.061, + "step": 26316 + }, + { + "epoch": 0.26317, + "grad_norm": 0.9961526705061539, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 26317 + }, + { + "epoch": 0.26318, + "grad_norm": 1.1133722203640881, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 26318 + }, + { + "epoch": 0.26319, + "grad_norm": 0.7571019654191382, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26319 + }, + { + "epoch": 0.2632, + "grad_norm": 0.768280436814758, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 26320 + }, + { + "epoch": 0.26321, + "grad_norm": 0.7590249305878454, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 26321 + }, + { + "epoch": 0.26322, + "grad_norm": 0.8143382745718001, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 26322 + }, + { + "epoch": 0.26323, + "grad_norm": 0.8721505023259459, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 26323 + }, + { + "epoch": 0.26324, + "grad_norm": 0.7406563134746335, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 26324 + }, + { + "epoch": 0.26325, + "grad_norm": 0.6945285002428451, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 26325 + }, + { + "epoch": 0.26326, + "grad_norm": 0.636047233918367, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 26326 + }, + { + "epoch": 0.26327, + "grad_norm": 0.654385451886071, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 26327 + }, + { + "epoch": 0.26328, + "grad_norm": 0.7383077330479317, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 26328 + }, + { + "epoch": 0.26329, + "grad_norm": 0.9451589602728051, + "learning_rate": 0.003, + "loss": 4.031, + "step": 26329 + }, + { + "epoch": 0.2633, + "grad_norm": 1.0790389165242444, + "learning_rate": 0.003, + "loss": 4.059, + "step": 26330 + }, + { + "epoch": 0.26331, + "grad_norm": 1.0465190465778913, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26331 + }, + { + "epoch": 0.26332, + "grad_norm": 0.9378987592958155, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 26332 + }, + { + "epoch": 0.26333, + "grad_norm": 0.9786756403401775, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 26333 + }, + { + "epoch": 0.26334, + "grad_norm": 1.0588949194447288, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 26334 + }, + { + "epoch": 0.26335, + "grad_norm": 0.8342044133140121, + "learning_rate": 0.003, + "loss": 4.058, + "step": 26335 + }, + { + "epoch": 0.26336, + "grad_norm": 0.8507909769948382, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 26336 + }, + { + "epoch": 0.26337, + "grad_norm": 0.8209158519816615, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 26337 + }, + { + "epoch": 0.26338, + "grad_norm": 0.770291250093039, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 26338 + }, + { + "epoch": 0.26339, + "grad_norm": 0.9089123283005075, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 26339 + }, + { + "epoch": 0.2634, + "grad_norm": 1.1997642238070532, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 26340 + }, + { + "epoch": 0.26341, + "grad_norm": 0.9267601696290195, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 26341 + }, + { + "epoch": 0.26342, + "grad_norm": 0.9594968756001173, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26342 + }, + { + "epoch": 0.26343, + "grad_norm": 1.0390175574294294, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 26343 + }, + { + "epoch": 0.26344, + "grad_norm": 1.1079896743378188, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 26344 + }, + { + "epoch": 0.26345, + "grad_norm": 0.8730707308834204, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 26345 + }, + { + "epoch": 0.26346, + "grad_norm": 0.8313016534289472, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 26346 + }, + { + "epoch": 0.26347, + "grad_norm": 0.8231652965647966, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 26347 + }, + { + "epoch": 0.26348, + "grad_norm": 0.8591856289309846, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 26348 + }, + { + "epoch": 0.26349, + "grad_norm": 0.8266083093254825, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26349 + }, + { + "epoch": 0.2635, + "grad_norm": 0.9466743690274186, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26350 + }, + { + "epoch": 0.26351, + "grad_norm": 1.0046860483149092, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 26351 + }, + { + "epoch": 0.26352, + "grad_norm": 1.1949230200591339, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 26352 + }, + { + "epoch": 0.26353, + "grad_norm": 0.8390139849462757, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 26353 + }, + { + "epoch": 0.26354, + "grad_norm": 0.7202718760362946, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 26354 + }, + { + "epoch": 0.26355, + "grad_norm": 0.8649998998430727, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26355 + }, + { + "epoch": 0.26356, + "grad_norm": 0.8705055911584321, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 26356 + }, + { + "epoch": 0.26357, + "grad_norm": 0.8534090092226855, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 26357 + }, + { + "epoch": 0.26358, + "grad_norm": 0.9313325043444313, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 26358 + }, + { + "epoch": 0.26359, + "grad_norm": 1.1250027959631315, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 26359 + }, + { + "epoch": 0.2636, + "grad_norm": 1.0470279151080315, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 26360 + }, + { + "epoch": 0.26361, + "grad_norm": 1.02541984005741, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 26361 + }, + { + "epoch": 0.26362, + "grad_norm": 0.9602641336378365, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 26362 + }, + { + "epoch": 0.26363, + "grad_norm": 0.9364129022590112, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 26363 + }, + { + "epoch": 0.26364, + "grad_norm": 0.9307252377674574, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 26364 + }, + { + "epoch": 0.26365, + "grad_norm": 0.920306162620304, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 26365 + }, + { + "epoch": 0.26366, + "grad_norm": 0.9154178405861482, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 26366 + }, + { + "epoch": 0.26367, + "grad_norm": 0.937245208004888, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 26367 + }, + { + "epoch": 0.26368, + "grad_norm": 0.8398934953416446, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 26368 + }, + { + "epoch": 0.26369, + "grad_norm": 0.8173147407236, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26369 + }, + { + "epoch": 0.2637, + "grad_norm": 0.7058333892613555, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 26370 + }, + { + "epoch": 0.26371, + "grad_norm": 0.6329275884277615, + "learning_rate": 0.003, + "loss": 4.037, + "step": 26371 + }, + { + "epoch": 0.26372, + "grad_norm": 0.728975976734911, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 26372 + }, + { + "epoch": 0.26373, + "grad_norm": 0.815344001364777, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26373 + }, + { + "epoch": 0.26374, + "grad_norm": 0.9109008919062535, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 26374 + }, + { + "epoch": 0.26375, + "grad_norm": 0.8803127956664503, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 26375 + }, + { + "epoch": 0.26376, + "grad_norm": 0.9407807448634652, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26376 + }, + { + "epoch": 0.26377, + "grad_norm": 1.0657081824778805, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26377 + }, + { + "epoch": 0.26378, + "grad_norm": 1.0181504109402226, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26378 + }, + { + "epoch": 0.26379, + "grad_norm": 0.9214829771083111, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 26379 + }, + { + "epoch": 0.2638, + "grad_norm": 0.9342647661995772, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 26380 + }, + { + "epoch": 0.26381, + "grad_norm": 1.0947123414395898, + "learning_rate": 0.003, + "loss": 4.048, + "step": 26381 + }, + { + "epoch": 0.26382, + "grad_norm": 1.0246836464745697, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 26382 + }, + { + "epoch": 0.26383, + "grad_norm": 0.9703983020976162, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 26383 + }, + { + "epoch": 0.26384, + "grad_norm": 1.0284449878057695, + "learning_rate": 0.003, + "loss": 4.063, + "step": 26384 + }, + { + "epoch": 0.26385, + "grad_norm": 0.8717133495388695, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 26385 + }, + { + "epoch": 0.26386, + "grad_norm": 0.6963126463025701, + "learning_rate": 0.003, + "loss": 4.077, + "step": 26386 + }, + { + "epoch": 0.26387, + "grad_norm": 0.7560600662986483, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 26387 + }, + { + "epoch": 0.26388, + "grad_norm": 0.7112745384693496, + "learning_rate": 0.003, + "loss": 4.023, + "step": 26388 + }, + { + "epoch": 0.26389, + "grad_norm": 0.766828323395047, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 26389 + }, + { + "epoch": 0.2639, + "grad_norm": 0.6386782695691284, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 26390 + }, + { + "epoch": 0.26391, + "grad_norm": 0.6710948808489234, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 26391 + }, + { + "epoch": 0.26392, + "grad_norm": 0.7956219639057001, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26392 + }, + { + "epoch": 0.26393, + "grad_norm": 0.8575451057485323, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 26393 + }, + { + "epoch": 0.26394, + "grad_norm": 0.9106084091652044, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 26394 + }, + { + "epoch": 0.26395, + "grad_norm": 1.067579032975518, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26395 + }, + { + "epoch": 0.26396, + "grad_norm": 1.162232723376386, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 26396 + }, + { + "epoch": 0.26397, + "grad_norm": 0.9517569294306354, + "learning_rate": 0.003, + "loss": 4.052, + "step": 26397 + }, + { + "epoch": 0.26398, + "grad_norm": 0.8701270956448659, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 26398 + }, + { + "epoch": 0.26399, + "grad_norm": 0.7005957270105875, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 26399 + }, + { + "epoch": 0.264, + "grad_norm": 0.7143917165320282, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 26400 + }, + { + "epoch": 0.26401, + "grad_norm": 0.8254167017687327, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 26401 + }, + { + "epoch": 0.26402, + "grad_norm": 0.8650248959337753, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 26402 + }, + { + "epoch": 0.26403, + "grad_norm": 0.8191146834273513, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 26403 + }, + { + "epoch": 0.26404, + "grad_norm": 0.7578882041876643, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 26404 + }, + { + "epoch": 0.26405, + "grad_norm": 0.6705536445210004, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 26405 + }, + { + "epoch": 0.26406, + "grad_norm": 0.5864020268483535, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 26406 + }, + { + "epoch": 0.26407, + "grad_norm": 0.6249825674102845, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 26407 + }, + { + "epoch": 0.26408, + "grad_norm": 0.6175129399805311, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 26408 + }, + { + "epoch": 0.26409, + "grad_norm": 0.6476694851788265, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26409 + }, + { + "epoch": 0.2641, + "grad_norm": 0.7625158024795464, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 26410 + }, + { + "epoch": 0.26411, + "grad_norm": 0.860459364848734, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 26411 + }, + { + "epoch": 0.26412, + "grad_norm": 1.0812651847513521, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 26412 + }, + { + "epoch": 0.26413, + "grad_norm": 1.0489418788409184, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 26413 + }, + { + "epoch": 0.26414, + "grad_norm": 0.7989897817720344, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 26414 + }, + { + "epoch": 0.26415, + "grad_norm": 0.8408677030070899, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 26415 + }, + { + "epoch": 0.26416, + "grad_norm": 0.9688824286156928, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 26416 + }, + { + "epoch": 0.26417, + "grad_norm": 0.8830501400318521, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26417 + }, + { + "epoch": 0.26418, + "grad_norm": 0.8254920285957976, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 26418 + }, + { + "epoch": 0.26419, + "grad_norm": 0.7508440341993747, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 26419 + }, + { + "epoch": 0.2642, + "grad_norm": 0.7669802797847566, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 26420 + }, + { + "epoch": 0.26421, + "grad_norm": 0.8896881044472471, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 26421 + }, + { + "epoch": 0.26422, + "grad_norm": 1.1927831258741326, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 26422 + }, + { + "epoch": 0.26423, + "grad_norm": 0.9048049104330993, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 26423 + }, + { + "epoch": 0.26424, + "grad_norm": 0.982882329856891, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26424 + }, + { + "epoch": 0.26425, + "grad_norm": 1.135873243157521, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 26425 + }, + { + "epoch": 0.26426, + "grad_norm": 1.1127915458170283, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 26426 + }, + { + "epoch": 0.26427, + "grad_norm": 1.0516026474370734, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 26427 + }, + { + "epoch": 0.26428, + "grad_norm": 1.130422845744541, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 26428 + }, + { + "epoch": 0.26429, + "grad_norm": 0.9970937210173137, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 26429 + }, + { + "epoch": 0.2643, + "grad_norm": 0.9862627116646423, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 26430 + }, + { + "epoch": 0.26431, + "grad_norm": 1.0545008421663888, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 26431 + }, + { + "epoch": 0.26432, + "grad_norm": 1.0297866480393942, + "learning_rate": 0.003, + "loss": 4.066, + "step": 26432 + }, + { + "epoch": 0.26433, + "grad_norm": 0.9480779968013529, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 26433 + }, + { + "epoch": 0.26434, + "grad_norm": 0.831788867926559, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 26434 + }, + { + "epoch": 0.26435, + "grad_norm": 0.6840378094823714, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 26435 + }, + { + "epoch": 0.26436, + "grad_norm": 0.6891648057766006, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 26436 + }, + { + "epoch": 0.26437, + "grad_norm": 0.8463507408971332, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 26437 + }, + { + "epoch": 0.26438, + "grad_norm": 1.075623620930203, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 26438 + }, + { + "epoch": 0.26439, + "grad_norm": 1.0543542680845774, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 26439 + }, + { + "epoch": 0.2644, + "grad_norm": 0.8031729208632075, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 26440 + }, + { + "epoch": 0.26441, + "grad_norm": 0.6550500802019965, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 26441 + }, + { + "epoch": 0.26442, + "grad_norm": 0.7573350753134994, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 26442 + }, + { + "epoch": 0.26443, + "grad_norm": 0.8889633407157393, + "learning_rate": 0.003, + "loss": 4.057, + "step": 26443 + }, + { + "epoch": 0.26444, + "grad_norm": 0.8955200498594348, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 26444 + }, + { + "epoch": 0.26445, + "grad_norm": 0.9411621158959186, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 26445 + }, + { + "epoch": 0.26446, + "grad_norm": 0.9985013275378802, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 26446 + }, + { + "epoch": 0.26447, + "grad_norm": 0.9707548495962474, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 26447 + }, + { + "epoch": 0.26448, + "grad_norm": 0.9277198050581533, + "learning_rate": 0.003, + "loss": 4.037, + "step": 26448 + }, + { + "epoch": 0.26449, + "grad_norm": 0.9758363009872599, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 26449 + }, + { + "epoch": 0.2645, + "grad_norm": 1.0001416446769837, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 26450 + }, + { + "epoch": 0.26451, + "grad_norm": 0.9800255214045345, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 26451 + }, + { + "epoch": 0.26452, + "grad_norm": 0.8565490459972365, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 26452 + }, + { + "epoch": 0.26453, + "grad_norm": 0.9113251032337254, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 26453 + }, + { + "epoch": 0.26454, + "grad_norm": 0.8389117178014288, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 26454 + }, + { + "epoch": 0.26455, + "grad_norm": 0.8331910636525025, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 26455 + }, + { + "epoch": 0.26456, + "grad_norm": 0.988341975817765, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 26456 + }, + { + "epoch": 0.26457, + "grad_norm": 1.0997198014297673, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 26457 + }, + { + "epoch": 0.26458, + "grad_norm": 0.8729624608622639, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 26458 + }, + { + "epoch": 0.26459, + "grad_norm": 0.7501123040205954, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26459 + }, + { + "epoch": 0.2646, + "grad_norm": 0.7474377281299847, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 26460 + }, + { + "epoch": 0.26461, + "grad_norm": 0.7592649104106839, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 26461 + }, + { + "epoch": 0.26462, + "grad_norm": 0.6984067964799132, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 26462 + }, + { + "epoch": 0.26463, + "grad_norm": 0.7209472962451595, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 26463 + }, + { + "epoch": 0.26464, + "grad_norm": 0.7756276331832247, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 26464 + }, + { + "epoch": 0.26465, + "grad_norm": 0.8138205000370119, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 26465 + }, + { + "epoch": 0.26466, + "grad_norm": 0.9047164533691512, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 26466 + }, + { + "epoch": 0.26467, + "grad_norm": 0.9716370291731774, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 26467 + }, + { + "epoch": 0.26468, + "grad_norm": 0.9335191023201734, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 26468 + }, + { + "epoch": 0.26469, + "grad_norm": 0.8486803427402103, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 26469 + }, + { + "epoch": 0.2647, + "grad_norm": 0.8864848257748309, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 26470 + }, + { + "epoch": 0.26471, + "grad_norm": 0.7726791846119326, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 26471 + }, + { + "epoch": 0.26472, + "grad_norm": 0.7895070265189678, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 26472 + }, + { + "epoch": 0.26473, + "grad_norm": 0.8065934503354619, + "learning_rate": 0.003, + "loss": 4.016, + "step": 26473 + }, + { + "epoch": 0.26474, + "grad_norm": 0.8564721178785649, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 26474 + }, + { + "epoch": 0.26475, + "grad_norm": 1.1012056109420556, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 26475 + }, + { + "epoch": 0.26476, + "grad_norm": 0.9574911297667191, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 26476 + }, + { + "epoch": 0.26477, + "grad_norm": 0.784224530597972, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 26477 + }, + { + "epoch": 0.26478, + "grad_norm": 0.7366702338549552, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 26478 + }, + { + "epoch": 0.26479, + "grad_norm": 0.7600814438220768, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 26479 + }, + { + "epoch": 0.2648, + "grad_norm": 0.6873411325365798, + "learning_rate": 0.003, + "loss": 4.013, + "step": 26480 + }, + { + "epoch": 0.26481, + "grad_norm": 0.8400139834751884, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 26481 + }, + { + "epoch": 0.26482, + "grad_norm": 0.9999519491565146, + "learning_rate": 0.003, + "loss": 4.048, + "step": 26482 + }, + { + "epoch": 0.26483, + "grad_norm": 1.1854447759389781, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 26483 + }, + { + "epoch": 0.26484, + "grad_norm": 1.0205160308075745, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 26484 + }, + { + "epoch": 0.26485, + "grad_norm": 1.1415264366021833, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 26485 + }, + { + "epoch": 0.26486, + "grad_norm": 0.8793647690282758, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26486 + }, + { + "epoch": 0.26487, + "grad_norm": 0.8017486036852204, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 26487 + }, + { + "epoch": 0.26488, + "grad_norm": 0.8008961164999143, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 26488 + }, + { + "epoch": 0.26489, + "grad_norm": 0.6818561500421075, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26489 + }, + { + "epoch": 0.2649, + "grad_norm": 0.6749483346555479, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 26490 + }, + { + "epoch": 0.26491, + "grad_norm": 0.7011862927103796, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 26491 + }, + { + "epoch": 0.26492, + "grad_norm": 0.6877318658292878, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 26492 + }, + { + "epoch": 0.26493, + "grad_norm": 0.6786471718017335, + "learning_rate": 0.003, + "loss": 4.041, + "step": 26493 + }, + { + "epoch": 0.26494, + "grad_norm": 0.735619168290087, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 26494 + }, + { + "epoch": 0.26495, + "grad_norm": 0.9610149087782505, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 26495 + }, + { + "epoch": 0.26496, + "grad_norm": 1.056513198255653, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 26496 + }, + { + "epoch": 0.26497, + "grad_norm": 0.9183423889386664, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 26497 + }, + { + "epoch": 0.26498, + "grad_norm": 0.9464937449545988, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 26498 + }, + { + "epoch": 0.26499, + "grad_norm": 0.9812872537939151, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 26499 + }, + { + "epoch": 0.265, + "grad_norm": 1.0155022739525765, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 26500 + }, + { + "epoch": 0.26501, + "grad_norm": 0.9136252388732765, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 26501 + }, + { + "epoch": 0.26502, + "grad_norm": 0.8765427392869011, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 26502 + }, + { + "epoch": 0.26503, + "grad_norm": 0.8466278537553633, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 26503 + }, + { + "epoch": 0.26504, + "grad_norm": 0.7211701668090768, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 26504 + }, + { + "epoch": 0.26505, + "grad_norm": 0.709792348323559, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 26505 + }, + { + "epoch": 0.26506, + "grad_norm": 0.7034961963687268, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 26506 + }, + { + "epoch": 0.26507, + "grad_norm": 0.6790286507036759, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 26507 + }, + { + "epoch": 0.26508, + "grad_norm": 0.700781624201002, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 26508 + }, + { + "epoch": 0.26509, + "grad_norm": 0.7503016717391072, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 26509 + }, + { + "epoch": 0.2651, + "grad_norm": 0.784468647487236, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 26510 + }, + { + "epoch": 0.26511, + "grad_norm": 0.7751826820267655, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 26511 + }, + { + "epoch": 0.26512, + "grad_norm": 0.8780569885062741, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 26512 + }, + { + "epoch": 0.26513, + "grad_norm": 1.085374624179859, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 26513 + }, + { + "epoch": 0.26514, + "grad_norm": 1.1160428927385875, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26514 + }, + { + "epoch": 0.26515, + "grad_norm": 0.9932386606909059, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 26515 + }, + { + "epoch": 0.26516, + "grad_norm": 1.0655882167144841, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 26516 + }, + { + "epoch": 0.26517, + "grad_norm": 0.9838914878374098, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 26517 + }, + { + "epoch": 0.26518, + "grad_norm": 0.8094547759809417, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 26518 + }, + { + "epoch": 0.26519, + "grad_norm": 0.8420611730861844, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26519 + }, + { + "epoch": 0.2652, + "grad_norm": 0.8634655364182763, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 26520 + }, + { + "epoch": 0.26521, + "grad_norm": 0.7959372676856116, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 26521 + }, + { + "epoch": 0.26522, + "grad_norm": 0.8248503569057528, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 26522 + }, + { + "epoch": 0.26523, + "grad_norm": 0.8965138341376468, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 26523 + }, + { + "epoch": 0.26524, + "grad_norm": 0.9519387129918376, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 26524 + }, + { + "epoch": 0.26525, + "grad_norm": 1.0522508229583183, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 26525 + }, + { + "epoch": 0.26526, + "grad_norm": 1.0568340427509701, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 26526 + }, + { + "epoch": 0.26527, + "grad_norm": 0.7951132429805443, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 26527 + }, + { + "epoch": 0.26528, + "grad_norm": 0.6629097337672287, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 26528 + }, + { + "epoch": 0.26529, + "grad_norm": 0.693422785260737, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 26529 + }, + { + "epoch": 0.2653, + "grad_norm": 0.8102985054367646, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 26530 + }, + { + "epoch": 0.26531, + "grad_norm": 1.0457243342227878, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 26531 + }, + { + "epoch": 0.26532, + "grad_norm": 1.1034054100830928, + "learning_rate": 0.003, + "loss": 4.035, + "step": 26532 + }, + { + "epoch": 0.26533, + "grad_norm": 0.8492093041777092, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 26533 + }, + { + "epoch": 0.26534, + "grad_norm": 0.7864015775622379, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 26534 + }, + { + "epoch": 0.26535, + "grad_norm": 0.8570386894433144, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 26535 + }, + { + "epoch": 0.26536, + "grad_norm": 0.8470269466946738, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 26536 + }, + { + "epoch": 0.26537, + "grad_norm": 0.8912428953016255, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 26537 + }, + { + "epoch": 0.26538, + "grad_norm": 1.0054688366721016, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 26538 + }, + { + "epoch": 0.26539, + "grad_norm": 0.9994619654800855, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 26539 + }, + { + "epoch": 0.2654, + "grad_norm": 1.0291891437504905, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 26540 + }, + { + "epoch": 0.26541, + "grad_norm": 0.9449779482929114, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 26541 + }, + { + "epoch": 0.26542, + "grad_norm": 0.8208802094563792, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 26542 + }, + { + "epoch": 0.26543, + "grad_norm": 0.7685638652637271, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 26543 + }, + { + "epoch": 0.26544, + "grad_norm": 0.8094918722457962, + "learning_rate": 0.003, + "loss": 4.083, + "step": 26544 + }, + { + "epoch": 0.26545, + "grad_norm": 0.7840593683450747, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 26545 + }, + { + "epoch": 0.26546, + "grad_norm": 0.9234567777721385, + "learning_rate": 0.003, + "loss": 4.045, + "step": 26546 + }, + { + "epoch": 0.26547, + "grad_norm": 0.9856434068206794, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 26547 + }, + { + "epoch": 0.26548, + "grad_norm": 1.135634091543436, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 26548 + }, + { + "epoch": 0.26549, + "grad_norm": 1.0180052598127605, + "learning_rate": 0.003, + "loss": 4.033, + "step": 26549 + }, + { + "epoch": 0.2655, + "grad_norm": 0.9917420436301264, + "learning_rate": 0.003, + "loss": 4.044, + "step": 26550 + }, + { + "epoch": 0.26551, + "grad_norm": 0.9076913636791657, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 26551 + }, + { + "epoch": 0.26552, + "grad_norm": 0.9014803711508053, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 26552 + }, + { + "epoch": 0.26553, + "grad_norm": 1.0031744020518278, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 26553 + }, + { + "epoch": 0.26554, + "grad_norm": 1.0310122879897317, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 26554 + }, + { + "epoch": 0.26555, + "grad_norm": 0.8630879336333662, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 26555 + }, + { + "epoch": 0.26556, + "grad_norm": 0.8482757208657407, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 26556 + }, + { + "epoch": 0.26557, + "grad_norm": 0.7921461539860246, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 26557 + }, + { + "epoch": 0.26558, + "grad_norm": 0.7062404825350208, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 26558 + }, + { + "epoch": 0.26559, + "grad_norm": 0.6434779495896984, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 26559 + }, + { + "epoch": 0.2656, + "grad_norm": 0.6294746359856249, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 26560 + }, + { + "epoch": 0.26561, + "grad_norm": 0.6249647283990358, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 26561 + }, + { + "epoch": 0.26562, + "grad_norm": 0.6582266833177596, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 26562 + }, + { + "epoch": 0.26563, + "grad_norm": 0.8005843260633685, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 26563 + }, + { + "epoch": 0.26564, + "grad_norm": 1.0339601551323987, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 26564 + }, + { + "epoch": 0.26565, + "grad_norm": 1.1444726807610428, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 26565 + }, + { + "epoch": 0.26566, + "grad_norm": 0.7976268976031328, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 26566 + }, + { + "epoch": 0.26567, + "grad_norm": 0.7478091974341207, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 26567 + }, + { + "epoch": 0.26568, + "grad_norm": 0.7015358804726706, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 26568 + }, + { + "epoch": 0.26569, + "grad_norm": 0.6686937589930685, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 26569 + }, + { + "epoch": 0.2657, + "grad_norm": 0.6293193000193683, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 26570 + }, + { + "epoch": 0.26571, + "grad_norm": 0.5893256276572801, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 26571 + }, + { + "epoch": 0.26572, + "grad_norm": 0.6267665797984254, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 26572 + }, + { + "epoch": 0.26573, + "grad_norm": 0.7180731884210094, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 26573 + }, + { + "epoch": 0.26574, + "grad_norm": 0.7918770362948485, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 26574 + }, + { + "epoch": 0.26575, + "grad_norm": 0.8707971390235899, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 26575 + }, + { + "epoch": 0.26576, + "grad_norm": 1.0742671391332812, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 26576 + }, + { + "epoch": 0.26577, + "grad_norm": 1.059372407254524, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 26577 + }, + { + "epoch": 0.26578, + "grad_norm": 1.1173540771526385, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 26578 + }, + { + "epoch": 0.26579, + "grad_norm": 0.9334784311256933, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 26579 + }, + { + "epoch": 0.2658, + "grad_norm": 0.912483535758802, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26580 + }, + { + "epoch": 0.26581, + "grad_norm": 0.9360021821549197, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 26581 + }, + { + "epoch": 0.26582, + "grad_norm": 0.917177744568296, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 26582 + }, + { + "epoch": 0.26583, + "grad_norm": 0.901200366271798, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 26583 + }, + { + "epoch": 0.26584, + "grad_norm": 0.9932961224541949, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 26584 + }, + { + "epoch": 0.26585, + "grad_norm": 0.7550484216826971, + "learning_rate": 0.003, + "loss": 4.059, + "step": 26585 + }, + { + "epoch": 0.26586, + "grad_norm": 0.6904213183744595, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 26586 + }, + { + "epoch": 0.26587, + "grad_norm": 0.7900128851326818, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 26587 + }, + { + "epoch": 0.26588, + "grad_norm": 0.8212886738583991, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 26588 + }, + { + "epoch": 0.26589, + "grad_norm": 0.8057590103896375, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 26589 + }, + { + "epoch": 0.2659, + "grad_norm": 0.8719420126745442, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 26590 + }, + { + "epoch": 0.26591, + "grad_norm": 1.0644395475080315, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 26591 + }, + { + "epoch": 0.26592, + "grad_norm": 1.0969912106384265, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 26592 + }, + { + "epoch": 0.26593, + "grad_norm": 1.0004021089407946, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 26593 + }, + { + "epoch": 0.26594, + "grad_norm": 1.1891625980656515, + "learning_rate": 0.003, + "loss": 4.046, + "step": 26594 + }, + { + "epoch": 0.26595, + "grad_norm": 1.0830354449787332, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26595 + }, + { + "epoch": 0.26596, + "grad_norm": 0.8559300310113501, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 26596 + }, + { + "epoch": 0.26597, + "grad_norm": 0.7465984818710449, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 26597 + }, + { + "epoch": 0.26598, + "grad_norm": 0.6944511962179045, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 26598 + }, + { + "epoch": 0.26599, + "grad_norm": 0.6569263790868736, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26599 + }, + { + "epoch": 0.266, + "grad_norm": 0.6727183475044749, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 26600 + }, + { + "epoch": 0.26601, + "grad_norm": 0.7363523814241277, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 26601 + }, + { + "epoch": 0.26602, + "grad_norm": 0.8898304489470836, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 26602 + }, + { + "epoch": 0.26603, + "grad_norm": 1.1792337032717648, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 26603 + }, + { + "epoch": 0.26604, + "grad_norm": 0.9382278959328094, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 26604 + }, + { + "epoch": 0.26605, + "grad_norm": 0.7412230866211287, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 26605 + }, + { + "epoch": 0.26606, + "grad_norm": 0.7061273718104436, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 26606 + }, + { + "epoch": 0.26607, + "grad_norm": 0.6419652278942862, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 26607 + }, + { + "epoch": 0.26608, + "grad_norm": 0.624108405526765, + "learning_rate": 0.003, + "loss": 4.087, + "step": 26608 + }, + { + "epoch": 0.26609, + "grad_norm": 0.7260998774620782, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 26609 + }, + { + "epoch": 0.2661, + "grad_norm": 0.8414296134654446, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 26610 + }, + { + "epoch": 0.26611, + "grad_norm": 1.0252421096378903, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 26611 + }, + { + "epoch": 0.26612, + "grad_norm": 0.972697906522464, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 26612 + }, + { + "epoch": 0.26613, + "grad_norm": 0.963674305976449, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 26613 + }, + { + "epoch": 0.26614, + "grad_norm": 1.0442225451968776, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 26614 + }, + { + "epoch": 0.26615, + "grad_norm": 0.9154175965331495, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 26615 + }, + { + "epoch": 0.26616, + "grad_norm": 0.6497602554719503, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 26616 + }, + { + "epoch": 0.26617, + "grad_norm": 0.6361910625565259, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26617 + }, + { + "epoch": 0.26618, + "grad_norm": 0.8205470989904912, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 26618 + }, + { + "epoch": 0.26619, + "grad_norm": 0.9543037316112705, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 26619 + }, + { + "epoch": 0.2662, + "grad_norm": 1.1498093701608683, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 26620 + }, + { + "epoch": 0.26621, + "grad_norm": 1.0314979822258623, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 26621 + }, + { + "epoch": 0.26622, + "grad_norm": 1.0414088383615803, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 26622 + }, + { + "epoch": 0.26623, + "grad_norm": 0.9187221942093965, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 26623 + }, + { + "epoch": 0.26624, + "grad_norm": 0.8826723217229712, + "learning_rate": 0.003, + "loss": 4.066, + "step": 26624 + }, + { + "epoch": 0.26625, + "grad_norm": 1.010207166660528, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 26625 + }, + { + "epoch": 0.26626, + "grad_norm": 0.9263115765655748, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 26626 + }, + { + "epoch": 0.26627, + "grad_norm": 0.8464879776659414, + "learning_rate": 0.003, + "loss": 4.0973, + "step": 26627 + }, + { + "epoch": 0.26628, + "grad_norm": 0.8312012033056366, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 26628 + }, + { + "epoch": 0.26629, + "grad_norm": 0.8650003532080943, + "learning_rate": 0.003, + "loss": 4.05, + "step": 26629 + }, + { + "epoch": 0.2663, + "grad_norm": 0.9670055546956314, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 26630 + }, + { + "epoch": 0.26631, + "grad_norm": 1.2759445776401157, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 26631 + }, + { + "epoch": 0.26632, + "grad_norm": 1.0237304026362863, + "learning_rate": 0.003, + "loss": 4.1034, + "step": 26632 + }, + { + "epoch": 0.26633, + "grad_norm": 1.0414362802994714, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 26633 + }, + { + "epoch": 0.26634, + "grad_norm": 1.1134138074174076, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 26634 + }, + { + "epoch": 0.26635, + "grad_norm": 1.0725962625003227, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 26635 + }, + { + "epoch": 0.26636, + "grad_norm": 1.1260254195629908, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 26636 + }, + { + "epoch": 0.26637, + "grad_norm": 0.9208476223812743, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 26637 + }, + { + "epoch": 0.26638, + "grad_norm": 0.8533590723517842, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 26638 + }, + { + "epoch": 0.26639, + "grad_norm": 0.8689167892321508, + "learning_rate": 0.003, + "loss": 4.097, + "step": 26639 + }, + { + "epoch": 0.2664, + "grad_norm": 0.7972294977074371, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 26640 + }, + { + "epoch": 0.26641, + "grad_norm": 0.7861236885164111, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 26641 + }, + { + "epoch": 0.26642, + "grad_norm": 0.9182536278111603, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 26642 + }, + { + "epoch": 0.26643, + "grad_norm": 0.9651958892753376, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 26643 + }, + { + "epoch": 0.26644, + "grad_norm": 0.9055922456649463, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 26644 + }, + { + "epoch": 0.26645, + "grad_norm": 0.7906809965198481, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 26645 + }, + { + "epoch": 0.26646, + "grad_norm": 0.7125052906612545, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26646 + }, + { + "epoch": 0.26647, + "grad_norm": 0.7732075907172865, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 26647 + }, + { + "epoch": 0.26648, + "grad_norm": 0.8617790578171289, + "learning_rate": 0.003, + "loss": 4.073, + "step": 26648 + }, + { + "epoch": 0.26649, + "grad_norm": 0.8229307983839296, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 26649 + }, + { + "epoch": 0.2665, + "grad_norm": 0.802973432232185, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 26650 + }, + { + "epoch": 0.26651, + "grad_norm": 0.7832406968666211, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26651 + }, + { + "epoch": 0.26652, + "grad_norm": 0.8143777204867714, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 26652 + }, + { + "epoch": 0.26653, + "grad_norm": 0.8553751372748348, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 26653 + }, + { + "epoch": 0.26654, + "grad_norm": 0.8902401302342804, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 26654 + }, + { + "epoch": 0.26655, + "grad_norm": 0.8658398350047829, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 26655 + }, + { + "epoch": 0.26656, + "grad_norm": 0.7817725862644421, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 26656 + }, + { + "epoch": 0.26657, + "grad_norm": 0.7151587655078991, + "learning_rate": 0.003, + "loss": 4.023, + "step": 26657 + }, + { + "epoch": 0.26658, + "grad_norm": 0.7822227505750222, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 26658 + }, + { + "epoch": 0.26659, + "grad_norm": 0.85944374577339, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 26659 + }, + { + "epoch": 0.2666, + "grad_norm": 1.0289665509090975, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 26660 + }, + { + "epoch": 0.26661, + "grad_norm": 1.2521364519932363, + "learning_rate": 0.003, + "loss": 4.061, + "step": 26661 + }, + { + "epoch": 0.26662, + "grad_norm": 0.819131146464348, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 26662 + }, + { + "epoch": 0.26663, + "grad_norm": 0.7119951588668726, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 26663 + }, + { + "epoch": 0.26664, + "grad_norm": 0.728751474300894, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 26664 + }, + { + "epoch": 0.26665, + "grad_norm": 0.7264442585143859, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 26665 + }, + { + "epoch": 0.26666, + "grad_norm": 0.7460528191940624, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 26666 + }, + { + "epoch": 0.26667, + "grad_norm": 0.6649698464189977, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 26667 + }, + { + "epoch": 0.26668, + "grad_norm": 0.5378801838459671, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 26668 + }, + { + "epoch": 0.26669, + "grad_norm": 0.5632099155812313, + "learning_rate": 0.003, + "loss": 4.034, + "step": 26669 + }, + { + "epoch": 0.2667, + "grad_norm": 0.5350368434627366, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 26670 + }, + { + "epoch": 0.26671, + "grad_norm": 0.5303576151520706, + "learning_rate": 0.003, + "loss": 4.0039, + "step": 26671 + }, + { + "epoch": 0.26672, + "grad_norm": 0.6109151039924823, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 26672 + }, + { + "epoch": 0.26673, + "grad_norm": 0.796179812841856, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 26673 + }, + { + "epoch": 0.26674, + "grad_norm": 1.0923526359029165, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 26674 + }, + { + "epoch": 0.26675, + "grad_norm": 1.0762238704000684, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 26675 + }, + { + "epoch": 0.26676, + "grad_norm": 1.0557974279745912, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 26676 + }, + { + "epoch": 0.26677, + "grad_norm": 0.9822702705691877, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 26677 + }, + { + "epoch": 0.26678, + "grad_norm": 1.0966666886212464, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 26678 + }, + { + "epoch": 0.26679, + "grad_norm": 0.7858873010454793, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 26679 + }, + { + "epoch": 0.2668, + "grad_norm": 0.6076935779388353, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 26680 + }, + { + "epoch": 0.26681, + "grad_norm": 0.6412059021445111, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 26681 + }, + { + "epoch": 0.26682, + "grad_norm": 0.8000140632199488, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 26682 + }, + { + "epoch": 0.26683, + "grad_norm": 0.912916014743871, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 26683 + }, + { + "epoch": 0.26684, + "grad_norm": 0.9923446803688599, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 26684 + }, + { + "epoch": 0.26685, + "grad_norm": 1.207405723846636, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 26685 + }, + { + "epoch": 0.26686, + "grad_norm": 0.8520035522621076, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 26686 + }, + { + "epoch": 0.26687, + "grad_norm": 0.8113852889498812, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 26687 + }, + { + "epoch": 0.26688, + "grad_norm": 0.9038452044667784, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 26688 + }, + { + "epoch": 0.26689, + "grad_norm": 0.9815309739157995, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 26689 + }, + { + "epoch": 0.2669, + "grad_norm": 1.0388893042418477, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 26690 + }, + { + "epoch": 0.26691, + "grad_norm": 0.9291636438353094, + "learning_rate": 0.003, + "loss": 4.0868, + "step": 26691 + }, + { + "epoch": 0.26692, + "grad_norm": 0.9440179886077221, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 26692 + }, + { + "epoch": 0.26693, + "grad_norm": 1.0742388681689106, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 26693 + }, + { + "epoch": 0.26694, + "grad_norm": 0.9951746493562946, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 26694 + }, + { + "epoch": 0.26695, + "grad_norm": 0.8817734937788403, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 26695 + }, + { + "epoch": 0.26696, + "grad_norm": 0.860687491951749, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 26696 + }, + { + "epoch": 0.26697, + "grad_norm": 1.0688683442937588, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 26697 + }, + { + "epoch": 0.26698, + "grad_norm": 0.9522765700180879, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 26698 + }, + { + "epoch": 0.26699, + "grad_norm": 0.9027802009966653, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 26699 + }, + { + "epoch": 0.267, + "grad_norm": 1.0561183837657433, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 26700 + }, + { + "epoch": 0.26701, + "grad_norm": 1.0675308573497253, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 26701 + }, + { + "epoch": 0.26702, + "grad_norm": 1.028566822243012, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 26702 + }, + { + "epoch": 0.26703, + "grad_norm": 0.9313649923505639, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 26703 + }, + { + "epoch": 0.26704, + "grad_norm": 0.785645078280347, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 26704 + }, + { + "epoch": 0.26705, + "grad_norm": 0.6892208234298768, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 26705 + }, + { + "epoch": 0.26706, + "grad_norm": 0.5772106607160312, + "learning_rate": 0.003, + "loss": 4.044, + "step": 26706 + }, + { + "epoch": 0.26707, + "grad_norm": 0.6333217733057827, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 26707 + }, + { + "epoch": 0.26708, + "grad_norm": 0.7968952712638063, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26708 + }, + { + "epoch": 0.26709, + "grad_norm": 0.992929318826041, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26709 + }, + { + "epoch": 0.2671, + "grad_norm": 1.0900549571691964, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 26710 + }, + { + "epoch": 0.26711, + "grad_norm": 0.8046750135462912, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 26711 + }, + { + "epoch": 0.26712, + "grad_norm": 0.8794702623575531, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 26712 + }, + { + "epoch": 0.26713, + "grad_norm": 0.895768487197142, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 26713 + }, + { + "epoch": 0.26714, + "grad_norm": 0.7857887607120044, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 26714 + }, + { + "epoch": 0.26715, + "grad_norm": 0.8334878305798049, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 26715 + }, + { + "epoch": 0.26716, + "grad_norm": 0.9199768839994255, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 26716 + }, + { + "epoch": 0.26717, + "grad_norm": 0.9510913854053771, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 26717 + }, + { + "epoch": 0.26718, + "grad_norm": 0.9733343841124151, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 26718 + }, + { + "epoch": 0.26719, + "grad_norm": 0.9300557931902645, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 26719 + }, + { + "epoch": 0.2672, + "grad_norm": 0.8673640555025532, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 26720 + }, + { + "epoch": 0.26721, + "grad_norm": 0.7590328475857064, + "learning_rate": 0.003, + "loss": 4.025, + "step": 26721 + }, + { + "epoch": 0.26722, + "grad_norm": 0.85164985943911, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 26722 + }, + { + "epoch": 0.26723, + "grad_norm": 0.8797060503741002, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 26723 + }, + { + "epoch": 0.26724, + "grad_norm": 1.0742166966975317, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 26724 + }, + { + "epoch": 0.26725, + "grad_norm": 1.0109969540785737, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 26725 + }, + { + "epoch": 0.26726, + "grad_norm": 1.0907737540438218, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 26726 + }, + { + "epoch": 0.26727, + "grad_norm": 0.9942324362436002, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 26727 + }, + { + "epoch": 0.26728, + "grad_norm": 0.8371243951848143, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 26728 + }, + { + "epoch": 0.26729, + "grad_norm": 0.6872804004639314, + "learning_rate": 0.003, + "loss": 4.06, + "step": 26729 + }, + { + "epoch": 0.2673, + "grad_norm": 0.591674397944938, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 26730 + }, + { + "epoch": 0.26731, + "grad_norm": 0.6606672414326421, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 26731 + }, + { + "epoch": 0.26732, + "grad_norm": 0.6572819178724009, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 26732 + }, + { + "epoch": 0.26733, + "grad_norm": 0.6940561532330184, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 26733 + }, + { + "epoch": 0.26734, + "grad_norm": 0.6965391264343574, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 26734 + }, + { + "epoch": 0.26735, + "grad_norm": 0.7501205854225137, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 26735 + }, + { + "epoch": 0.26736, + "grad_norm": 0.849406258319405, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 26736 + }, + { + "epoch": 0.26737, + "grad_norm": 0.8531018082178712, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 26737 + }, + { + "epoch": 0.26738, + "grad_norm": 0.9511979581549274, + "learning_rate": 0.003, + "loss": 4.052, + "step": 26738 + }, + { + "epoch": 0.26739, + "grad_norm": 1.2107113828172498, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 26739 + }, + { + "epoch": 0.2674, + "grad_norm": 0.8612989313071496, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 26740 + }, + { + "epoch": 0.26741, + "grad_norm": 0.7283285608965303, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 26741 + }, + { + "epoch": 0.26742, + "grad_norm": 0.7379148522520755, + "learning_rate": 0.003, + "loss": 4.039, + "step": 26742 + }, + { + "epoch": 0.26743, + "grad_norm": 0.7174040269715107, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 26743 + }, + { + "epoch": 0.26744, + "grad_norm": 0.8308836330838764, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 26744 + }, + { + "epoch": 0.26745, + "grad_norm": 0.8196239611600561, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 26745 + }, + { + "epoch": 0.26746, + "grad_norm": 0.8682085238476234, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 26746 + }, + { + "epoch": 0.26747, + "grad_norm": 0.9603288625812608, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26747 + }, + { + "epoch": 0.26748, + "grad_norm": 1.0198777595068194, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 26748 + }, + { + "epoch": 0.26749, + "grad_norm": 1.2845640757730779, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 26749 + }, + { + "epoch": 0.2675, + "grad_norm": 1.032885193362763, + "learning_rate": 0.003, + "loss": 4.047, + "step": 26750 + }, + { + "epoch": 0.26751, + "grad_norm": 1.1613661321720825, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 26751 + }, + { + "epoch": 0.26752, + "grad_norm": 0.8530085664308429, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 26752 + }, + { + "epoch": 0.26753, + "grad_norm": 0.7014666181493231, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 26753 + }, + { + "epoch": 0.26754, + "grad_norm": 0.6796645147335215, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 26754 + }, + { + "epoch": 0.26755, + "grad_norm": 0.6562142494094009, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 26755 + }, + { + "epoch": 0.26756, + "grad_norm": 0.6026007136599633, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 26756 + }, + { + "epoch": 0.26757, + "grad_norm": 0.5699672937551186, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 26757 + }, + { + "epoch": 0.26758, + "grad_norm": 0.576757488510103, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 26758 + }, + { + "epoch": 0.26759, + "grad_norm": 0.6704464721641104, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 26759 + }, + { + "epoch": 0.2676, + "grad_norm": 0.9361903069061588, + "learning_rate": 0.003, + "loss": 4.046, + "step": 26760 + }, + { + "epoch": 0.26761, + "grad_norm": 1.2288749481239083, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26761 + }, + { + "epoch": 0.26762, + "grad_norm": 0.9443992511767147, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26762 + }, + { + "epoch": 0.26763, + "grad_norm": 0.9785956628929858, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 26763 + }, + { + "epoch": 0.26764, + "grad_norm": 0.9163626702234071, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 26764 + }, + { + "epoch": 0.26765, + "grad_norm": 0.840137909902933, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 26765 + }, + { + "epoch": 0.26766, + "grad_norm": 0.7684002808570489, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 26766 + }, + { + "epoch": 0.26767, + "grad_norm": 0.7570456762377298, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 26767 + }, + { + "epoch": 0.26768, + "grad_norm": 0.7883064882330026, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 26768 + }, + { + "epoch": 0.26769, + "grad_norm": 0.7510368380664076, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 26769 + }, + { + "epoch": 0.2677, + "grad_norm": 0.8714694913319477, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 26770 + }, + { + "epoch": 0.26771, + "grad_norm": 0.9931551188063039, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 26771 + }, + { + "epoch": 0.26772, + "grad_norm": 0.8787342561461627, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 26772 + }, + { + "epoch": 0.26773, + "grad_norm": 0.8621318653400925, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 26773 + }, + { + "epoch": 0.26774, + "grad_norm": 0.8325085381923846, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 26774 + }, + { + "epoch": 0.26775, + "grad_norm": 0.9047825263675746, + "learning_rate": 0.003, + "loss": 4.057, + "step": 26775 + }, + { + "epoch": 0.26776, + "grad_norm": 1.1501523171767354, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 26776 + }, + { + "epoch": 0.26777, + "grad_norm": 1.269977820533238, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 26777 + }, + { + "epoch": 0.26778, + "grad_norm": 0.8090218286372404, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 26778 + }, + { + "epoch": 0.26779, + "grad_norm": 0.5805851621821513, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26779 + }, + { + "epoch": 0.2678, + "grad_norm": 0.6929948907046206, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 26780 + }, + { + "epoch": 0.26781, + "grad_norm": 0.8412630997929835, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 26781 + }, + { + "epoch": 0.26782, + "grad_norm": 1.0243521372555395, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 26782 + }, + { + "epoch": 0.26783, + "grad_norm": 1.0856315744361238, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 26783 + }, + { + "epoch": 0.26784, + "grad_norm": 0.8348121730140173, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 26784 + }, + { + "epoch": 0.26785, + "grad_norm": 0.7378561634094751, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 26785 + }, + { + "epoch": 0.26786, + "grad_norm": 0.8226386741847183, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 26786 + }, + { + "epoch": 0.26787, + "grad_norm": 0.8791824139867304, + "learning_rate": 0.003, + "loss": 4.047, + "step": 26787 + }, + { + "epoch": 0.26788, + "grad_norm": 0.9121155198908483, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 26788 + }, + { + "epoch": 0.26789, + "grad_norm": 1.0041298023638419, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 26789 + }, + { + "epoch": 0.2679, + "grad_norm": 1.154655682264059, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 26790 + }, + { + "epoch": 0.26791, + "grad_norm": 0.7453429369776192, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 26791 + }, + { + "epoch": 0.26792, + "grad_norm": 0.7944509654961418, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 26792 + }, + { + "epoch": 0.26793, + "grad_norm": 0.8993588114106738, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 26793 + }, + { + "epoch": 0.26794, + "grad_norm": 0.9500213961892805, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 26794 + }, + { + "epoch": 0.26795, + "grad_norm": 1.0357962594391055, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 26795 + }, + { + "epoch": 0.26796, + "grad_norm": 0.8152045205170331, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 26796 + }, + { + "epoch": 0.26797, + "grad_norm": 0.7699667054161323, + "learning_rate": 0.003, + "loss": 4.056, + "step": 26797 + }, + { + "epoch": 0.26798, + "grad_norm": 0.7538410406382684, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26798 + }, + { + "epoch": 0.26799, + "grad_norm": 0.8229955553368276, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 26799 + }, + { + "epoch": 0.268, + "grad_norm": 0.9507927465758806, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 26800 + }, + { + "epoch": 0.26801, + "grad_norm": 1.061773974768729, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 26801 + }, + { + "epoch": 0.26802, + "grad_norm": 0.9851158659326185, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 26802 + }, + { + "epoch": 0.26803, + "grad_norm": 0.9839605172370657, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26803 + }, + { + "epoch": 0.26804, + "grad_norm": 1.1199766359260959, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 26804 + }, + { + "epoch": 0.26805, + "grad_norm": 0.9321880203486483, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 26805 + }, + { + "epoch": 0.26806, + "grad_norm": 0.7683828914268355, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 26806 + }, + { + "epoch": 0.26807, + "grad_norm": 0.7264836985086381, + "learning_rate": 0.003, + "loss": 4.033, + "step": 26807 + }, + { + "epoch": 0.26808, + "grad_norm": 0.6947152787508986, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 26808 + }, + { + "epoch": 0.26809, + "grad_norm": 0.7887681249681788, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 26809 + }, + { + "epoch": 0.2681, + "grad_norm": 0.7766616004275769, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 26810 + }, + { + "epoch": 0.26811, + "grad_norm": 0.8162115777546617, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 26811 + }, + { + "epoch": 0.26812, + "grad_norm": 1.00836242098642, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 26812 + }, + { + "epoch": 0.26813, + "grad_norm": 1.1787009291832173, + "learning_rate": 0.003, + "loss": 4.064, + "step": 26813 + }, + { + "epoch": 0.26814, + "grad_norm": 0.8555377053320512, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 26814 + }, + { + "epoch": 0.26815, + "grad_norm": 0.7196022469281201, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 26815 + }, + { + "epoch": 0.26816, + "grad_norm": 0.7315588476246786, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 26816 + }, + { + "epoch": 0.26817, + "grad_norm": 0.8199991250667189, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26817 + }, + { + "epoch": 0.26818, + "grad_norm": 0.9238319690410037, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26818 + }, + { + "epoch": 0.26819, + "grad_norm": 0.7632163132511808, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 26819 + }, + { + "epoch": 0.2682, + "grad_norm": 0.6998543010023512, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 26820 + }, + { + "epoch": 0.26821, + "grad_norm": 0.7116082409956874, + "learning_rate": 0.003, + "loss": 4.061, + "step": 26821 + }, + { + "epoch": 0.26822, + "grad_norm": 0.7422357109023314, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 26822 + }, + { + "epoch": 0.26823, + "grad_norm": 0.8270071713610251, + "learning_rate": 0.003, + "loss": 4.029, + "step": 26823 + }, + { + "epoch": 0.26824, + "grad_norm": 0.9616981269291839, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 26824 + }, + { + "epoch": 0.26825, + "grad_norm": 1.1042249593304798, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 26825 + }, + { + "epoch": 0.26826, + "grad_norm": 0.9855217666287078, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 26826 + }, + { + "epoch": 0.26827, + "grad_norm": 0.9745472722582985, + "learning_rate": 0.003, + "loss": 4.027, + "step": 26827 + }, + { + "epoch": 0.26828, + "grad_norm": 0.8503705073044941, + "learning_rate": 0.003, + "loss": 4.07, + "step": 26828 + }, + { + "epoch": 0.26829, + "grad_norm": 0.7778703583133462, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 26829 + }, + { + "epoch": 0.2683, + "grad_norm": 0.7495433301340197, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26830 + }, + { + "epoch": 0.26831, + "grad_norm": 0.7790554520503469, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 26831 + }, + { + "epoch": 0.26832, + "grad_norm": 0.8573796288062671, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 26832 + }, + { + "epoch": 0.26833, + "grad_norm": 0.9980454705417464, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 26833 + }, + { + "epoch": 0.26834, + "grad_norm": 1.2160208084228539, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 26834 + }, + { + "epoch": 0.26835, + "grad_norm": 0.8427110402087028, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 26835 + }, + { + "epoch": 0.26836, + "grad_norm": 0.771167677946136, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 26836 + }, + { + "epoch": 0.26837, + "grad_norm": 0.7292664108361273, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 26837 + }, + { + "epoch": 0.26838, + "grad_norm": 0.8143570985261177, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 26838 + }, + { + "epoch": 0.26839, + "grad_norm": 0.7276168129385351, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 26839 + }, + { + "epoch": 0.2684, + "grad_norm": 0.6712678770209531, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 26840 + }, + { + "epoch": 0.26841, + "grad_norm": 0.7821541483127419, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 26841 + }, + { + "epoch": 0.26842, + "grad_norm": 0.8808361755103088, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 26842 + }, + { + "epoch": 0.26843, + "grad_norm": 1.0006668139850003, + "learning_rate": 0.003, + "loss": 4.055, + "step": 26843 + }, + { + "epoch": 0.26844, + "grad_norm": 1.1981811240725382, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 26844 + }, + { + "epoch": 0.26845, + "grad_norm": 0.9226266287032532, + "learning_rate": 0.003, + "loss": 4.045, + "step": 26845 + }, + { + "epoch": 0.26846, + "grad_norm": 0.9479866770346266, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 26846 + }, + { + "epoch": 0.26847, + "grad_norm": 0.9964166812344053, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26847 + }, + { + "epoch": 0.26848, + "grad_norm": 0.9939089689554386, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26848 + }, + { + "epoch": 0.26849, + "grad_norm": 0.9709551291704664, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 26849 + }, + { + "epoch": 0.2685, + "grad_norm": 1.020165551165223, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 26850 + }, + { + "epoch": 0.26851, + "grad_norm": 0.8863277947095122, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 26851 + }, + { + "epoch": 0.26852, + "grad_norm": 1.0491345460192698, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 26852 + }, + { + "epoch": 0.26853, + "grad_norm": 1.122350525012377, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 26853 + }, + { + "epoch": 0.26854, + "grad_norm": 0.8395329447220833, + "learning_rate": 0.003, + "loss": 4.034, + "step": 26854 + }, + { + "epoch": 0.26855, + "grad_norm": 0.8670364035506442, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 26855 + }, + { + "epoch": 0.26856, + "grad_norm": 0.9755359708285688, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 26856 + }, + { + "epoch": 0.26857, + "grad_norm": 1.0751348531951983, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 26857 + }, + { + "epoch": 0.26858, + "grad_norm": 0.911777289119023, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 26858 + }, + { + "epoch": 0.26859, + "grad_norm": 0.897093061950949, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 26859 + }, + { + "epoch": 0.2686, + "grad_norm": 1.0473880101989306, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 26860 + }, + { + "epoch": 0.26861, + "grad_norm": 0.7952945597290391, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 26861 + }, + { + "epoch": 0.26862, + "grad_norm": 0.6177411675039973, + "learning_rate": 0.003, + "loss": 4.044, + "step": 26862 + }, + { + "epoch": 0.26863, + "grad_norm": 0.6623986153088346, + "learning_rate": 0.003, + "loss": 4.047, + "step": 26863 + }, + { + "epoch": 0.26864, + "grad_norm": 0.7220856558057682, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26864 + }, + { + "epoch": 0.26865, + "grad_norm": 0.9358425399702608, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 26865 + }, + { + "epoch": 0.26866, + "grad_norm": 1.1931919177174442, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 26866 + }, + { + "epoch": 0.26867, + "grad_norm": 0.6797091494412336, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 26867 + }, + { + "epoch": 0.26868, + "grad_norm": 0.5875798096716746, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 26868 + }, + { + "epoch": 0.26869, + "grad_norm": 0.5814283715012318, + "learning_rate": 0.003, + "loss": 4.037, + "step": 26869 + }, + { + "epoch": 0.2687, + "grad_norm": 0.5873322351884686, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 26870 + }, + { + "epoch": 0.26871, + "grad_norm": 0.6833628487384019, + "learning_rate": 0.003, + "loss": 4.043, + "step": 26871 + }, + { + "epoch": 0.26872, + "grad_norm": 0.7927082539457238, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 26872 + }, + { + "epoch": 0.26873, + "grad_norm": 0.8809740323636197, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 26873 + }, + { + "epoch": 0.26874, + "grad_norm": 0.948659879011425, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26874 + }, + { + "epoch": 0.26875, + "grad_norm": 0.9311997267153175, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 26875 + }, + { + "epoch": 0.26876, + "grad_norm": 0.7967406915826373, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 26876 + }, + { + "epoch": 0.26877, + "grad_norm": 0.8290844738251041, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 26877 + }, + { + "epoch": 0.26878, + "grad_norm": 0.8733458167763013, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 26878 + }, + { + "epoch": 0.26879, + "grad_norm": 0.9768662749253959, + "learning_rate": 0.003, + "loss": 3.9969, + "step": 26879 + }, + { + "epoch": 0.2688, + "grad_norm": 1.048594960164772, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 26880 + }, + { + "epoch": 0.26881, + "grad_norm": 1.0123188758295774, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 26881 + }, + { + "epoch": 0.26882, + "grad_norm": 1.0862197214663303, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 26882 + }, + { + "epoch": 0.26883, + "grad_norm": 0.8773855842942945, + "learning_rate": 0.003, + "loss": 4.052, + "step": 26883 + }, + { + "epoch": 0.26884, + "grad_norm": 0.8248028156878672, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 26884 + }, + { + "epoch": 0.26885, + "grad_norm": 0.7927315459583432, + "learning_rate": 0.003, + "loss": 4.064, + "step": 26885 + }, + { + "epoch": 0.26886, + "grad_norm": 0.6773839829512543, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 26886 + }, + { + "epoch": 0.26887, + "grad_norm": 0.696430811548455, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 26887 + }, + { + "epoch": 0.26888, + "grad_norm": 0.8127898845894397, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 26888 + }, + { + "epoch": 0.26889, + "grad_norm": 0.9689792715233774, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 26889 + }, + { + "epoch": 0.2689, + "grad_norm": 1.1866731850723529, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 26890 + }, + { + "epoch": 0.26891, + "grad_norm": 0.7376585720892403, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 26891 + }, + { + "epoch": 0.26892, + "grad_norm": 0.6739327685475454, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 26892 + }, + { + "epoch": 0.26893, + "grad_norm": 0.8125013079317227, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 26893 + }, + { + "epoch": 0.26894, + "grad_norm": 0.8962227967136355, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 26894 + }, + { + "epoch": 0.26895, + "grad_norm": 0.9690916866230579, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 26895 + }, + { + "epoch": 0.26896, + "grad_norm": 0.9886794941489507, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 26896 + }, + { + "epoch": 0.26897, + "grad_norm": 0.9393189006689492, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 26897 + }, + { + "epoch": 0.26898, + "grad_norm": 0.9322092139945386, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 26898 + }, + { + "epoch": 0.26899, + "grad_norm": 0.9599102393951192, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 26899 + }, + { + "epoch": 0.269, + "grad_norm": 1.0529681803354871, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 26900 + }, + { + "epoch": 0.26901, + "grad_norm": 0.8937160858191697, + "learning_rate": 0.003, + "loss": 4.083, + "step": 26901 + }, + { + "epoch": 0.26902, + "grad_norm": 0.8697776795024742, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 26902 + }, + { + "epoch": 0.26903, + "grad_norm": 0.9441589877474087, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 26903 + }, + { + "epoch": 0.26904, + "grad_norm": 0.9408467398049117, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 26904 + }, + { + "epoch": 0.26905, + "grad_norm": 1.0427706769069847, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26905 + }, + { + "epoch": 0.26906, + "grad_norm": 1.1779695878184737, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 26906 + }, + { + "epoch": 0.26907, + "grad_norm": 0.9261675096869896, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 26907 + }, + { + "epoch": 0.26908, + "grad_norm": 0.8907088884950782, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 26908 + }, + { + "epoch": 0.26909, + "grad_norm": 0.9754967728243714, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26909 + }, + { + "epoch": 0.2691, + "grad_norm": 0.9227745262990029, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 26910 + }, + { + "epoch": 0.26911, + "grad_norm": 0.9354624523764706, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 26911 + }, + { + "epoch": 0.26912, + "grad_norm": 0.8434797054530185, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 26912 + }, + { + "epoch": 0.26913, + "grad_norm": 0.7113691771089369, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 26913 + }, + { + "epoch": 0.26914, + "grad_norm": 0.6890992042045092, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 26914 + }, + { + "epoch": 0.26915, + "grad_norm": 0.7743484120527323, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 26915 + }, + { + "epoch": 0.26916, + "grad_norm": 0.8732887868793218, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 26916 + }, + { + "epoch": 0.26917, + "grad_norm": 0.9988592849699839, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 26917 + }, + { + "epoch": 0.26918, + "grad_norm": 1.1767479433472368, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 26918 + }, + { + "epoch": 0.26919, + "grad_norm": 0.8185723291049996, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 26919 + }, + { + "epoch": 0.2692, + "grad_norm": 0.7013998169985814, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 26920 + }, + { + "epoch": 0.26921, + "grad_norm": 0.6495103391659941, + "learning_rate": 0.003, + "loss": 4.055, + "step": 26921 + }, + { + "epoch": 0.26922, + "grad_norm": 0.7492917000294076, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 26922 + }, + { + "epoch": 0.26923, + "grad_norm": 0.8915331539146565, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 26923 + }, + { + "epoch": 0.26924, + "grad_norm": 0.8773980385665381, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 26924 + }, + { + "epoch": 0.26925, + "grad_norm": 0.816513964939444, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 26925 + }, + { + "epoch": 0.26926, + "grad_norm": 0.9276444245280357, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 26926 + }, + { + "epoch": 0.26927, + "grad_norm": 0.9470220678491759, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 26927 + }, + { + "epoch": 0.26928, + "grad_norm": 0.8719539988217069, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 26928 + }, + { + "epoch": 0.26929, + "grad_norm": 0.9114960260049979, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 26929 + }, + { + "epoch": 0.2693, + "grad_norm": 0.8488171457589256, + "learning_rate": 0.003, + "loss": 4.051, + "step": 26930 + }, + { + "epoch": 0.26931, + "grad_norm": 0.7444986478644735, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 26931 + }, + { + "epoch": 0.26932, + "grad_norm": 0.7413878889613189, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 26932 + }, + { + "epoch": 0.26933, + "grad_norm": 0.6722002862335263, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 26933 + }, + { + "epoch": 0.26934, + "grad_norm": 0.6103254018920676, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 26934 + }, + { + "epoch": 0.26935, + "grad_norm": 0.6297175998152325, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 26935 + }, + { + "epoch": 0.26936, + "grad_norm": 0.7337259835500222, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 26936 + }, + { + "epoch": 0.26937, + "grad_norm": 0.9530827528919407, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 26937 + }, + { + "epoch": 0.26938, + "grad_norm": 1.2854979574138448, + "learning_rate": 0.003, + "loss": 4.037, + "step": 26938 + }, + { + "epoch": 0.26939, + "grad_norm": 0.6861287002916118, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 26939 + }, + { + "epoch": 0.2694, + "grad_norm": 0.7443794193796209, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 26940 + }, + { + "epoch": 0.26941, + "grad_norm": 0.8196112791522115, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 26941 + }, + { + "epoch": 0.26942, + "grad_norm": 0.8466972595900097, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 26942 + }, + { + "epoch": 0.26943, + "grad_norm": 0.8099766426461441, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26943 + }, + { + "epoch": 0.26944, + "grad_norm": 0.8899064059408505, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 26944 + }, + { + "epoch": 0.26945, + "grad_norm": 0.9318379120267478, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 26945 + }, + { + "epoch": 0.26946, + "grad_norm": 0.8771469781142658, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 26946 + }, + { + "epoch": 0.26947, + "grad_norm": 0.8386181897446428, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 26947 + }, + { + "epoch": 0.26948, + "grad_norm": 0.9618846668564298, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 26948 + }, + { + "epoch": 0.26949, + "grad_norm": 1.2857319011480317, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 26949 + }, + { + "epoch": 0.2695, + "grad_norm": 0.8060868370808525, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 26950 + }, + { + "epoch": 0.26951, + "grad_norm": 0.6744944940911806, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 26951 + }, + { + "epoch": 0.26952, + "grad_norm": 0.6311334445449999, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 26952 + }, + { + "epoch": 0.26953, + "grad_norm": 0.6138559541103576, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 26953 + }, + { + "epoch": 0.26954, + "grad_norm": 0.7207008896100006, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 26954 + }, + { + "epoch": 0.26955, + "grad_norm": 0.7960129467106797, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 26955 + }, + { + "epoch": 0.26956, + "grad_norm": 1.0306091938573996, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 26956 + }, + { + "epoch": 0.26957, + "grad_norm": 1.2399489206016752, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 26957 + }, + { + "epoch": 0.26958, + "grad_norm": 0.6633366064492253, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 26958 + }, + { + "epoch": 0.26959, + "grad_norm": 0.8914599232934599, + "learning_rate": 0.003, + "loss": 4.042, + "step": 26959 + }, + { + "epoch": 0.2696, + "grad_norm": 1.1988140196872528, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 26960 + }, + { + "epoch": 0.26961, + "grad_norm": 0.7619094777538519, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 26961 + }, + { + "epoch": 0.26962, + "grad_norm": 0.726379209536206, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 26962 + }, + { + "epoch": 0.26963, + "grad_norm": 0.7616517343158844, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 26963 + }, + { + "epoch": 0.26964, + "grad_norm": 0.8716842871721308, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 26964 + }, + { + "epoch": 0.26965, + "grad_norm": 0.8680209714315246, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 26965 + }, + { + "epoch": 0.26966, + "grad_norm": 0.8236976839929548, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 26966 + }, + { + "epoch": 0.26967, + "grad_norm": 0.8870763590881062, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 26967 + }, + { + "epoch": 0.26968, + "grad_norm": 0.8928321364216114, + "learning_rate": 0.003, + "loss": 4.067, + "step": 26968 + }, + { + "epoch": 0.26969, + "grad_norm": 1.107626601959848, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 26969 + }, + { + "epoch": 0.2697, + "grad_norm": 1.1838746563329594, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 26970 + }, + { + "epoch": 0.26971, + "grad_norm": 0.9514705323076532, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 26971 + }, + { + "epoch": 0.26972, + "grad_norm": 1.0454568052408633, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 26972 + }, + { + "epoch": 0.26973, + "grad_norm": 0.972162909278101, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 26973 + }, + { + "epoch": 0.26974, + "grad_norm": 0.8729060866361867, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 26974 + }, + { + "epoch": 0.26975, + "grad_norm": 0.7281073887983386, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 26975 + }, + { + "epoch": 0.26976, + "grad_norm": 0.722974268624212, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 26976 + }, + { + "epoch": 0.26977, + "grad_norm": 0.7088278974867546, + "learning_rate": 0.003, + "loss": 4.055, + "step": 26977 + }, + { + "epoch": 0.26978, + "grad_norm": 0.8154381957769654, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 26978 + }, + { + "epoch": 0.26979, + "grad_norm": 0.8019128991498526, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 26979 + }, + { + "epoch": 0.2698, + "grad_norm": 0.9055559157093138, + "learning_rate": 0.003, + "loss": 4.065, + "step": 26980 + }, + { + "epoch": 0.26981, + "grad_norm": 1.03434086868687, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 26981 + }, + { + "epoch": 0.26982, + "grad_norm": 1.078501936384365, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 26982 + }, + { + "epoch": 0.26983, + "grad_norm": 0.979355409542402, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 26983 + }, + { + "epoch": 0.26984, + "grad_norm": 1.1493175807929523, + "learning_rate": 0.003, + "loss": 4.064, + "step": 26984 + }, + { + "epoch": 0.26985, + "grad_norm": 1.009332427987294, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 26985 + }, + { + "epoch": 0.26986, + "grad_norm": 0.9922820487069157, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 26986 + }, + { + "epoch": 0.26987, + "grad_norm": 0.9817967901810096, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 26987 + }, + { + "epoch": 0.26988, + "grad_norm": 0.9495963392110601, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 26988 + }, + { + "epoch": 0.26989, + "grad_norm": 1.006950027944499, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 26989 + }, + { + "epoch": 0.2699, + "grad_norm": 0.9034623035139304, + "learning_rate": 0.003, + "loss": 4.034, + "step": 26990 + }, + { + "epoch": 0.26991, + "grad_norm": 0.903049053305745, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 26991 + }, + { + "epoch": 0.26992, + "grad_norm": 0.9834822346135722, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 26992 + }, + { + "epoch": 0.26993, + "grad_norm": 0.9646853458619823, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 26993 + }, + { + "epoch": 0.26994, + "grad_norm": 0.9130505587759331, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 26994 + }, + { + "epoch": 0.26995, + "grad_norm": 0.8599653568553773, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 26995 + }, + { + "epoch": 0.26996, + "grad_norm": 0.7586051747483734, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 26996 + }, + { + "epoch": 0.26997, + "grad_norm": 0.7881460368330638, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 26997 + }, + { + "epoch": 0.26998, + "grad_norm": 0.8835350598449638, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 26998 + }, + { + "epoch": 0.26999, + "grad_norm": 1.0002215328670894, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 26999 + }, + { + "epoch": 0.27, + "grad_norm": 1.2678592106742408, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 27000 + }, + { + "epoch": 0.27001, + "grad_norm": 0.7888807958359298, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 27001 + }, + { + "epoch": 0.27002, + "grad_norm": 0.8001773270002617, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 27002 + }, + { + "epoch": 0.27003, + "grad_norm": 0.7842424291198244, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 27003 + }, + { + "epoch": 0.27004, + "grad_norm": 0.8691221990773986, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 27004 + }, + { + "epoch": 0.27005, + "grad_norm": 0.8635074004196631, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 27005 + }, + { + "epoch": 0.27006, + "grad_norm": 0.9691884263732948, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 27006 + }, + { + "epoch": 0.27007, + "grad_norm": 1.1158654969790378, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 27007 + }, + { + "epoch": 0.27008, + "grad_norm": 0.8842773181896927, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 27008 + }, + { + "epoch": 0.27009, + "grad_norm": 0.8465386885128169, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 27009 + }, + { + "epoch": 0.2701, + "grad_norm": 0.898879990236479, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 27010 + }, + { + "epoch": 0.27011, + "grad_norm": 0.8585009785537153, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 27011 + }, + { + "epoch": 0.27012, + "grad_norm": 0.8656905129709597, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 27012 + }, + { + "epoch": 0.27013, + "grad_norm": 0.8727755915044394, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 27013 + }, + { + "epoch": 0.27014, + "grad_norm": 1.0102060053541189, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 27014 + }, + { + "epoch": 0.27015, + "grad_norm": 0.9879825045425192, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 27015 + }, + { + "epoch": 0.27016, + "grad_norm": 0.9505205842954543, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 27016 + }, + { + "epoch": 0.27017, + "grad_norm": 0.901354675971913, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 27017 + }, + { + "epoch": 0.27018, + "grad_norm": 0.8583664233017747, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 27018 + }, + { + "epoch": 0.27019, + "grad_norm": 0.6834413529053646, + "learning_rate": 0.003, + "loss": 4.046, + "step": 27019 + }, + { + "epoch": 0.2702, + "grad_norm": 0.6460403116168174, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27020 + }, + { + "epoch": 0.27021, + "grad_norm": 0.7822032326101864, + "learning_rate": 0.003, + "loss": 4.027, + "step": 27021 + }, + { + "epoch": 0.27022, + "grad_norm": 0.8600317593547536, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 27022 + }, + { + "epoch": 0.27023, + "grad_norm": 0.9050852518059926, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 27023 + }, + { + "epoch": 0.27024, + "grad_norm": 0.8660682573181737, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 27024 + }, + { + "epoch": 0.27025, + "grad_norm": 0.7668665843089587, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 27025 + }, + { + "epoch": 0.27026, + "grad_norm": 0.7728861933718528, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 27026 + }, + { + "epoch": 0.27027, + "grad_norm": 0.8220203474732518, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27027 + }, + { + "epoch": 0.27028, + "grad_norm": 0.9214167674101033, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 27028 + }, + { + "epoch": 0.27029, + "grad_norm": 0.9083241540942567, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27029 + }, + { + "epoch": 0.2703, + "grad_norm": 0.8370697180268717, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 27030 + }, + { + "epoch": 0.27031, + "grad_norm": 0.7671989562499478, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 27031 + }, + { + "epoch": 0.27032, + "grad_norm": 0.6628690520304105, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 27032 + }, + { + "epoch": 0.27033, + "grad_norm": 0.5982827661267192, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 27033 + }, + { + "epoch": 0.27034, + "grad_norm": 0.6243591715428094, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 27034 + }, + { + "epoch": 0.27035, + "grad_norm": 0.652055125391747, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 27035 + }, + { + "epoch": 0.27036, + "grad_norm": 0.7788987968893006, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 27036 + }, + { + "epoch": 0.27037, + "grad_norm": 0.8685538520831548, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 27037 + }, + { + "epoch": 0.27038, + "grad_norm": 1.2172404263937173, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 27038 + }, + { + "epoch": 0.27039, + "grad_norm": 0.995505119093717, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 27039 + }, + { + "epoch": 0.2704, + "grad_norm": 0.9476755162293994, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 27040 + }, + { + "epoch": 0.27041, + "grad_norm": 0.879207148536449, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 27041 + }, + { + "epoch": 0.27042, + "grad_norm": 0.7897383323367847, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 27042 + }, + { + "epoch": 0.27043, + "grad_norm": 0.7309952298592794, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 27043 + }, + { + "epoch": 0.27044, + "grad_norm": 0.8461307769230635, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27044 + }, + { + "epoch": 0.27045, + "grad_norm": 0.9665470438237945, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 27045 + }, + { + "epoch": 0.27046, + "grad_norm": 0.9448790184090723, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 27046 + }, + { + "epoch": 0.27047, + "grad_norm": 1.0562687809824105, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 27047 + }, + { + "epoch": 0.27048, + "grad_norm": 0.9311199705402079, + "learning_rate": 0.003, + "loss": 3.9959, + "step": 27048 + }, + { + "epoch": 0.27049, + "grad_norm": 0.7690817110686762, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 27049 + }, + { + "epoch": 0.2705, + "grad_norm": 0.6781421195807013, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 27050 + }, + { + "epoch": 0.27051, + "grad_norm": 0.6388413510725814, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 27051 + }, + { + "epoch": 0.27052, + "grad_norm": 0.6114523095010685, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27052 + }, + { + "epoch": 0.27053, + "grad_norm": 0.6098368230554445, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 27053 + }, + { + "epoch": 0.27054, + "grad_norm": 0.6168358878423139, + "learning_rate": 0.003, + "loss": 4.029, + "step": 27054 + }, + { + "epoch": 0.27055, + "grad_norm": 0.6570552168736712, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 27055 + }, + { + "epoch": 0.27056, + "grad_norm": 0.6483617128459038, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 27056 + }, + { + "epoch": 0.27057, + "grad_norm": 0.6756006612211097, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 27057 + }, + { + "epoch": 0.27058, + "grad_norm": 0.7435872660058324, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 27058 + }, + { + "epoch": 0.27059, + "grad_norm": 0.7840719312028, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 27059 + }, + { + "epoch": 0.2706, + "grad_norm": 0.9207617430543449, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27060 + }, + { + "epoch": 0.27061, + "grad_norm": 1.136363784082638, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 27061 + }, + { + "epoch": 0.27062, + "grad_norm": 0.8025122709638268, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 27062 + }, + { + "epoch": 0.27063, + "grad_norm": 0.9095490548245695, + "learning_rate": 0.003, + "loss": 4.009, + "step": 27063 + }, + { + "epoch": 0.27064, + "grad_norm": 1.2042498680299538, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 27064 + }, + { + "epoch": 0.27065, + "grad_norm": 1.0800332635163825, + "learning_rate": 0.003, + "loss": 4.008, + "step": 27065 + }, + { + "epoch": 0.27066, + "grad_norm": 0.8524130787896782, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 27066 + }, + { + "epoch": 0.27067, + "grad_norm": 0.865176317665912, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 27067 + }, + { + "epoch": 0.27068, + "grad_norm": 0.978232966115204, + "learning_rate": 0.003, + "loss": 4.062, + "step": 27068 + }, + { + "epoch": 0.27069, + "grad_norm": 1.0898764627062663, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 27069 + }, + { + "epoch": 0.2707, + "grad_norm": 0.8569622692460206, + "learning_rate": 0.003, + "loss": 4.0761, + "step": 27070 + }, + { + "epoch": 0.27071, + "grad_norm": 0.7894768430029436, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27071 + }, + { + "epoch": 0.27072, + "grad_norm": 0.9362915704003091, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 27072 + }, + { + "epoch": 0.27073, + "grad_norm": 1.1120966388838927, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 27073 + }, + { + "epoch": 0.27074, + "grad_norm": 0.9224413003465409, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 27074 + }, + { + "epoch": 0.27075, + "grad_norm": 0.939181567656111, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27075 + }, + { + "epoch": 0.27076, + "grad_norm": 0.851244091339666, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 27076 + }, + { + "epoch": 0.27077, + "grad_norm": 0.7567516484900256, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 27077 + }, + { + "epoch": 0.27078, + "grad_norm": 0.7463332781513192, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 27078 + }, + { + "epoch": 0.27079, + "grad_norm": 0.8148149789702981, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 27079 + }, + { + "epoch": 0.2708, + "grad_norm": 0.9938724862211209, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 27080 + }, + { + "epoch": 0.27081, + "grad_norm": 1.180361774920303, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 27081 + }, + { + "epoch": 0.27082, + "grad_norm": 1.1600390834129197, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 27082 + }, + { + "epoch": 0.27083, + "grad_norm": 0.9054031154576953, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 27083 + }, + { + "epoch": 0.27084, + "grad_norm": 0.7953268114392842, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 27084 + }, + { + "epoch": 0.27085, + "grad_norm": 0.7356133707957281, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 27085 + }, + { + "epoch": 0.27086, + "grad_norm": 0.8278736200139694, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 27086 + }, + { + "epoch": 0.27087, + "grad_norm": 1.0195372495536443, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 27087 + }, + { + "epoch": 0.27088, + "grad_norm": 1.0690949917266133, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 27088 + }, + { + "epoch": 0.27089, + "grad_norm": 0.8017696662672297, + "learning_rate": 0.003, + "loss": 4.072, + "step": 27089 + }, + { + "epoch": 0.2709, + "grad_norm": 0.7750292715380511, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 27090 + }, + { + "epoch": 0.27091, + "grad_norm": 0.8523388747862698, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 27091 + }, + { + "epoch": 0.27092, + "grad_norm": 0.9283613533948846, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 27092 + }, + { + "epoch": 0.27093, + "grad_norm": 1.1584453953320673, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 27093 + }, + { + "epoch": 0.27094, + "grad_norm": 0.9935158028023288, + "learning_rate": 0.003, + "loss": 4.035, + "step": 27094 + }, + { + "epoch": 0.27095, + "grad_norm": 0.9300280200864448, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 27095 + }, + { + "epoch": 0.27096, + "grad_norm": 0.7730289487522545, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 27096 + }, + { + "epoch": 0.27097, + "grad_norm": 0.7001803330413654, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 27097 + }, + { + "epoch": 0.27098, + "grad_norm": 0.731044577203183, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 27098 + }, + { + "epoch": 0.27099, + "grad_norm": 0.7050428195909509, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 27099 + }, + { + "epoch": 0.271, + "grad_norm": 0.6559706172796037, + "learning_rate": 0.003, + "loss": 4.024, + "step": 27100 + }, + { + "epoch": 0.27101, + "grad_norm": 0.7276162424392029, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 27101 + }, + { + "epoch": 0.27102, + "grad_norm": 0.8857351904951477, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 27102 + }, + { + "epoch": 0.27103, + "grad_norm": 1.070338748386965, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27103 + }, + { + "epoch": 0.27104, + "grad_norm": 1.0632822661633448, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27104 + }, + { + "epoch": 0.27105, + "grad_norm": 1.0528687843837061, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27105 + }, + { + "epoch": 0.27106, + "grad_norm": 0.8160591683191389, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 27106 + }, + { + "epoch": 0.27107, + "grad_norm": 0.702026301860167, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 27107 + }, + { + "epoch": 0.27108, + "grad_norm": 0.7221685866334959, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 27108 + }, + { + "epoch": 0.27109, + "grad_norm": 0.7873405347080112, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 27109 + }, + { + "epoch": 0.2711, + "grad_norm": 0.8800639736338831, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 27110 + }, + { + "epoch": 0.27111, + "grad_norm": 0.9885116721078356, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 27111 + }, + { + "epoch": 0.27112, + "grad_norm": 0.984865766560286, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 27112 + }, + { + "epoch": 0.27113, + "grad_norm": 0.9557134089537638, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 27113 + }, + { + "epoch": 0.27114, + "grad_norm": 0.8856643151610055, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 27114 + }, + { + "epoch": 0.27115, + "grad_norm": 0.8519398837568644, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 27115 + }, + { + "epoch": 0.27116, + "grad_norm": 0.9778039974639392, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27116 + }, + { + "epoch": 0.27117, + "grad_norm": 1.1613461738282431, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 27117 + }, + { + "epoch": 0.27118, + "grad_norm": 0.7943938679536395, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 27118 + }, + { + "epoch": 0.27119, + "grad_norm": 0.6455142657146546, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 27119 + }, + { + "epoch": 0.2712, + "grad_norm": 0.6892573230524919, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 27120 + }, + { + "epoch": 0.27121, + "grad_norm": 0.6890125558025136, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 27121 + }, + { + "epoch": 0.27122, + "grad_norm": 0.7205962020558068, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 27122 + }, + { + "epoch": 0.27123, + "grad_norm": 0.7472643585410962, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 27123 + }, + { + "epoch": 0.27124, + "grad_norm": 0.8336672277688877, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 27124 + }, + { + "epoch": 0.27125, + "grad_norm": 0.8822704681408061, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 27125 + }, + { + "epoch": 0.27126, + "grad_norm": 0.8932075841157868, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 27126 + }, + { + "epoch": 0.27127, + "grad_norm": 1.0276218172856988, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27127 + }, + { + "epoch": 0.27128, + "grad_norm": 1.0310460081609527, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27128 + }, + { + "epoch": 0.27129, + "grad_norm": 1.1186343399518717, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27129 + }, + { + "epoch": 0.2713, + "grad_norm": 1.0456089184684987, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 27130 + }, + { + "epoch": 0.27131, + "grad_norm": 1.0875096627611427, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27131 + }, + { + "epoch": 0.27132, + "grad_norm": 0.9541387858588575, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 27132 + }, + { + "epoch": 0.27133, + "grad_norm": 0.9677484385097217, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 27133 + }, + { + "epoch": 0.27134, + "grad_norm": 1.063895830873757, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 27134 + }, + { + "epoch": 0.27135, + "grad_norm": 0.9858596988610876, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 27135 + }, + { + "epoch": 0.27136, + "grad_norm": 0.9713079182063803, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 27136 + }, + { + "epoch": 0.27137, + "grad_norm": 0.9061910493523073, + "learning_rate": 0.003, + "loss": 4.055, + "step": 27137 + }, + { + "epoch": 0.27138, + "grad_norm": 0.861177871664807, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 27138 + }, + { + "epoch": 0.27139, + "grad_norm": 0.8723914544026439, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 27139 + }, + { + "epoch": 0.2714, + "grad_norm": 0.85586348583267, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 27140 + }, + { + "epoch": 0.27141, + "grad_norm": 0.8224170188597626, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 27141 + }, + { + "epoch": 0.27142, + "grad_norm": 0.7654828284087059, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 27142 + }, + { + "epoch": 0.27143, + "grad_norm": 0.7853944433263345, + "learning_rate": 0.003, + "loss": 4.039, + "step": 27143 + }, + { + "epoch": 0.27144, + "grad_norm": 0.762823941353146, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 27144 + }, + { + "epoch": 0.27145, + "grad_norm": 0.7591514787711713, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 27145 + }, + { + "epoch": 0.27146, + "grad_norm": 0.6642447035899998, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 27146 + }, + { + "epoch": 0.27147, + "grad_norm": 0.5599208879429333, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 27147 + }, + { + "epoch": 0.27148, + "grad_norm": 0.557031027162787, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 27148 + }, + { + "epoch": 0.27149, + "grad_norm": 0.6624987492243699, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 27149 + }, + { + "epoch": 0.2715, + "grad_norm": 0.8205766165204013, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 27150 + }, + { + "epoch": 0.27151, + "grad_norm": 1.0809207998766002, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 27151 + }, + { + "epoch": 0.27152, + "grad_norm": 1.1438267998468274, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 27152 + }, + { + "epoch": 0.27153, + "grad_norm": 0.755220917245375, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 27153 + }, + { + "epoch": 0.27154, + "grad_norm": 0.5773218145613392, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 27154 + }, + { + "epoch": 0.27155, + "grad_norm": 0.5833234859079485, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 27155 + }, + { + "epoch": 0.27156, + "grad_norm": 0.5486132756215911, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 27156 + }, + { + "epoch": 0.27157, + "grad_norm": 0.5170053161382886, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 27157 + }, + { + "epoch": 0.27158, + "grad_norm": 0.5594581439251992, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 27158 + }, + { + "epoch": 0.27159, + "grad_norm": 0.6156913287480137, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 27159 + }, + { + "epoch": 0.2716, + "grad_norm": 0.6756345566314433, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 27160 + }, + { + "epoch": 0.27161, + "grad_norm": 0.6587090753572383, + "learning_rate": 0.003, + "loss": 4.012, + "step": 27161 + }, + { + "epoch": 0.27162, + "grad_norm": 0.6783537475995175, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 27162 + }, + { + "epoch": 0.27163, + "grad_norm": 0.6963303488291372, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 27163 + }, + { + "epoch": 0.27164, + "grad_norm": 0.6358036143558932, + "learning_rate": 0.003, + "loss": 4.03, + "step": 27164 + }, + { + "epoch": 0.27165, + "grad_norm": 0.6808144367447524, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 27165 + }, + { + "epoch": 0.27166, + "grad_norm": 0.7516887342657324, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 27166 + }, + { + "epoch": 0.27167, + "grad_norm": 0.9104422701216477, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 27167 + }, + { + "epoch": 0.27168, + "grad_norm": 1.1297040554046238, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 27168 + }, + { + "epoch": 0.27169, + "grad_norm": 1.1366329671051423, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27169 + }, + { + "epoch": 0.2717, + "grad_norm": 1.0846159002262368, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 27170 + }, + { + "epoch": 0.27171, + "grad_norm": 0.920353978863042, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 27171 + }, + { + "epoch": 0.27172, + "grad_norm": 0.867191921196321, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27172 + }, + { + "epoch": 0.27173, + "grad_norm": 0.9742443538714569, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 27173 + }, + { + "epoch": 0.27174, + "grad_norm": 1.2222349070009826, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 27174 + }, + { + "epoch": 0.27175, + "grad_norm": 0.9470124587449692, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 27175 + }, + { + "epoch": 0.27176, + "grad_norm": 0.9986427860980546, + "learning_rate": 0.003, + "loss": 3.9932, + "step": 27176 + }, + { + "epoch": 0.27177, + "grad_norm": 1.0061134088639327, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 27177 + }, + { + "epoch": 0.27178, + "grad_norm": 1.068778278612357, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 27178 + }, + { + "epoch": 0.27179, + "grad_norm": 0.8775087932883221, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 27179 + }, + { + "epoch": 0.2718, + "grad_norm": 0.9305609233138967, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27180 + }, + { + "epoch": 0.27181, + "grad_norm": 0.9861934838987505, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 27181 + }, + { + "epoch": 0.27182, + "grad_norm": 1.0686209603007657, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 27182 + }, + { + "epoch": 0.27183, + "grad_norm": 1.1943199834725478, + "learning_rate": 0.003, + "loss": 4.0862, + "step": 27183 + }, + { + "epoch": 0.27184, + "grad_norm": 0.9376355587884407, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 27184 + }, + { + "epoch": 0.27185, + "grad_norm": 0.9271383828483835, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 27185 + }, + { + "epoch": 0.27186, + "grad_norm": 1.130912385689637, + "learning_rate": 0.003, + "loss": 4.0958, + "step": 27186 + }, + { + "epoch": 0.27187, + "grad_norm": 1.2605807924324426, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 27187 + }, + { + "epoch": 0.27188, + "grad_norm": 0.8822055764532039, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 27188 + }, + { + "epoch": 0.27189, + "grad_norm": 0.7044681395386407, + "learning_rate": 0.003, + "loss": 4.051, + "step": 27189 + }, + { + "epoch": 0.2719, + "grad_norm": 0.7237612405370422, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27190 + }, + { + "epoch": 0.27191, + "grad_norm": 0.7319327968793413, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 27191 + }, + { + "epoch": 0.27192, + "grad_norm": 0.818410428876247, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 27192 + }, + { + "epoch": 0.27193, + "grad_norm": 0.8660669763886822, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 27193 + }, + { + "epoch": 0.27194, + "grad_norm": 0.7490120690025587, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 27194 + }, + { + "epoch": 0.27195, + "grad_norm": 0.6820603575835106, + "learning_rate": 0.003, + "loss": 4.037, + "step": 27195 + }, + { + "epoch": 0.27196, + "grad_norm": 0.861805402471239, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 27196 + }, + { + "epoch": 0.27197, + "grad_norm": 1.2378956694820602, + "learning_rate": 0.003, + "loss": 4.0849, + "step": 27197 + }, + { + "epoch": 0.27198, + "grad_norm": 0.8402970718769878, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 27198 + }, + { + "epoch": 0.27199, + "grad_norm": 0.7273603596835698, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 27199 + }, + { + "epoch": 0.272, + "grad_norm": 0.7624880957382534, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 27200 + }, + { + "epoch": 0.27201, + "grad_norm": 0.7794119083905486, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 27201 + }, + { + "epoch": 0.27202, + "grad_norm": 0.758622059909281, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 27202 + }, + { + "epoch": 0.27203, + "grad_norm": 0.7356742586105665, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 27203 + }, + { + "epoch": 0.27204, + "grad_norm": 0.7246696461181776, + "learning_rate": 0.003, + "loss": 4.036, + "step": 27204 + }, + { + "epoch": 0.27205, + "grad_norm": 0.5788274821704399, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 27205 + }, + { + "epoch": 0.27206, + "grad_norm": 0.6154187907496541, + "learning_rate": 0.003, + "loss": 4.039, + "step": 27206 + }, + { + "epoch": 0.27207, + "grad_norm": 0.6407113391895068, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 27207 + }, + { + "epoch": 0.27208, + "grad_norm": 0.6930326619486366, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 27208 + }, + { + "epoch": 0.27209, + "grad_norm": 0.7517166962841816, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 27209 + }, + { + "epoch": 0.2721, + "grad_norm": 0.9916192344650318, + "learning_rate": 0.003, + "loss": 4.03, + "step": 27210 + }, + { + "epoch": 0.27211, + "grad_norm": 1.3079968052598168, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27211 + }, + { + "epoch": 0.27212, + "grad_norm": 0.9239598896598635, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 27212 + }, + { + "epoch": 0.27213, + "grad_norm": 0.8342140539615156, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 27213 + }, + { + "epoch": 0.27214, + "grad_norm": 0.7546189689001775, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 27214 + }, + { + "epoch": 0.27215, + "grad_norm": 0.8955815721229361, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27215 + }, + { + "epoch": 0.27216, + "grad_norm": 1.0597620520777873, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 27216 + }, + { + "epoch": 0.27217, + "grad_norm": 0.9424885710364823, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27217 + }, + { + "epoch": 0.27218, + "grad_norm": 0.8177987315497143, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 27218 + }, + { + "epoch": 0.27219, + "grad_norm": 0.7475638949977249, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 27219 + }, + { + "epoch": 0.2722, + "grad_norm": 0.8014783029565177, + "learning_rate": 0.003, + "loss": 4.029, + "step": 27220 + }, + { + "epoch": 0.27221, + "grad_norm": 0.8215270742283234, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 27221 + }, + { + "epoch": 0.27222, + "grad_norm": 0.8347503104742771, + "learning_rate": 0.003, + "loss": 4.028, + "step": 27222 + }, + { + "epoch": 0.27223, + "grad_norm": 0.8350513510638556, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 27223 + }, + { + "epoch": 0.27224, + "grad_norm": 0.8477893458675214, + "learning_rate": 0.003, + "loss": 4.052, + "step": 27224 + }, + { + "epoch": 0.27225, + "grad_norm": 0.8355419986007538, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 27225 + }, + { + "epoch": 0.27226, + "grad_norm": 0.8938793453500699, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27226 + }, + { + "epoch": 0.27227, + "grad_norm": 0.9622692638370453, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 27227 + }, + { + "epoch": 0.27228, + "grad_norm": 1.0722030414873116, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 27228 + }, + { + "epoch": 0.27229, + "grad_norm": 1.070775154175559, + "learning_rate": 0.003, + "loss": 4.064, + "step": 27229 + }, + { + "epoch": 0.2723, + "grad_norm": 0.9539822459429551, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 27230 + }, + { + "epoch": 0.27231, + "grad_norm": 0.914327658204957, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27231 + }, + { + "epoch": 0.27232, + "grad_norm": 0.8043332402395875, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 27232 + }, + { + "epoch": 0.27233, + "grad_norm": 0.8544020222686842, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27233 + }, + { + "epoch": 0.27234, + "grad_norm": 0.8237668204075224, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 27234 + }, + { + "epoch": 0.27235, + "grad_norm": 1.0479000289143938, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 27235 + }, + { + "epoch": 0.27236, + "grad_norm": 1.3968133038216284, + "learning_rate": 0.003, + "loss": 4.066, + "step": 27236 + }, + { + "epoch": 0.27237, + "grad_norm": 0.7369511324294364, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 27237 + }, + { + "epoch": 0.27238, + "grad_norm": 0.632308940995445, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 27238 + }, + { + "epoch": 0.27239, + "grad_norm": 0.7400695892031343, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 27239 + }, + { + "epoch": 0.2724, + "grad_norm": 0.7754137236259059, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 27240 + }, + { + "epoch": 0.27241, + "grad_norm": 0.8545998274122126, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27241 + }, + { + "epoch": 0.27242, + "grad_norm": 0.9077315981367386, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 27242 + }, + { + "epoch": 0.27243, + "grad_norm": 0.9886318804741363, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 27243 + }, + { + "epoch": 0.27244, + "grad_norm": 0.9752201253308417, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27244 + }, + { + "epoch": 0.27245, + "grad_norm": 1.0083065574250503, + "learning_rate": 0.003, + "loss": 4.008, + "step": 27245 + }, + { + "epoch": 0.27246, + "grad_norm": 1.086315963474152, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27246 + }, + { + "epoch": 0.27247, + "grad_norm": 0.9159454827544614, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27247 + }, + { + "epoch": 0.27248, + "grad_norm": 1.0142733265473942, + "learning_rate": 0.003, + "loss": 4.045, + "step": 27248 + }, + { + "epoch": 0.27249, + "grad_norm": 1.0016145143010495, + "learning_rate": 0.003, + "loss": 4.063, + "step": 27249 + }, + { + "epoch": 0.2725, + "grad_norm": 0.869441591027775, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 27250 + }, + { + "epoch": 0.27251, + "grad_norm": 0.8866232941368838, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 27251 + }, + { + "epoch": 0.27252, + "grad_norm": 0.9350087517829194, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 27252 + }, + { + "epoch": 0.27253, + "grad_norm": 1.1177539187283532, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 27253 + }, + { + "epoch": 0.27254, + "grad_norm": 0.8880282367857625, + "learning_rate": 0.003, + "loss": 4.03, + "step": 27254 + }, + { + "epoch": 0.27255, + "grad_norm": 0.9411424436684704, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 27255 + }, + { + "epoch": 0.27256, + "grad_norm": 0.7783286849407326, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 27256 + }, + { + "epoch": 0.27257, + "grad_norm": 0.8126324789590152, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 27257 + }, + { + "epoch": 0.27258, + "grad_norm": 0.8290273613677162, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 27258 + }, + { + "epoch": 0.27259, + "grad_norm": 0.9684653998788338, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 27259 + }, + { + "epoch": 0.2726, + "grad_norm": 0.9244270178857742, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 27260 + }, + { + "epoch": 0.27261, + "grad_norm": 0.9217923968216553, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 27261 + }, + { + "epoch": 0.27262, + "grad_norm": 0.9587059881403286, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 27262 + }, + { + "epoch": 0.27263, + "grad_norm": 1.0331796733005223, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 27263 + }, + { + "epoch": 0.27264, + "grad_norm": 1.0092460868206161, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 27264 + }, + { + "epoch": 0.27265, + "grad_norm": 0.880725136941026, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27265 + }, + { + "epoch": 0.27266, + "grad_norm": 0.8665104284707065, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 27266 + }, + { + "epoch": 0.27267, + "grad_norm": 0.9551963454080732, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 27267 + }, + { + "epoch": 0.27268, + "grad_norm": 0.8872078032651591, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 27268 + }, + { + "epoch": 0.27269, + "grad_norm": 0.9692175837862683, + "learning_rate": 0.003, + "loss": 4.0952, + "step": 27269 + }, + { + "epoch": 0.2727, + "grad_norm": 1.074150428979464, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 27270 + }, + { + "epoch": 0.27271, + "grad_norm": 0.9143091916217028, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 27271 + }, + { + "epoch": 0.27272, + "grad_norm": 0.8822137906872127, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 27272 + }, + { + "epoch": 0.27273, + "grad_norm": 0.9499452123431154, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 27273 + }, + { + "epoch": 0.27274, + "grad_norm": 1.0456034779341927, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 27274 + }, + { + "epoch": 0.27275, + "grad_norm": 0.9729708578284676, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 27275 + }, + { + "epoch": 0.27276, + "grad_norm": 1.0633725646122136, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 27276 + }, + { + "epoch": 0.27277, + "grad_norm": 0.9537859806366207, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 27277 + }, + { + "epoch": 0.27278, + "grad_norm": 0.8493531283932716, + "learning_rate": 0.003, + "loss": 4.026, + "step": 27278 + }, + { + "epoch": 0.27279, + "grad_norm": 0.8276759932673456, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 27279 + }, + { + "epoch": 0.2728, + "grad_norm": 0.8317087605367821, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 27280 + }, + { + "epoch": 0.27281, + "grad_norm": 0.7715816180883778, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 27281 + }, + { + "epoch": 0.27282, + "grad_norm": 0.7356442549104868, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 27282 + }, + { + "epoch": 0.27283, + "grad_norm": 0.6914130122683392, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 27283 + }, + { + "epoch": 0.27284, + "grad_norm": 0.5610307407510313, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 27284 + }, + { + "epoch": 0.27285, + "grad_norm": 0.5490984524416351, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 27285 + }, + { + "epoch": 0.27286, + "grad_norm": 0.5079922041638507, + "learning_rate": 0.003, + "loss": 4.038, + "step": 27286 + }, + { + "epoch": 0.27287, + "grad_norm": 0.48904360437833155, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 27287 + }, + { + "epoch": 0.27288, + "grad_norm": 0.5240502782730533, + "learning_rate": 0.003, + "loss": 4.053, + "step": 27288 + }, + { + "epoch": 0.27289, + "grad_norm": 0.5676918185558094, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 27289 + }, + { + "epoch": 0.2729, + "grad_norm": 0.6649761945634118, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 27290 + }, + { + "epoch": 0.27291, + "grad_norm": 0.7982231627603236, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 27291 + }, + { + "epoch": 0.27292, + "grad_norm": 1.0433353132430405, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 27292 + }, + { + "epoch": 0.27293, + "grad_norm": 1.23411532504232, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 27293 + }, + { + "epoch": 0.27294, + "grad_norm": 0.7377429341283872, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 27294 + }, + { + "epoch": 0.27295, + "grad_norm": 0.7244779781529156, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 27295 + }, + { + "epoch": 0.27296, + "grad_norm": 0.7585274987002648, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27296 + }, + { + "epoch": 0.27297, + "grad_norm": 0.657764354389836, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 27297 + }, + { + "epoch": 0.27298, + "grad_norm": 0.7360801528413442, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 27298 + }, + { + "epoch": 0.27299, + "grad_norm": 0.7987243617628677, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27299 + }, + { + "epoch": 0.273, + "grad_norm": 0.727807379384828, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 27300 + }, + { + "epoch": 0.27301, + "grad_norm": 0.7166070650430396, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 27301 + }, + { + "epoch": 0.27302, + "grad_norm": 0.7475273477135493, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 27302 + }, + { + "epoch": 0.27303, + "grad_norm": 0.7599090479520422, + "learning_rate": 0.003, + "loss": 4.0023, + "step": 27303 + }, + { + "epoch": 0.27304, + "grad_norm": 0.9256258141417724, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 27304 + }, + { + "epoch": 0.27305, + "grad_norm": 1.1741459619162797, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 27305 + }, + { + "epoch": 0.27306, + "grad_norm": 1.0576870683499913, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27306 + }, + { + "epoch": 0.27307, + "grad_norm": 1.0145497847098446, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 27307 + }, + { + "epoch": 0.27308, + "grad_norm": 1.0363750525419229, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 27308 + }, + { + "epoch": 0.27309, + "grad_norm": 0.9267711383795727, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 27309 + }, + { + "epoch": 0.2731, + "grad_norm": 0.8513812949719476, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 27310 + }, + { + "epoch": 0.27311, + "grad_norm": 0.8659207329557144, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 27311 + }, + { + "epoch": 0.27312, + "grad_norm": 0.9125712909286903, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 27312 + }, + { + "epoch": 0.27313, + "grad_norm": 0.9606763461215581, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 27313 + }, + { + "epoch": 0.27314, + "grad_norm": 1.0764123036690407, + "learning_rate": 0.003, + "loss": 4.052, + "step": 27314 + }, + { + "epoch": 0.27315, + "grad_norm": 0.988048159189695, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 27315 + }, + { + "epoch": 0.27316, + "grad_norm": 0.9708720998377749, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 27316 + }, + { + "epoch": 0.27317, + "grad_norm": 1.0867984117775267, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 27317 + }, + { + "epoch": 0.27318, + "grad_norm": 1.126922534519531, + "learning_rate": 0.003, + "loss": 4.062, + "step": 27318 + }, + { + "epoch": 0.27319, + "grad_norm": 0.7750799477371327, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 27319 + }, + { + "epoch": 0.2732, + "grad_norm": 0.7724907870968374, + "learning_rate": 0.003, + "loss": 4.041, + "step": 27320 + }, + { + "epoch": 0.27321, + "grad_norm": 0.7046087914483348, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 27321 + }, + { + "epoch": 0.27322, + "grad_norm": 0.6852655596043664, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27322 + }, + { + "epoch": 0.27323, + "grad_norm": 0.8033757352943267, + "learning_rate": 0.003, + "loss": 4.0874, + "step": 27323 + }, + { + "epoch": 0.27324, + "grad_norm": 1.109236035538625, + "learning_rate": 0.003, + "loss": 4.044, + "step": 27324 + }, + { + "epoch": 0.27325, + "grad_norm": 1.0395512501729445, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27325 + }, + { + "epoch": 0.27326, + "grad_norm": 0.9013995133173852, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 27326 + }, + { + "epoch": 0.27327, + "grad_norm": 0.844495590475074, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 27327 + }, + { + "epoch": 0.27328, + "grad_norm": 0.7659898502913669, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 27328 + }, + { + "epoch": 0.27329, + "grad_norm": 0.6924707777748702, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 27329 + }, + { + "epoch": 0.2733, + "grad_norm": 0.6234870128310842, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27330 + }, + { + "epoch": 0.27331, + "grad_norm": 0.5935626661317915, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27331 + }, + { + "epoch": 0.27332, + "grad_norm": 0.5866188973320777, + "learning_rate": 0.003, + "loss": 4.04, + "step": 27332 + }, + { + "epoch": 0.27333, + "grad_norm": 0.6328134470851577, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 27333 + }, + { + "epoch": 0.27334, + "grad_norm": 0.6504591690915844, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 27334 + }, + { + "epoch": 0.27335, + "grad_norm": 0.6131369011295167, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 27335 + }, + { + "epoch": 0.27336, + "grad_norm": 0.6134938414995452, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 27336 + }, + { + "epoch": 0.27337, + "grad_norm": 0.6869343387012089, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 27337 + }, + { + "epoch": 0.27338, + "grad_norm": 0.7390715135592301, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 27338 + }, + { + "epoch": 0.27339, + "grad_norm": 0.854398927820215, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 27339 + }, + { + "epoch": 0.2734, + "grad_norm": 1.162037680962209, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 27340 + }, + { + "epoch": 0.27341, + "grad_norm": 1.0222914319522627, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 27341 + }, + { + "epoch": 0.27342, + "grad_norm": 1.1520932743656174, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 27342 + }, + { + "epoch": 0.27343, + "grad_norm": 0.928014530568992, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 27343 + }, + { + "epoch": 0.27344, + "grad_norm": 0.8747890143787175, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 27344 + }, + { + "epoch": 0.27345, + "grad_norm": 0.932464368254461, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 27345 + }, + { + "epoch": 0.27346, + "grad_norm": 0.8375401549403108, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 27346 + }, + { + "epoch": 0.27347, + "grad_norm": 0.8299257008350109, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 27347 + }, + { + "epoch": 0.27348, + "grad_norm": 0.8769839374378462, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 27348 + }, + { + "epoch": 0.27349, + "grad_norm": 0.9116260944959204, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 27349 + }, + { + "epoch": 0.2735, + "grad_norm": 0.8473018241071563, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 27350 + }, + { + "epoch": 0.27351, + "grad_norm": 0.8601695102948744, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 27351 + }, + { + "epoch": 0.27352, + "grad_norm": 0.8872200277474028, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 27352 + }, + { + "epoch": 0.27353, + "grad_norm": 0.9682856190451967, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27353 + }, + { + "epoch": 0.27354, + "grad_norm": 1.2001561094784348, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 27354 + }, + { + "epoch": 0.27355, + "grad_norm": 0.9040965369455419, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 27355 + }, + { + "epoch": 0.27356, + "grad_norm": 0.8853159728029145, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 27356 + }, + { + "epoch": 0.27357, + "grad_norm": 0.8278343288338695, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 27357 + }, + { + "epoch": 0.27358, + "grad_norm": 0.7412134315406782, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 27358 + }, + { + "epoch": 0.27359, + "grad_norm": 0.795874685838528, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 27359 + }, + { + "epoch": 0.2736, + "grad_norm": 0.8578426132069809, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 27360 + }, + { + "epoch": 0.27361, + "grad_norm": 0.8974415314256757, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 27361 + }, + { + "epoch": 0.27362, + "grad_norm": 1.0447831596074633, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 27362 + }, + { + "epoch": 0.27363, + "grad_norm": 1.0749487593622333, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 27363 + }, + { + "epoch": 0.27364, + "grad_norm": 1.0393737480988867, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 27364 + }, + { + "epoch": 0.27365, + "grad_norm": 1.0361668427685058, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 27365 + }, + { + "epoch": 0.27366, + "grad_norm": 1.0467255116067917, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 27366 + }, + { + "epoch": 0.27367, + "grad_norm": 0.9317374201583356, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 27367 + }, + { + "epoch": 0.27368, + "grad_norm": 0.8158685144532749, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27368 + }, + { + "epoch": 0.27369, + "grad_norm": 0.7442092980490032, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 27369 + }, + { + "epoch": 0.2737, + "grad_norm": 0.8005797589704103, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 27370 + }, + { + "epoch": 0.27371, + "grad_norm": 0.8646964937175462, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 27371 + }, + { + "epoch": 0.27372, + "grad_norm": 0.8805736504632035, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 27372 + }, + { + "epoch": 0.27373, + "grad_norm": 0.933446894596631, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 27373 + }, + { + "epoch": 0.27374, + "grad_norm": 0.9594075699102096, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 27374 + }, + { + "epoch": 0.27375, + "grad_norm": 1.086376487679832, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 27375 + }, + { + "epoch": 0.27376, + "grad_norm": 0.8352728282675678, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 27376 + }, + { + "epoch": 0.27377, + "grad_norm": 0.729801640697119, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27377 + }, + { + "epoch": 0.27378, + "grad_norm": 0.7615618850242499, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27378 + }, + { + "epoch": 0.27379, + "grad_norm": 0.7441213528145411, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 27379 + }, + { + "epoch": 0.2738, + "grad_norm": 0.6394841802367003, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 27380 + }, + { + "epoch": 0.27381, + "grad_norm": 0.688430857103259, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 27381 + }, + { + "epoch": 0.27382, + "grad_norm": 0.6029620243987147, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 27382 + }, + { + "epoch": 0.27383, + "grad_norm": 0.6921292474115627, + "learning_rate": 0.003, + "loss": 4.039, + "step": 27383 + }, + { + "epoch": 0.27384, + "grad_norm": 0.7716864071315577, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27384 + }, + { + "epoch": 0.27385, + "grad_norm": 0.8137676022361442, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 27385 + }, + { + "epoch": 0.27386, + "grad_norm": 0.7663945832314726, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 27386 + }, + { + "epoch": 0.27387, + "grad_norm": 0.8016130229598967, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 27387 + }, + { + "epoch": 0.27388, + "grad_norm": 0.837626049367869, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 27388 + }, + { + "epoch": 0.27389, + "grad_norm": 0.9224566397645034, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 27389 + }, + { + "epoch": 0.2739, + "grad_norm": 1.089596290370039, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 27390 + }, + { + "epoch": 0.27391, + "grad_norm": 1.1621154766160475, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27391 + }, + { + "epoch": 0.27392, + "grad_norm": 0.9731353592843491, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 27392 + }, + { + "epoch": 0.27393, + "grad_norm": 0.9779590561639354, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 27393 + }, + { + "epoch": 0.27394, + "grad_norm": 1.0914497742748874, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 27394 + }, + { + "epoch": 0.27395, + "grad_norm": 0.9179233980120709, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 27395 + }, + { + "epoch": 0.27396, + "grad_norm": 0.9095106732159272, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 27396 + }, + { + "epoch": 0.27397, + "grad_norm": 1.0888758215089545, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 27397 + }, + { + "epoch": 0.27398, + "grad_norm": 1.099860403811806, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 27398 + }, + { + "epoch": 0.27399, + "grad_norm": 0.7409531229994823, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27399 + }, + { + "epoch": 0.274, + "grad_norm": 0.6927064042490775, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27400 + }, + { + "epoch": 0.27401, + "grad_norm": 0.6907288742401734, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 27401 + }, + { + "epoch": 0.27402, + "grad_norm": 0.7555666391408589, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 27402 + }, + { + "epoch": 0.27403, + "grad_norm": 0.9091445453806137, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 27403 + }, + { + "epoch": 0.27404, + "grad_norm": 1.1411906713898534, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 27404 + }, + { + "epoch": 0.27405, + "grad_norm": 0.7968811950323658, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 27405 + }, + { + "epoch": 0.27406, + "grad_norm": 0.7851245853226373, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 27406 + }, + { + "epoch": 0.27407, + "grad_norm": 0.8569170883906466, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 27407 + }, + { + "epoch": 0.27408, + "grad_norm": 0.9052556250636563, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27408 + }, + { + "epoch": 0.27409, + "grad_norm": 0.8330541597480289, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 27409 + }, + { + "epoch": 0.2741, + "grad_norm": 0.732094492759006, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 27410 + }, + { + "epoch": 0.27411, + "grad_norm": 0.742547772747988, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 27411 + }, + { + "epoch": 0.27412, + "grad_norm": 0.7378542574442841, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 27412 + }, + { + "epoch": 0.27413, + "grad_norm": 0.7851284151614838, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 27413 + }, + { + "epoch": 0.27414, + "grad_norm": 1.0304719123773676, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 27414 + }, + { + "epoch": 0.27415, + "grad_norm": 1.2207677777607162, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 27415 + }, + { + "epoch": 0.27416, + "grad_norm": 0.8505421595909284, + "learning_rate": 0.003, + "loss": 4.065, + "step": 27416 + }, + { + "epoch": 0.27417, + "grad_norm": 0.8363953408275911, + "learning_rate": 0.003, + "loss": 4.029, + "step": 27417 + }, + { + "epoch": 0.27418, + "grad_norm": 0.905373949766154, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 27418 + }, + { + "epoch": 0.27419, + "grad_norm": 0.896003695432936, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 27419 + }, + { + "epoch": 0.2742, + "grad_norm": 0.8067627195702282, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 27420 + }, + { + "epoch": 0.27421, + "grad_norm": 0.7554626465469829, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 27421 + }, + { + "epoch": 0.27422, + "grad_norm": 0.7335073313249293, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 27422 + }, + { + "epoch": 0.27423, + "grad_norm": 0.7808773093527447, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27423 + }, + { + "epoch": 0.27424, + "grad_norm": 0.8302458614855671, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 27424 + }, + { + "epoch": 0.27425, + "grad_norm": 1.0029314810599526, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 27425 + }, + { + "epoch": 0.27426, + "grad_norm": 1.3160540850285514, + "learning_rate": 0.003, + "loss": 4.055, + "step": 27426 + }, + { + "epoch": 0.27427, + "grad_norm": 0.8309379346634932, + "learning_rate": 0.003, + "loss": 4.072, + "step": 27427 + }, + { + "epoch": 0.27428, + "grad_norm": 0.813704016976871, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 27428 + }, + { + "epoch": 0.27429, + "grad_norm": 0.8719536254560902, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 27429 + }, + { + "epoch": 0.2743, + "grad_norm": 0.9626717658333871, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 27430 + }, + { + "epoch": 0.27431, + "grad_norm": 1.188537845017463, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 27431 + }, + { + "epoch": 0.27432, + "grad_norm": 0.833775180003765, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 27432 + }, + { + "epoch": 0.27433, + "grad_norm": 0.7038873946107546, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 27433 + }, + { + "epoch": 0.27434, + "grad_norm": 0.6979402822788379, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 27434 + }, + { + "epoch": 0.27435, + "grad_norm": 0.7241907213582798, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 27435 + }, + { + "epoch": 0.27436, + "grad_norm": 0.7733991153300931, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 27436 + }, + { + "epoch": 0.27437, + "grad_norm": 0.7992664317378999, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 27437 + }, + { + "epoch": 0.27438, + "grad_norm": 0.8387029662768279, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27438 + }, + { + "epoch": 0.27439, + "grad_norm": 0.8699711568884091, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 27439 + }, + { + "epoch": 0.2744, + "grad_norm": 0.7649466351581194, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 27440 + }, + { + "epoch": 0.27441, + "grad_norm": 0.7269121329275984, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 27441 + }, + { + "epoch": 0.27442, + "grad_norm": 0.8942315293940386, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27442 + }, + { + "epoch": 0.27443, + "grad_norm": 1.0521431763550897, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 27443 + }, + { + "epoch": 0.27444, + "grad_norm": 1.122233410381414, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 27444 + }, + { + "epoch": 0.27445, + "grad_norm": 0.8866059190875801, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 27445 + }, + { + "epoch": 0.27446, + "grad_norm": 0.8689669941384291, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 27446 + }, + { + "epoch": 0.27447, + "grad_norm": 0.7137318570806475, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 27447 + }, + { + "epoch": 0.27448, + "grad_norm": 0.7650391889159205, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 27448 + }, + { + "epoch": 0.27449, + "grad_norm": 0.7786522584353585, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 27449 + }, + { + "epoch": 0.2745, + "grad_norm": 0.9099556178809449, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 27450 + }, + { + "epoch": 0.27451, + "grad_norm": 1.265239383200757, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 27451 + }, + { + "epoch": 0.27452, + "grad_norm": 0.9085210312891671, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 27452 + }, + { + "epoch": 0.27453, + "grad_norm": 0.8623858556357957, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 27453 + }, + { + "epoch": 0.27454, + "grad_norm": 0.798148304670742, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 27454 + }, + { + "epoch": 0.27455, + "grad_norm": 0.7619294803156953, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 27455 + }, + { + "epoch": 0.27456, + "grad_norm": 0.7288712740810295, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 27456 + }, + { + "epoch": 0.27457, + "grad_norm": 0.7439268947037079, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27457 + }, + { + "epoch": 0.27458, + "grad_norm": 0.7473738848777058, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 27458 + }, + { + "epoch": 0.27459, + "grad_norm": 0.8173784568876616, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 27459 + }, + { + "epoch": 0.2746, + "grad_norm": 0.9099098827005555, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 27460 + }, + { + "epoch": 0.27461, + "grad_norm": 0.9607318446307068, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 27461 + }, + { + "epoch": 0.27462, + "grad_norm": 1.007414330472367, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 27462 + }, + { + "epoch": 0.27463, + "grad_norm": 0.9626430689450209, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 27463 + }, + { + "epoch": 0.27464, + "grad_norm": 1.1510292735304786, + "learning_rate": 0.003, + "loss": 4.046, + "step": 27464 + }, + { + "epoch": 0.27465, + "grad_norm": 1.0595244805374822, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 27465 + }, + { + "epoch": 0.27466, + "grad_norm": 1.1421760151141422, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 27466 + }, + { + "epoch": 0.27467, + "grad_norm": 0.8810728019106769, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 27467 + }, + { + "epoch": 0.27468, + "grad_norm": 0.8464373680201983, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 27468 + }, + { + "epoch": 0.27469, + "grad_norm": 0.9057064217496835, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27469 + }, + { + "epoch": 0.2747, + "grad_norm": 1.017479383988474, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 27470 + }, + { + "epoch": 0.27471, + "grad_norm": 1.0301026811185738, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 27471 + }, + { + "epoch": 0.27472, + "grad_norm": 1.0500518557284348, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 27472 + }, + { + "epoch": 0.27473, + "grad_norm": 0.8434829794212577, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27473 + }, + { + "epoch": 0.27474, + "grad_norm": 0.8940630212320206, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 27474 + }, + { + "epoch": 0.27475, + "grad_norm": 0.80898127988051, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 27475 + }, + { + "epoch": 0.27476, + "grad_norm": 0.7479332169950791, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 27476 + }, + { + "epoch": 0.27477, + "grad_norm": 0.8079512854645222, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 27477 + }, + { + "epoch": 0.27478, + "grad_norm": 0.8539775038964349, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 27478 + }, + { + "epoch": 0.27479, + "grad_norm": 0.849720706539171, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27479 + }, + { + "epoch": 0.2748, + "grad_norm": 0.956241947569029, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 27480 + }, + { + "epoch": 0.27481, + "grad_norm": 1.054270653923364, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27481 + }, + { + "epoch": 0.27482, + "grad_norm": 1.0749110221807576, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 27482 + }, + { + "epoch": 0.27483, + "grad_norm": 0.921728764939343, + "learning_rate": 0.003, + "loss": 4.054, + "step": 27483 + }, + { + "epoch": 0.27484, + "grad_norm": 0.920436723382143, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 27484 + }, + { + "epoch": 0.27485, + "grad_norm": 0.7826230725032673, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 27485 + }, + { + "epoch": 0.27486, + "grad_norm": 0.7187615351864852, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 27486 + }, + { + "epoch": 0.27487, + "grad_norm": 0.766182594149174, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 27487 + }, + { + "epoch": 0.27488, + "grad_norm": 0.839585401808998, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 27488 + }, + { + "epoch": 0.27489, + "grad_norm": 0.807975441141832, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27489 + }, + { + "epoch": 0.2749, + "grad_norm": 0.6721582227795454, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 27490 + }, + { + "epoch": 0.27491, + "grad_norm": 0.8145174706207421, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 27491 + }, + { + "epoch": 0.27492, + "grad_norm": 1.1230711784260712, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 27492 + }, + { + "epoch": 0.27493, + "grad_norm": 1.1832815283361864, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 27493 + }, + { + "epoch": 0.27494, + "grad_norm": 0.9458927462513933, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 27494 + }, + { + "epoch": 0.27495, + "grad_norm": 1.0128338005700075, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 27495 + }, + { + "epoch": 0.27496, + "grad_norm": 0.9553114795826317, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27496 + }, + { + "epoch": 0.27497, + "grad_norm": 0.924488979917049, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 27497 + }, + { + "epoch": 0.27498, + "grad_norm": 0.9187470908377797, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 27498 + }, + { + "epoch": 0.27499, + "grad_norm": 0.8016636539873732, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 27499 + }, + { + "epoch": 0.275, + "grad_norm": 0.7871270932330456, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 27500 + }, + { + "epoch": 0.27501, + "grad_norm": 0.7539216212731229, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 27501 + }, + { + "epoch": 0.27502, + "grad_norm": 0.6853184314253955, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 27502 + }, + { + "epoch": 0.27503, + "grad_norm": 0.665335695209181, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 27503 + }, + { + "epoch": 0.27504, + "grad_norm": 0.6355608661549148, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 27504 + }, + { + "epoch": 0.27505, + "grad_norm": 0.6549052251639845, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 27505 + }, + { + "epoch": 0.27506, + "grad_norm": 0.6670533595753011, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 27506 + }, + { + "epoch": 0.27507, + "grad_norm": 0.7534124691921559, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 27507 + }, + { + "epoch": 0.27508, + "grad_norm": 0.9707481010121497, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 27508 + }, + { + "epoch": 0.27509, + "grad_norm": 1.0935030766444684, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 27509 + }, + { + "epoch": 0.2751, + "grad_norm": 0.8132493805216545, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 27510 + }, + { + "epoch": 0.27511, + "grad_norm": 0.766723427401172, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 27511 + }, + { + "epoch": 0.27512, + "grad_norm": 0.7671989928210858, + "learning_rate": 0.003, + "loss": 4.028, + "step": 27512 + }, + { + "epoch": 0.27513, + "grad_norm": 0.7408202184440116, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27513 + }, + { + "epoch": 0.27514, + "grad_norm": 0.5759000077232321, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 27514 + }, + { + "epoch": 0.27515, + "grad_norm": 0.6255602149289347, + "learning_rate": 0.003, + "loss": 4.017, + "step": 27515 + }, + { + "epoch": 0.27516, + "grad_norm": 0.6872200632585111, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 27516 + }, + { + "epoch": 0.27517, + "grad_norm": 0.716969980801242, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 27517 + }, + { + "epoch": 0.27518, + "grad_norm": 0.7024144451198358, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 27518 + }, + { + "epoch": 0.27519, + "grad_norm": 0.7581706269103492, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 27519 + }, + { + "epoch": 0.2752, + "grad_norm": 0.9604720079087541, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 27520 + }, + { + "epoch": 0.27521, + "grad_norm": 1.3114665604836375, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 27521 + }, + { + "epoch": 0.27522, + "grad_norm": 0.8651653085356185, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27522 + }, + { + "epoch": 0.27523, + "grad_norm": 0.7436616118521834, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 27523 + }, + { + "epoch": 0.27524, + "grad_norm": 0.7171113980226964, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 27524 + }, + { + "epoch": 0.27525, + "grad_norm": 0.6959938546664353, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 27525 + }, + { + "epoch": 0.27526, + "grad_norm": 0.6748813958680602, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 27526 + }, + { + "epoch": 0.27527, + "grad_norm": 0.7286452261243423, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 27527 + }, + { + "epoch": 0.27528, + "grad_norm": 0.7607686912134622, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27528 + }, + { + "epoch": 0.27529, + "grad_norm": 0.9120006250314171, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 27529 + }, + { + "epoch": 0.2753, + "grad_norm": 1.1407073439280282, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27530 + }, + { + "epoch": 0.27531, + "grad_norm": 0.9858709349982245, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 27531 + }, + { + "epoch": 0.27532, + "grad_norm": 0.8912552884464429, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 27532 + }, + { + "epoch": 0.27533, + "grad_norm": 0.9936590726987478, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 27533 + }, + { + "epoch": 0.27534, + "grad_norm": 1.0796492381332126, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 27534 + }, + { + "epoch": 0.27535, + "grad_norm": 0.8851203779477849, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 27535 + }, + { + "epoch": 0.27536, + "grad_norm": 0.9126429101858924, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 27536 + }, + { + "epoch": 0.27537, + "grad_norm": 0.9767616157389322, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 27537 + }, + { + "epoch": 0.27538, + "grad_norm": 1.071441070407833, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 27538 + }, + { + "epoch": 0.27539, + "grad_norm": 0.9992441676047388, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 27539 + }, + { + "epoch": 0.2754, + "grad_norm": 1.0135916923891748, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 27540 + }, + { + "epoch": 0.27541, + "grad_norm": 0.8785323301499186, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 27541 + }, + { + "epoch": 0.27542, + "grad_norm": 1.1002852366792761, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 27542 + }, + { + "epoch": 0.27543, + "grad_norm": 1.1528869493065075, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 27543 + }, + { + "epoch": 0.27544, + "grad_norm": 0.938983436433051, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27544 + }, + { + "epoch": 0.27545, + "grad_norm": 0.9050150829001987, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 27545 + }, + { + "epoch": 0.27546, + "grad_norm": 1.0582811131301542, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 27546 + }, + { + "epoch": 0.27547, + "grad_norm": 1.003134902567243, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 27547 + }, + { + "epoch": 0.27548, + "grad_norm": 0.9018634262899375, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 27548 + }, + { + "epoch": 0.27549, + "grad_norm": 0.891913998853702, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 27549 + }, + { + "epoch": 0.2755, + "grad_norm": 0.8501577059348694, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 27550 + }, + { + "epoch": 0.27551, + "grad_norm": 0.744843275190083, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27551 + }, + { + "epoch": 0.27552, + "grad_norm": 0.6584018775299888, + "learning_rate": 0.003, + "loss": 4.045, + "step": 27552 + }, + { + "epoch": 0.27553, + "grad_norm": 0.6382722930211042, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 27553 + }, + { + "epoch": 0.27554, + "grad_norm": 0.7032472993967472, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 27554 + }, + { + "epoch": 0.27555, + "grad_norm": 0.7862902123715984, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 27555 + }, + { + "epoch": 0.27556, + "grad_norm": 0.8914170284499748, + "learning_rate": 0.003, + "loss": 4.039, + "step": 27556 + }, + { + "epoch": 0.27557, + "grad_norm": 0.9859594638444603, + "learning_rate": 0.003, + "loss": 4.054, + "step": 27557 + }, + { + "epoch": 0.27558, + "grad_norm": 1.0768995820384843, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 27558 + }, + { + "epoch": 0.27559, + "grad_norm": 0.8836465160824759, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 27559 + }, + { + "epoch": 0.2756, + "grad_norm": 0.7832603604015481, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 27560 + }, + { + "epoch": 0.27561, + "grad_norm": 0.784387326972705, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 27561 + }, + { + "epoch": 0.27562, + "grad_norm": 0.7959746671086947, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 27562 + }, + { + "epoch": 0.27563, + "grad_norm": 0.8595512327817114, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 27563 + }, + { + "epoch": 0.27564, + "grad_norm": 0.9009483677327366, + "learning_rate": 0.003, + "loss": 4.056, + "step": 27564 + }, + { + "epoch": 0.27565, + "grad_norm": 1.138004258610621, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 27565 + }, + { + "epoch": 0.27566, + "grad_norm": 0.8948198221721799, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 27566 + }, + { + "epoch": 0.27567, + "grad_norm": 0.8385289070406653, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 27567 + }, + { + "epoch": 0.27568, + "grad_norm": 0.8779316993920455, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 27568 + }, + { + "epoch": 0.27569, + "grad_norm": 0.841311682968659, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 27569 + }, + { + "epoch": 0.2757, + "grad_norm": 0.8075555550850717, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 27570 + }, + { + "epoch": 0.27571, + "grad_norm": 0.8362522612064287, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27571 + }, + { + "epoch": 0.27572, + "grad_norm": 0.8646047885843108, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 27572 + }, + { + "epoch": 0.27573, + "grad_norm": 0.9133019398553516, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 27573 + }, + { + "epoch": 0.27574, + "grad_norm": 1.0653767853850598, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 27574 + }, + { + "epoch": 0.27575, + "grad_norm": 0.9465006013101137, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 27575 + }, + { + "epoch": 0.27576, + "grad_norm": 0.7903861011812449, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 27576 + }, + { + "epoch": 0.27577, + "grad_norm": 0.7660488816458991, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 27577 + }, + { + "epoch": 0.27578, + "grad_norm": 0.8318119425186651, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27578 + }, + { + "epoch": 0.27579, + "grad_norm": 0.8585689264857653, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 27579 + }, + { + "epoch": 0.2758, + "grad_norm": 0.961049629634931, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 27580 + }, + { + "epoch": 0.27581, + "grad_norm": 0.9949296033743659, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 27581 + }, + { + "epoch": 0.27582, + "grad_norm": 1.0856821375700363, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 27582 + }, + { + "epoch": 0.27583, + "grad_norm": 0.9466174818289098, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27583 + }, + { + "epoch": 0.27584, + "grad_norm": 0.7693535128275024, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 27584 + }, + { + "epoch": 0.27585, + "grad_norm": 0.7882508565649331, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27585 + }, + { + "epoch": 0.27586, + "grad_norm": 0.7851600824782171, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 27586 + }, + { + "epoch": 0.27587, + "grad_norm": 0.8369271888416528, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27587 + }, + { + "epoch": 0.27588, + "grad_norm": 0.8761703635546019, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27588 + }, + { + "epoch": 0.27589, + "grad_norm": 0.8874521755764253, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 27589 + }, + { + "epoch": 0.2759, + "grad_norm": 0.7802661002270506, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 27590 + }, + { + "epoch": 0.27591, + "grad_norm": 0.7504946951370824, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 27591 + }, + { + "epoch": 0.27592, + "grad_norm": 0.8182259858024599, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 27592 + }, + { + "epoch": 0.27593, + "grad_norm": 0.9383115398851327, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 27593 + }, + { + "epoch": 0.27594, + "grad_norm": 1.4007055002007227, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 27594 + }, + { + "epoch": 0.27595, + "grad_norm": 0.8905906054309448, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 27595 + }, + { + "epoch": 0.27596, + "grad_norm": 0.7471650573005397, + "learning_rate": 0.003, + "loss": 4.051, + "step": 27596 + }, + { + "epoch": 0.27597, + "grad_norm": 0.7293564069773956, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 27597 + }, + { + "epoch": 0.27598, + "grad_norm": 0.7604897598040367, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 27598 + }, + { + "epoch": 0.27599, + "grad_norm": 0.6899727928775439, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 27599 + }, + { + "epoch": 0.276, + "grad_norm": 0.7103487343974175, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 27600 + }, + { + "epoch": 0.27601, + "grad_norm": 0.7083491387647609, + "learning_rate": 0.003, + "loss": 4.042, + "step": 27601 + }, + { + "epoch": 0.27602, + "grad_norm": 0.7646286171983344, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 27602 + }, + { + "epoch": 0.27603, + "grad_norm": 0.8028549798392458, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 27603 + }, + { + "epoch": 0.27604, + "grad_norm": 0.636982763190343, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 27604 + }, + { + "epoch": 0.27605, + "grad_norm": 0.6238661683071908, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 27605 + }, + { + "epoch": 0.27606, + "grad_norm": 0.5868082054774105, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 27606 + }, + { + "epoch": 0.27607, + "grad_norm": 0.5850582054056035, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 27607 + }, + { + "epoch": 0.27608, + "grad_norm": 0.6737459546000603, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 27608 + }, + { + "epoch": 0.27609, + "grad_norm": 0.8836877779695854, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 27609 + }, + { + "epoch": 0.2761, + "grad_norm": 1.255141632217191, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 27610 + }, + { + "epoch": 0.27611, + "grad_norm": 1.0925984441037215, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 27611 + }, + { + "epoch": 0.27612, + "grad_norm": 0.960404320336506, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 27612 + }, + { + "epoch": 0.27613, + "grad_norm": 0.9297919515876425, + "learning_rate": 0.003, + "loss": 4.059, + "step": 27613 + }, + { + "epoch": 0.27614, + "grad_norm": 0.9546812145858595, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 27614 + }, + { + "epoch": 0.27615, + "grad_norm": 0.9363572063942333, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 27615 + }, + { + "epoch": 0.27616, + "grad_norm": 0.97198755138276, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 27616 + }, + { + "epoch": 0.27617, + "grad_norm": 0.9544967067312224, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 27617 + }, + { + "epoch": 0.27618, + "grad_norm": 1.0424951225809767, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 27618 + }, + { + "epoch": 0.27619, + "grad_norm": 1.0711359292087823, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 27619 + }, + { + "epoch": 0.2762, + "grad_norm": 0.9463015061121943, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 27620 + }, + { + "epoch": 0.27621, + "grad_norm": 0.8765271848773937, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27621 + }, + { + "epoch": 0.27622, + "grad_norm": 0.8976222426569188, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 27622 + }, + { + "epoch": 0.27623, + "grad_norm": 0.8528743900902608, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 27623 + }, + { + "epoch": 0.27624, + "grad_norm": 0.8052848319268165, + "learning_rate": 0.003, + "loss": 4.067, + "step": 27624 + }, + { + "epoch": 0.27625, + "grad_norm": 0.7841743375161037, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 27625 + }, + { + "epoch": 0.27626, + "grad_norm": 0.7190315588519719, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 27626 + }, + { + "epoch": 0.27627, + "grad_norm": 0.7902570669206066, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 27627 + }, + { + "epoch": 0.27628, + "grad_norm": 0.9025044881775321, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 27628 + }, + { + "epoch": 0.27629, + "grad_norm": 1.1028916753090239, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 27629 + }, + { + "epoch": 0.2763, + "grad_norm": 1.0281559255193735, + "learning_rate": 0.003, + "loss": 4.0804, + "step": 27630 + }, + { + "epoch": 0.27631, + "grad_norm": 0.9405605180831365, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 27631 + }, + { + "epoch": 0.27632, + "grad_norm": 0.8377842085247145, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 27632 + }, + { + "epoch": 0.27633, + "grad_norm": 0.7614447136375849, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 27633 + }, + { + "epoch": 0.27634, + "grad_norm": 0.8099139958890382, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 27634 + }, + { + "epoch": 0.27635, + "grad_norm": 0.975420920740107, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 27635 + }, + { + "epoch": 0.27636, + "grad_norm": 1.01447184244507, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 27636 + }, + { + "epoch": 0.27637, + "grad_norm": 0.751799628913652, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 27637 + }, + { + "epoch": 0.27638, + "grad_norm": 0.8312255858972802, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 27638 + }, + { + "epoch": 0.27639, + "grad_norm": 0.841837812564942, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 27639 + }, + { + "epoch": 0.2764, + "grad_norm": 0.7894997121324281, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 27640 + }, + { + "epoch": 0.27641, + "grad_norm": 0.9391192122200699, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 27641 + }, + { + "epoch": 0.27642, + "grad_norm": 1.1843979657090105, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 27642 + }, + { + "epoch": 0.27643, + "grad_norm": 1.0197919052750457, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 27643 + }, + { + "epoch": 0.27644, + "grad_norm": 0.962068415634205, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 27644 + }, + { + "epoch": 0.27645, + "grad_norm": 0.8772132090128075, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 27645 + }, + { + "epoch": 0.27646, + "grad_norm": 0.7155344949614276, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 27646 + }, + { + "epoch": 0.27647, + "grad_norm": 0.7175927346953167, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 27647 + }, + { + "epoch": 0.27648, + "grad_norm": 0.7642066023439841, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 27648 + }, + { + "epoch": 0.27649, + "grad_norm": 0.7936469094437929, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 27649 + }, + { + "epoch": 0.2765, + "grad_norm": 0.9550261762873526, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 27650 + }, + { + "epoch": 0.27651, + "grad_norm": 1.1087014569894011, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 27651 + }, + { + "epoch": 0.27652, + "grad_norm": 0.9433103406770217, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 27652 + }, + { + "epoch": 0.27653, + "grad_norm": 0.9808522056359532, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 27653 + }, + { + "epoch": 0.27654, + "grad_norm": 1.0626615940755095, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 27654 + }, + { + "epoch": 0.27655, + "grad_norm": 0.8988477472329935, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 27655 + }, + { + "epoch": 0.27656, + "grad_norm": 0.7022635848675114, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 27656 + }, + { + "epoch": 0.27657, + "grad_norm": 0.6902331602076974, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 27657 + }, + { + "epoch": 0.27658, + "grad_norm": 0.6092234783281616, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 27658 + }, + { + "epoch": 0.27659, + "grad_norm": 0.6245567561604682, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 27659 + }, + { + "epoch": 0.2766, + "grad_norm": 0.6218687775757363, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 27660 + }, + { + "epoch": 0.27661, + "grad_norm": 0.6316146707095361, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 27661 + }, + { + "epoch": 0.27662, + "grad_norm": 0.6280287932510784, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27662 + }, + { + "epoch": 0.27663, + "grad_norm": 0.6338967256994142, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 27663 + }, + { + "epoch": 0.27664, + "grad_norm": 0.6785475798502305, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 27664 + }, + { + "epoch": 0.27665, + "grad_norm": 0.6058344629552395, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 27665 + }, + { + "epoch": 0.27666, + "grad_norm": 0.55837563696932, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 27666 + }, + { + "epoch": 0.27667, + "grad_norm": 0.7190392896959156, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 27667 + }, + { + "epoch": 0.27668, + "grad_norm": 0.8378771194293715, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 27668 + }, + { + "epoch": 0.27669, + "grad_norm": 0.960676616547277, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 27669 + }, + { + "epoch": 0.2767, + "grad_norm": 1.3467318148807308, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 27670 + }, + { + "epoch": 0.27671, + "grad_norm": 0.889003587592173, + "learning_rate": 0.003, + "loss": 4.017, + "step": 27671 + }, + { + "epoch": 0.27672, + "grad_norm": 0.8499111588801253, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 27672 + }, + { + "epoch": 0.27673, + "grad_norm": 0.8091909401426355, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 27673 + }, + { + "epoch": 0.27674, + "grad_norm": 0.850859091662269, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 27674 + }, + { + "epoch": 0.27675, + "grad_norm": 0.9169133488013691, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 27675 + }, + { + "epoch": 0.27676, + "grad_norm": 0.8726993322853502, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 27676 + }, + { + "epoch": 0.27677, + "grad_norm": 0.955287718402797, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 27677 + }, + { + "epoch": 0.27678, + "grad_norm": 1.141931390174076, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 27678 + }, + { + "epoch": 0.27679, + "grad_norm": 1.0654509741480143, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27679 + }, + { + "epoch": 0.2768, + "grad_norm": 0.9685232779731515, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 27680 + }, + { + "epoch": 0.27681, + "grad_norm": 1.031978202924167, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 27681 + }, + { + "epoch": 0.27682, + "grad_norm": 1.1047614194958109, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27682 + }, + { + "epoch": 0.27683, + "grad_norm": 1.006652796326168, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 27683 + }, + { + "epoch": 0.27684, + "grad_norm": 1.0480433535910694, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 27684 + }, + { + "epoch": 0.27685, + "grad_norm": 0.8575494085740141, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 27685 + }, + { + "epoch": 0.27686, + "grad_norm": 0.856129885307255, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 27686 + }, + { + "epoch": 0.27687, + "grad_norm": 0.8717641381324917, + "learning_rate": 0.003, + "loss": 4.032, + "step": 27687 + }, + { + "epoch": 0.27688, + "grad_norm": 0.8462140560921939, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 27688 + }, + { + "epoch": 0.27689, + "grad_norm": 0.8747042504794245, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 27689 + }, + { + "epoch": 0.2769, + "grad_norm": 0.9223154865300004, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27690 + }, + { + "epoch": 0.27691, + "grad_norm": 0.9151001232505841, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 27691 + }, + { + "epoch": 0.27692, + "grad_norm": 0.8450108608904273, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 27692 + }, + { + "epoch": 0.27693, + "grad_norm": 0.8043145482427533, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 27693 + }, + { + "epoch": 0.27694, + "grad_norm": 0.8709143989284064, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 27694 + }, + { + "epoch": 0.27695, + "grad_norm": 0.9445815193785894, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 27695 + }, + { + "epoch": 0.27696, + "grad_norm": 1.1372197726118844, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 27696 + }, + { + "epoch": 0.27697, + "grad_norm": 1.038594463343923, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 27697 + }, + { + "epoch": 0.27698, + "grad_norm": 1.0990037271560327, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 27698 + }, + { + "epoch": 0.27699, + "grad_norm": 0.8703277416922371, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 27699 + }, + { + "epoch": 0.277, + "grad_norm": 0.7606106360053135, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27700 + }, + { + "epoch": 0.27701, + "grad_norm": 0.8020572134204552, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 27701 + }, + { + "epoch": 0.27702, + "grad_norm": 0.7665352691454131, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 27702 + }, + { + "epoch": 0.27703, + "grad_norm": 0.7618684419550741, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 27703 + }, + { + "epoch": 0.27704, + "grad_norm": 0.7398081269681952, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 27704 + }, + { + "epoch": 0.27705, + "grad_norm": 0.7300913340101083, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 27705 + }, + { + "epoch": 0.27706, + "grad_norm": 0.8260155155307604, + "learning_rate": 0.003, + "loss": 4.054, + "step": 27706 + }, + { + "epoch": 0.27707, + "grad_norm": 1.0311231762592432, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 27707 + }, + { + "epoch": 0.27708, + "grad_norm": 1.221865800016454, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 27708 + }, + { + "epoch": 0.27709, + "grad_norm": 0.9180289595037172, + "learning_rate": 0.003, + "loss": 4.084, + "step": 27709 + }, + { + "epoch": 0.2771, + "grad_norm": 0.904617250474751, + "learning_rate": 0.003, + "loss": 4.04, + "step": 27710 + }, + { + "epoch": 0.27711, + "grad_norm": 0.8970116532026977, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 27711 + }, + { + "epoch": 0.27712, + "grad_norm": 1.019143393654043, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 27712 + }, + { + "epoch": 0.27713, + "grad_norm": 0.9770369873870997, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 27713 + }, + { + "epoch": 0.27714, + "grad_norm": 0.8917327758615112, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 27714 + }, + { + "epoch": 0.27715, + "grad_norm": 0.8457320913462244, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 27715 + }, + { + "epoch": 0.27716, + "grad_norm": 0.6013831020036178, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 27716 + }, + { + "epoch": 0.27717, + "grad_norm": 0.6744075948075614, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 27717 + }, + { + "epoch": 0.27718, + "grad_norm": 0.6278997933102557, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 27718 + }, + { + "epoch": 0.27719, + "grad_norm": 0.6816057736954539, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 27719 + }, + { + "epoch": 0.2772, + "grad_norm": 0.966792715822161, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 27720 + }, + { + "epoch": 0.27721, + "grad_norm": 1.1875868310405073, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 27721 + }, + { + "epoch": 0.27722, + "grad_norm": 0.762185685795832, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 27722 + }, + { + "epoch": 0.27723, + "grad_norm": 0.7251803898684855, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 27723 + }, + { + "epoch": 0.27724, + "grad_norm": 0.6723420135516522, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27724 + }, + { + "epoch": 0.27725, + "grad_norm": 0.7282331536703474, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 27725 + }, + { + "epoch": 0.27726, + "grad_norm": 0.7686466279366605, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 27726 + }, + { + "epoch": 0.27727, + "grad_norm": 0.8175316216388241, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 27727 + }, + { + "epoch": 0.27728, + "grad_norm": 0.7615734294839817, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 27728 + }, + { + "epoch": 0.27729, + "grad_norm": 0.8723970541774475, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 27729 + }, + { + "epoch": 0.2773, + "grad_norm": 1.065607010320744, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 27730 + }, + { + "epoch": 0.27731, + "grad_norm": 1.105672580110797, + "learning_rate": 0.003, + "loss": 4.015, + "step": 27731 + }, + { + "epoch": 0.27732, + "grad_norm": 0.9363887759717392, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 27732 + }, + { + "epoch": 0.27733, + "grad_norm": 0.9546807277678666, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 27733 + }, + { + "epoch": 0.27734, + "grad_norm": 0.8131793136043184, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 27734 + }, + { + "epoch": 0.27735, + "grad_norm": 0.7093502584795353, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27735 + }, + { + "epoch": 0.27736, + "grad_norm": 0.7857209296021429, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27736 + }, + { + "epoch": 0.27737, + "grad_norm": 0.9755398190558405, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 27737 + }, + { + "epoch": 0.27738, + "grad_norm": 1.2710906573475445, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 27738 + }, + { + "epoch": 0.27739, + "grad_norm": 0.9398051898876182, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27739 + }, + { + "epoch": 0.2774, + "grad_norm": 0.7677253397404238, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 27740 + }, + { + "epoch": 0.27741, + "grad_norm": 0.6551304795988674, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 27741 + }, + { + "epoch": 0.27742, + "grad_norm": 0.6629172162090192, + "learning_rate": 0.003, + "loss": 4.051, + "step": 27742 + }, + { + "epoch": 0.27743, + "grad_norm": 0.6233407396264823, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 27743 + }, + { + "epoch": 0.27744, + "grad_norm": 0.6415336499626928, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 27744 + }, + { + "epoch": 0.27745, + "grad_norm": 0.681549409725612, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 27745 + }, + { + "epoch": 0.27746, + "grad_norm": 0.7175425422338124, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 27746 + }, + { + "epoch": 0.27747, + "grad_norm": 0.6844464334830769, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 27747 + }, + { + "epoch": 0.27748, + "grad_norm": 0.7088178417616398, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 27748 + }, + { + "epoch": 0.27749, + "grad_norm": 0.749927216909801, + "learning_rate": 0.003, + "loss": 4.041, + "step": 27749 + }, + { + "epoch": 0.2775, + "grad_norm": 0.7320435509355352, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 27750 + }, + { + "epoch": 0.27751, + "grad_norm": 0.8429306860414595, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 27751 + }, + { + "epoch": 0.27752, + "grad_norm": 0.9394660326084713, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 27752 + }, + { + "epoch": 0.27753, + "grad_norm": 1.1115856979340801, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 27753 + }, + { + "epoch": 0.27754, + "grad_norm": 1.0390401652119963, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 27754 + }, + { + "epoch": 0.27755, + "grad_norm": 1.062476839063411, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 27755 + }, + { + "epoch": 0.27756, + "grad_norm": 0.912000103904525, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 27756 + }, + { + "epoch": 0.27757, + "grad_norm": 0.7499241971137838, + "learning_rate": 0.003, + "loss": 4.074, + "step": 27757 + }, + { + "epoch": 0.27758, + "grad_norm": 0.7108256655937034, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 27758 + }, + { + "epoch": 0.27759, + "grad_norm": 0.6601315273993372, + "learning_rate": 0.003, + "loss": 4.036, + "step": 27759 + }, + { + "epoch": 0.2776, + "grad_norm": 0.67463050193368, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 27760 + }, + { + "epoch": 0.27761, + "grad_norm": 0.7338474795899355, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 27761 + }, + { + "epoch": 0.27762, + "grad_norm": 0.8412634728730072, + "learning_rate": 0.003, + "loss": 4.059, + "step": 27762 + }, + { + "epoch": 0.27763, + "grad_norm": 0.9665825384238736, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 27763 + }, + { + "epoch": 0.27764, + "grad_norm": 0.9206432038971117, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 27764 + }, + { + "epoch": 0.27765, + "grad_norm": 0.9605177734202055, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 27765 + }, + { + "epoch": 0.27766, + "grad_norm": 0.9252931676289303, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 27766 + }, + { + "epoch": 0.27767, + "grad_norm": 0.8625364771646512, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 27767 + }, + { + "epoch": 0.27768, + "grad_norm": 0.7473038462076544, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27768 + }, + { + "epoch": 0.27769, + "grad_norm": 0.7457591008070078, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 27769 + }, + { + "epoch": 0.2777, + "grad_norm": 0.7951350826269067, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 27770 + }, + { + "epoch": 0.27771, + "grad_norm": 0.7543435647657978, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 27771 + }, + { + "epoch": 0.27772, + "grad_norm": 0.7990434103741723, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 27772 + }, + { + "epoch": 0.27773, + "grad_norm": 0.8693489091303673, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 27773 + }, + { + "epoch": 0.27774, + "grad_norm": 1.0409523336540192, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 27774 + }, + { + "epoch": 0.27775, + "grad_norm": 1.4820830207602074, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 27775 + }, + { + "epoch": 0.27776, + "grad_norm": 0.6784676114034849, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 27776 + }, + { + "epoch": 0.27777, + "grad_norm": 0.763688648038413, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 27777 + }, + { + "epoch": 0.27778, + "grad_norm": 0.8996288061426083, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 27778 + }, + { + "epoch": 0.27779, + "grad_norm": 0.9989176676255409, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 27779 + }, + { + "epoch": 0.2778, + "grad_norm": 1.2467734365522196, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 27780 + }, + { + "epoch": 0.27781, + "grad_norm": 0.7630680685782786, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 27781 + }, + { + "epoch": 0.27782, + "grad_norm": 0.8057477199246448, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 27782 + }, + { + "epoch": 0.27783, + "grad_norm": 0.9190716881470262, + "learning_rate": 0.003, + "loss": 4.06, + "step": 27783 + }, + { + "epoch": 0.27784, + "grad_norm": 0.8926056038234027, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 27784 + }, + { + "epoch": 0.27785, + "grad_norm": 0.756077275330324, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 27785 + }, + { + "epoch": 0.27786, + "grad_norm": 0.7531080998622893, + "learning_rate": 0.003, + "loss": 4.048, + "step": 27786 + }, + { + "epoch": 0.27787, + "grad_norm": 0.7822311049615184, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 27787 + }, + { + "epoch": 0.27788, + "grad_norm": 0.7807548625921784, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 27788 + }, + { + "epoch": 0.27789, + "grad_norm": 0.7470934696042298, + "learning_rate": 0.003, + "loss": 4.037, + "step": 27789 + }, + { + "epoch": 0.2779, + "grad_norm": 0.7326629568457709, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27790 + }, + { + "epoch": 0.27791, + "grad_norm": 0.7586532685268151, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 27791 + }, + { + "epoch": 0.27792, + "grad_norm": 0.8208532673893526, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 27792 + }, + { + "epoch": 0.27793, + "grad_norm": 1.0064917332497292, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27793 + }, + { + "epoch": 0.27794, + "grad_norm": 0.9505555319079347, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 27794 + }, + { + "epoch": 0.27795, + "grad_norm": 1.0600187195667736, + "learning_rate": 0.003, + "loss": 3.9918, + "step": 27795 + }, + { + "epoch": 0.27796, + "grad_norm": 1.306834770203538, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 27796 + }, + { + "epoch": 0.27797, + "grad_norm": 0.881321391732377, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 27797 + }, + { + "epoch": 0.27798, + "grad_norm": 0.8376607861776786, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 27798 + }, + { + "epoch": 0.27799, + "grad_norm": 1.0291808042421235, + "learning_rate": 0.003, + "loss": 4.0816, + "step": 27799 + }, + { + "epoch": 0.278, + "grad_norm": 1.0566389003564205, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 27800 + }, + { + "epoch": 0.27801, + "grad_norm": 0.8224660212800197, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 27801 + }, + { + "epoch": 0.27802, + "grad_norm": 0.7147539245397581, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 27802 + }, + { + "epoch": 0.27803, + "grad_norm": 0.7287328907944992, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 27803 + }, + { + "epoch": 0.27804, + "grad_norm": 0.6625093202171249, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 27804 + }, + { + "epoch": 0.27805, + "grad_norm": 0.6748706920053364, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27805 + }, + { + "epoch": 0.27806, + "grad_norm": 0.7491405117399559, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 27806 + }, + { + "epoch": 0.27807, + "grad_norm": 0.8796120939308266, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 27807 + }, + { + "epoch": 0.27808, + "grad_norm": 0.8704202908653403, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 27808 + }, + { + "epoch": 0.27809, + "grad_norm": 0.9021126517072299, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 27809 + }, + { + "epoch": 0.2781, + "grad_norm": 1.1379939501998537, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 27810 + }, + { + "epoch": 0.27811, + "grad_norm": 1.2033315361788426, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 27811 + }, + { + "epoch": 0.27812, + "grad_norm": 0.9523129418648865, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 27812 + }, + { + "epoch": 0.27813, + "grad_norm": 0.8689210195531076, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 27813 + }, + { + "epoch": 0.27814, + "grad_norm": 0.8790348184112382, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 27814 + }, + { + "epoch": 0.27815, + "grad_norm": 0.8655162310969657, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 27815 + }, + { + "epoch": 0.27816, + "grad_norm": 1.0942776128486744, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 27816 + }, + { + "epoch": 0.27817, + "grad_norm": 1.2697171814000883, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 27817 + }, + { + "epoch": 0.27818, + "grad_norm": 0.7206369328185163, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 27818 + }, + { + "epoch": 0.27819, + "grad_norm": 0.732753271736042, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 27819 + }, + { + "epoch": 0.2782, + "grad_norm": 0.7121884278535716, + "learning_rate": 0.003, + "loss": 4.065, + "step": 27820 + }, + { + "epoch": 0.27821, + "grad_norm": 0.7092440202057201, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 27821 + }, + { + "epoch": 0.27822, + "grad_norm": 0.7172876608902816, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 27822 + }, + { + "epoch": 0.27823, + "grad_norm": 0.7005799500451574, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 27823 + }, + { + "epoch": 0.27824, + "grad_norm": 0.6879468521001236, + "learning_rate": 0.003, + "loss": 4.031, + "step": 27824 + }, + { + "epoch": 0.27825, + "grad_norm": 0.6661186872515242, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 27825 + }, + { + "epoch": 0.27826, + "grad_norm": 0.718285946271353, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 27826 + }, + { + "epoch": 0.27827, + "grad_norm": 0.7633395697862321, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 27827 + }, + { + "epoch": 0.27828, + "grad_norm": 0.906512637957444, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27828 + }, + { + "epoch": 0.27829, + "grad_norm": 1.17000966656375, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 27829 + }, + { + "epoch": 0.2783, + "grad_norm": 1.0591022360665268, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 27830 + }, + { + "epoch": 0.27831, + "grad_norm": 0.9702132818955198, + "learning_rate": 0.003, + "loss": 4.057, + "step": 27831 + }, + { + "epoch": 0.27832, + "grad_norm": 1.005422100554178, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 27832 + }, + { + "epoch": 0.27833, + "grad_norm": 1.0064103001893616, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 27833 + }, + { + "epoch": 0.27834, + "grad_norm": 0.897844686510108, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 27834 + }, + { + "epoch": 0.27835, + "grad_norm": 0.8772082920356794, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 27835 + }, + { + "epoch": 0.27836, + "grad_norm": 0.8640460615442753, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 27836 + }, + { + "epoch": 0.27837, + "grad_norm": 1.021782082138233, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 27837 + }, + { + "epoch": 0.27838, + "grad_norm": 1.2280121572791725, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 27838 + }, + { + "epoch": 0.27839, + "grad_norm": 0.6790979146120029, + "learning_rate": 0.003, + "loss": 4.053, + "step": 27839 + }, + { + "epoch": 0.2784, + "grad_norm": 0.6779283847040101, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 27840 + }, + { + "epoch": 0.27841, + "grad_norm": 0.6011727576649475, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 27841 + }, + { + "epoch": 0.27842, + "grad_norm": 0.6530577998491776, + "learning_rate": 0.003, + "loss": 4.047, + "step": 27842 + }, + { + "epoch": 0.27843, + "grad_norm": 0.6988211762068519, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 27843 + }, + { + "epoch": 0.27844, + "grad_norm": 0.758140669348051, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 27844 + }, + { + "epoch": 0.27845, + "grad_norm": 0.8706249382858688, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 27845 + }, + { + "epoch": 0.27846, + "grad_norm": 1.0379495941421972, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 27846 + }, + { + "epoch": 0.27847, + "grad_norm": 1.0355749298790293, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 27847 + }, + { + "epoch": 0.27848, + "grad_norm": 0.9324102792948058, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 27848 + }, + { + "epoch": 0.27849, + "grad_norm": 1.0517577001975258, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 27849 + }, + { + "epoch": 0.2785, + "grad_norm": 1.054501901055831, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 27850 + }, + { + "epoch": 0.27851, + "grad_norm": 1.0645454539983148, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 27851 + }, + { + "epoch": 0.27852, + "grad_norm": 0.9128393142462878, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 27852 + }, + { + "epoch": 0.27853, + "grad_norm": 0.8294643471519756, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 27853 + }, + { + "epoch": 0.27854, + "grad_norm": 0.8944895294955064, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 27854 + }, + { + "epoch": 0.27855, + "grad_norm": 0.8404797169195778, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 27855 + }, + { + "epoch": 0.27856, + "grad_norm": 0.8344276714343934, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 27856 + }, + { + "epoch": 0.27857, + "grad_norm": 0.7855922640785303, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 27857 + }, + { + "epoch": 0.27858, + "grad_norm": 0.7527340349076008, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 27858 + }, + { + "epoch": 0.27859, + "grad_norm": 0.7578495181672694, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 27859 + }, + { + "epoch": 0.2786, + "grad_norm": 0.855587951705903, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 27860 + }, + { + "epoch": 0.27861, + "grad_norm": 0.9730719473873491, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 27861 + }, + { + "epoch": 0.27862, + "grad_norm": 1.0024568645874534, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 27862 + }, + { + "epoch": 0.27863, + "grad_norm": 0.9178998012763384, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 27863 + }, + { + "epoch": 0.27864, + "grad_norm": 0.9591931274023741, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 27864 + }, + { + "epoch": 0.27865, + "grad_norm": 0.9568539522302781, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 27865 + }, + { + "epoch": 0.27866, + "grad_norm": 1.104995960455937, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 27866 + }, + { + "epoch": 0.27867, + "grad_norm": 1.0054840841322914, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 27867 + }, + { + "epoch": 0.27868, + "grad_norm": 0.9740717495697179, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 27868 + }, + { + "epoch": 0.27869, + "grad_norm": 0.9480523442023439, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 27869 + }, + { + "epoch": 0.2787, + "grad_norm": 0.8669983741893688, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 27870 + }, + { + "epoch": 0.27871, + "grad_norm": 0.808746021250193, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 27871 + }, + { + "epoch": 0.27872, + "grad_norm": 0.7799212114908738, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27872 + }, + { + "epoch": 0.27873, + "grad_norm": 0.7857771251657594, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 27873 + }, + { + "epoch": 0.27874, + "grad_norm": 0.8317399428501995, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 27874 + }, + { + "epoch": 0.27875, + "grad_norm": 0.7816429938558018, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 27875 + }, + { + "epoch": 0.27876, + "grad_norm": 0.8848283887244174, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 27876 + }, + { + "epoch": 0.27877, + "grad_norm": 0.8974894467307845, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 27877 + }, + { + "epoch": 0.27878, + "grad_norm": 0.8344496459443077, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 27878 + }, + { + "epoch": 0.27879, + "grad_norm": 0.759009629843913, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 27879 + }, + { + "epoch": 0.2788, + "grad_norm": 0.7771478714984165, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 27880 + }, + { + "epoch": 0.27881, + "grad_norm": 0.8444094214788893, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 27881 + }, + { + "epoch": 0.27882, + "grad_norm": 1.0350118746058892, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 27882 + }, + { + "epoch": 0.27883, + "grad_norm": 1.167097807004749, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 27883 + }, + { + "epoch": 0.27884, + "grad_norm": 0.8220895981337908, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 27884 + }, + { + "epoch": 0.27885, + "grad_norm": 0.7292186866562702, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 27885 + }, + { + "epoch": 0.27886, + "grad_norm": 0.7072295467894064, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 27886 + }, + { + "epoch": 0.27887, + "grad_norm": 0.703643453938188, + "learning_rate": 0.003, + "loss": 4.048, + "step": 27887 + }, + { + "epoch": 0.27888, + "grad_norm": 0.7494843456747895, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 27888 + }, + { + "epoch": 0.27889, + "grad_norm": 0.714232663075034, + "learning_rate": 0.003, + "loss": 4.025, + "step": 27889 + }, + { + "epoch": 0.2789, + "grad_norm": 0.6810545861968759, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 27890 + }, + { + "epoch": 0.27891, + "grad_norm": 0.8046949058406158, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 27891 + }, + { + "epoch": 0.27892, + "grad_norm": 1.0537351057565565, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 27892 + }, + { + "epoch": 0.27893, + "grad_norm": 1.1697799065270287, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 27893 + }, + { + "epoch": 0.27894, + "grad_norm": 0.8597762833522873, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 27894 + }, + { + "epoch": 0.27895, + "grad_norm": 0.7836211790781372, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 27895 + }, + { + "epoch": 0.27896, + "grad_norm": 0.7386413972957119, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 27896 + }, + { + "epoch": 0.27897, + "grad_norm": 0.7559091363977415, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 27897 + }, + { + "epoch": 0.27898, + "grad_norm": 0.8267679791884671, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 27898 + }, + { + "epoch": 0.27899, + "grad_norm": 0.8000158462339704, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 27899 + }, + { + "epoch": 0.279, + "grad_norm": 0.7497717421124886, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 27900 + }, + { + "epoch": 0.27901, + "grad_norm": 0.7147371895301244, + "learning_rate": 0.003, + "loss": 3.9764, + "step": 27901 + }, + { + "epoch": 0.27902, + "grad_norm": 0.7504260965129792, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 27902 + }, + { + "epoch": 0.27903, + "grad_norm": 0.9012233211384367, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 27903 + }, + { + "epoch": 0.27904, + "grad_norm": 1.1271750964681806, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 27904 + }, + { + "epoch": 0.27905, + "grad_norm": 0.8573688268605598, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 27905 + }, + { + "epoch": 0.27906, + "grad_norm": 0.7403019672021436, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 27906 + }, + { + "epoch": 0.27907, + "grad_norm": 0.7838417738312761, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 27907 + }, + { + "epoch": 0.27908, + "grad_norm": 0.8114614317390165, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 27908 + }, + { + "epoch": 0.27909, + "grad_norm": 0.8669116122053429, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 27909 + }, + { + "epoch": 0.2791, + "grad_norm": 0.8283561494730468, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 27910 + }, + { + "epoch": 0.27911, + "grad_norm": 0.8966530565073961, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 27911 + }, + { + "epoch": 0.27912, + "grad_norm": 0.9516518438369279, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 27912 + }, + { + "epoch": 0.27913, + "grad_norm": 0.8684519950690195, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 27913 + }, + { + "epoch": 0.27914, + "grad_norm": 0.986947597296281, + "learning_rate": 0.003, + "loss": 4.065, + "step": 27914 + }, + { + "epoch": 0.27915, + "grad_norm": 1.1799559237577764, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 27915 + }, + { + "epoch": 0.27916, + "grad_norm": 0.7960642730075496, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 27916 + }, + { + "epoch": 0.27917, + "grad_norm": 0.8080462094755548, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 27917 + }, + { + "epoch": 0.27918, + "grad_norm": 0.918123729199587, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 27918 + }, + { + "epoch": 0.27919, + "grad_norm": 1.059007929451071, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 27919 + }, + { + "epoch": 0.2792, + "grad_norm": 0.9750033054095458, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 27920 + }, + { + "epoch": 0.27921, + "grad_norm": 0.9911738349754905, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 27921 + }, + { + "epoch": 0.27922, + "grad_norm": 1.1122616060291601, + "learning_rate": 0.003, + "loss": 4.0994, + "step": 27922 + }, + { + "epoch": 0.27923, + "grad_norm": 0.8941226664357055, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 27923 + }, + { + "epoch": 0.27924, + "grad_norm": 0.8650060418915264, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 27924 + }, + { + "epoch": 0.27925, + "grad_norm": 0.8669245934363925, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 27925 + }, + { + "epoch": 0.27926, + "grad_norm": 0.8484777056862682, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 27926 + }, + { + "epoch": 0.27927, + "grad_norm": 0.858498560942655, + "learning_rate": 0.003, + "loss": 4.054, + "step": 27927 + }, + { + "epoch": 0.27928, + "grad_norm": 0.9410009282145507, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 27928 + }, + { + "epoch": 0.27929, + "grad_norm": 1.2881395282030546, + "learning_rate": 0.003, + "loss": 4.072, + "step": 27929 + }, + { + "epoch": 0.2793, + "grad_norm": 0.9035777330005687, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 27930 + }, + { + "epoch": 0.27931, + "grad_norm": 0.9297881233274283, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 27931 + }, + { + "epoch": 0.27932, + "grad_norm": 0.9251317050320439, + "learning_rate": 0.003, + "loss": 4.072, + "step": 27932 + }, + { + "epoch": 0.27933, + "grad_norm": 1.0032493100785973, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 27933 + }, + { + "epoch": 0.27934, + "grad_norm": 1.0280865586937362, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 27934 + }, + { + "epoch": 0.27935, + "grad_norm": 0.9472871665004493, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 27935 + }, + { + "epoch": 0.27936, + "grad_norm": 0.9793774868430313, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 27936 + }, + { + "epoch": 0.27937, + "grad_norm": 0.8269134075614692, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 27937 + }, + { + "epoch": 0.27938, + "grad_norm": 0.6623674181487251, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 27938 + }, + { + "epoch": 0.27939, + "grad_norm": 0.5946702126676037, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 27939 + }, + { + "epoch": 0.2794, + "grad_norm": 0.5745201811728066, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 27940 + }, + { + "epoch": 0.27941, + "grad_norm": 0.5442978655565623, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 27941 + }, + { + "epoch": 0.27942, + "grad_norm": 0.5993411184433973, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 27942 + }, + { + "epoch": 0.27943, + "grad_norm": 0.5966575564921346, + "learning_rate": 0.003, + "loss": 4.031, + "step": 27943 + }, + { + "epoch": 0.27944, + "grad_norm": 0.6855295628045422, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 27944 + }, + { + "epoch": 0.27945, + "grad_norm": 0.8085027755667794, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 27945 + }, + { + "epoch": 0.27946, + "grad_norm": 0.9629798848073549, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 27946 + }, + { + "epoch": 0.27947, + "grad_norm": 1.2446030032556772, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 27947 + }, + { + "epoch": 0.27948, + "grad_norm": 0.745471033704625, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 27948 + }, + { + "epoch": 0.27949, + "grad_norm": 0.8154684729488978, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 27949 + }, + { + "epoch": 0.2795, + "grad_norm": 0.8525491224328593, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 27950 + }, + { + "epoch": 0.27951, + "grad_norm": 0.8603327279350443, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 27951 + }, + { + "epoch": 0.27952, + "grad_norm": 0.9088975966312394, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 27952 + }, + { + "epoch": 0.27953, + "grad_norm": 0.8036664897271155, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 27953 + }, + { + "epoch": 0.27954, + "grad_norm": 0.8411073925673561, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 27954 + }, + { + "epoch": 0.27955, + "grad_norm": 0.7654120928093795, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 27955 + }, + { + "epoch": 0.27956, + "grad_norm": 0.7851092890440728, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 27956 + }, + { + "epoch": 0.27957, + "grad_norm": 0.9332914179269372, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 27957 + }, + { + "epoch": 0.27958, + "grad_norm": 1.2131963946057815, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 27958 + }, + { + "epoch": 0.27959, + "grad_norm": 0.8694858836383407, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 27959 + }, + { + "epoch": 0.2796, + "grad_norm": 0.7841455344323105, + "learning_rate": 0.003, + "loss": 4.04, + "step": 27960 + }, + { + "epoch": 0.27961, + "grad_norm": 0.7156746192352654, + "learning_rate": 0.003, + "loss": 4.057, + "step": 27961 + }, + { + "epoch": 0.27962, + "grad_norm": 0.7433449407276909, + "learning_rate": 0.003, + "loss": 4.064, + "step": 27962 + }, + { + "epoch": 0.27963, + "grad_norm": 0.8381300417895519, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 27963 + }, + { + "epoch": 0.27964, + "grad_norm": 0.9588249602415594, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 27964 + }, + { + "epoch": 0.27965, + "grad_norm": 0.9877942687618677, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 27965 + }, + { + "epoch": 0.27966, + "grad_norm": 0.8475834498159907, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 27966 + }, + { + "epoch": 0.27967, + "grad_norm": 0.8142775563938072, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 27967 + }, + { + "epoch": 0.27968, + "grad_norm": 0.6921921034946628, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 27968 + }, + { + "epoch": 0.27969, + "grad_norm": 0.5841439697314327, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 27969 + }, + { + "epoch": 0.2797, + "grad_norm": 0.5580503829444775, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 27970 + }, + { + "epoch": 0.27971, + "grad_norm": 0.6555573268844771, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 27971 + }, + { + "epoch": 0.27972, + "grad_norm": 0.7948088913928125, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 27972 + }, + { + "epoch": 0.27973, + "grad_norm": 0.9420387324940408, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 27973 + }, + { + "epoch": 0.27974, + "grad_norm": 1.1721628055930555, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 27974 + }, + { + "epoch": 0.27975, + "grad_norm": 0.8795338698251182, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 27975 + }, + { + "epoch": 0.27976, + "grad_norm": 0.8792674692208562, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 27976 + }, + { + "epoch": 0.27977, + "grad_norm": 0.9975051050597962, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 27977 + }, + { + "epoch": 0.27978, + "grad_norm": 0.9688880546064084, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 27978 + }, + { + "epoch": 0.27979, + "grad_norm": 1.0026211651197914, + "learning_rate": 0.003, + "loss": 4.049, + "step": 27979 + }, + { + "epoch": 0.2798, + "grad_norm": 0.9932254331445153, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 27980 + }, + { + "epoch": 0.27981, + "grad_norm": 1.0276352047310178, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 27981 + }, + { + "epoch": 0.27982, + "grad_norm": 0.9487216749557594, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 27982 + }, + { + "epoch": 0.27983, + "grad_norm": 0.8655200692085009, + "learning_rate": 0.003, + "loss": 4.03, + "step": 27983 + }, + { + "epoch": 0.27984, + "grad_norm": 0.8140856034965781, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 27984 + }, + { + "epoch": 0.27985, + "grad_norm": 0.8279083627366199, + "learning_rate": 0.003, + "loss": 4.028, + "step": 27985 + }, + { + "epoch": 0.27986, + "grad_norm": 1.0088555468106388, + "learning_rate": 0.003, + "loss": 4.07, + "step": 27986 + }, + { + "epoch": 0.27987, + "grad_norm": 1.318976441954303, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 27987 + }, + { + "epoch": 0.27988, + "grad_norm": 0.804546594664036, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 27988 + }, + { + "epoch": 0.27989, + "grad_norm": 0.7651213859258367, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 27989 + }, + { + "epoch": 0.2799, + "grad_norm": 0.8939963391897597, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 27990 + }, + { + "epoch": 0.27991, + "grad_norm": 1.1177666878733, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 27991 + }, + { + "epoch": 0.27992, + "grad_norm": 0.963233675789223, + "learning_rate": 0.003, + "loss": 4.053, + "step": 27992 + }, + { + "epoch": 0.27993, + "grad_norm": 0.7847430097747101, + "learning_rate": 0.003, + "loss": 4.043, + "step": 27993 + }, + { + "epoch": 0.27994, + "grad_norm": 0.6463821898267385, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 27994 + }, + { + "epoch": 0.27995, + "grad_norm": 0.5747246805024196, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 27995 + }, + { + "epoch": 0.27996, + "grad_norm": 0.630157050553995, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 27996 + }, + { + "epoch": 0.27997, + "grad_norm": 0.6993048894990165, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 27997 + }, + { + "epoch": 0.27998, + "grad_norm": 0.7225750856606026, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 27998 + }, + { + "epoch": 0.27999, + "grad_norm": 0.6802690340141045, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 27999 + }, + { + "epoch": 0.28, + "grad_norm": 0.6978310605733629, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 28000 + }, + { + "epoch": 0.28001, + "grad_norm": 0.792401153514491, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 28001 + }, + { + "epoch": 0.28002, + "grad_norm": 0.8818046149496479, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 28002 + }, + { + "epoch": 0.28003, + "grad_norm": 0.9941890534874771, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 28003 + }, + { + "epoch": 0.28004, + "grad_norm": 1.1394295606029359, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 28004 + }, + { + "epoch": 0.28005, + "grad_norm": 1.0372082725485239, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 28005 + }, + { + "epoch": 0.28006, + "grad_norm": 1.0880496023172948, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 28006 + }, + { + "epoch": 0.28007, + "grad_norm": 0.9906354507510764, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 28007 + }, + { + "epoch": 0.28008, + "grad_norm": 1.058160062012444, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28008 + }, + { + "epoch": 0.28009, + "grad_norm": 0.8322046229994103, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28009 + }, + { + "epoch": 0.2801, + "grad_norm": 0.7462792892435819, + "learning_rate": 0.003, + "loss": 4.055, + "step": 28010 + }, + { + "epoch": 0.28011, + "grad_norm": 0.6365064239709616, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 28011 + }, + { + "epoch": 0.28012, + "grad_norm": 0.6200562379092251, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 28012 + }, + { + "epoch": 0.28013, + "grad_norm": 0.641220912352142, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28013 + }, + { + "epoch": 0.28014, + "grad_norm": 0.6464114049469843, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 28014 + }, + { + "epoch": 0.28015, + "grad_norm": 0.7425767896574786, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28015 + }, + { + "epoch": 0.28016, + "grad_norm": 1.024305623376408, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 28016 + }, + { + "epoch": 0.28017, + "grad_norm": 1.254887444811122, + "learning_rate": 0.003, + "loss": 4.08, + "step": 28017 + }, + { + "epoch": 0.28018, + "grad_norm": 0.792178635742736, + "learning_rate": 0.003, + "loss": 4.052, + "step": 28018 + }, + { + "epoch": 0.28019, + "grad_norm": 0.6883560724226341, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 28019 + }, + { + "epoch": 0.2802, + "grad_norm": 0.6968968012898124, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 28020 + }, + { + "epoch": 0.28021, + "grad_norm": 0.6993532616850903, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 28021 + }, + { + "epoch": 0.28022, + "grad_norm": 0.7043072899634024, + "learning_rate": 0.003, + "loss": 4.052, + "step": 28022 + }, + { + "epoch": 0.28023, + "grad_norm": 0.8563000656902159, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 28023 + }, + { + "epoch": 0.28024, + "grad_norm": 1.147966632861741, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 28024 + }, + { + "epoch": 0.28025, + "grad_norm": 1.0788569084610193, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 28025 + }, + { + "epoch": 0.28026, + "grad_norm": 0.8740509850335905, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 28026 + }, + { + "epoch": 0.28027, + "grad_norm": 0.8379246708998198, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 28027 + }, + { + "epoch": 0.28028, + "grad_norm": 1.0286367206852993, + "learning_rate": 0.003, + "loss": 4.093, + "step": 28028 + }, + { + "epoch": 0.28029, + "grad_norm": 0.9255602027884176, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 28029 + }, + { + "epoch": 0.2803, + "grad_norm": 0.91161934001326, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 28030 + }, + { + "epoch": 0.28031, + "grad_norm": 0.8457986495035507, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 28031 + }, + { + "epoch": 0.28032, + "grad_norm": 0.83961713159867, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 28032 + }, + { + "epoch": 0.28033, + "grad_norm": 1.0918146882887996, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 28033 + }, + { + "epoch": 0.28034, + "grad_norm": 1.2498988903773436, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 28034 + }, + { + "epoch": 0.28035, + "grad_norm": 0.7353889376054008, + "learning_rate": 0.003, + "loss": 4.051, + "step": 28035 + }, + { + "epoch": 0.28036, + "grad_norm": 0.6854077548198501, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28036 + }, + { + "epoch": 0.28037, + "grad_norm": 0.777977529614316, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 28037 + }, + { + "epoch": 0.28038, + "grad_norm": 0.8462242894586985, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28038 + }, + { + "epoch": 0.28039, + "grad_norm": 0.8841290128983104, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 28039 + }, + { + "epoch": 0.2804, + "grad_norm": 0.9987232041674949, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 28040 + }, + { + "epoch": 0.28041, + "grad_norm": 1.1680975591460554, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 28041 + }, + { + "epoch": 0.28042, + "grad_norm": 0.9288047118801263, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 28042 + }, + { + "epoch": 0.28043, + "grad_norm": 0.8197164782621761, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 28043 + }, + { + "epoch": 0.28044, + "grad_norm": 0.6917657643318051, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 28044 + }, + { + "epoch": 0.28045, + "grad_norm": 0.7898174947412675, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 28045 + }, + { + "epoch": 0.28046, + "grad_norm": 0.8028080103708727, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 28046 + }, + { + "epoch": 0.28047, + "grad_norm": 0.897632487293533, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 28047 + }, + { + "epoch": 0.28048, + "grad_norm": 1.012344292707383, + "learning_rate": 0.003, + "loss": 4.045, + "step": 28048 + }, + { + "epoch": 0.28049, + "grad_norm": 1.0490924384944986, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 28049 + }, + { + "epoch": 0.2805, + "grad_norm": 0.9110788096522845, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 28050 + }, + { + "epoch": 0.28051, + "grad_norm": 0.7726600021556684, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 28051 + }, + { + "epoch": 0.28052, + "grad_norm": 0.6617614650477526, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 28052 + }, + { + "epoch": 0.28053, + "grad_norm": 0.7879979558224586, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 28053 + }, + { + "epoch": 0.28054, + "grad_norm": 0.876280217785457, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 28054 + }, + { + "epoch": 0.28055, + "grad_norm": 0.8835865973401837, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 28055 + }, + { + "epoch": 0.28056, + "grad_norm": 0.9411454316014107, + "learning_rate": 0.003, + "loss": 4.069, + "step": 28056 + }, + { + "epoch": 0.28057, + "grad_norm": 0.8976891833258241, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 28057 + }, + { + "epoch": 0.28058, + "grad_norm": 0.7408487822906077, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 28058 + }, + { + "epoch": 0.28059, + "grad_norm": 0.629877688024636, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 28059 + }, + { + "epoch": 0.2806, + "grad_norm": 0.5861682481843686, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 28060 + }, + { + "epoch": 0.28061, + "grad_norm": 0.5705007456171375, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 28061 + }, + { + "epoch": 0.28062, + "grad_norm": 0.5839134571552621, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 28062 + }, + { + "epoch": 0.28063, + "grad_norm": 0.6418554943272836, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28063 + }, + { + "epoch": 0.28064, + "grad_norm": 0.7667902324827514, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 28064 + }, + { + "epoch": 0.28065, + "grad_norm": 0.8794799413618, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28065 + }, + { + "epoch": 0.28066, + "grad_norm": 0.8919335689815453, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 28066 + }, + { + "epoch": 0.28067, + "grad_norm": 0.973464464225869, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 28067 + }, + { + "epoch": 0.28068, + "grad_norm": 1.066227042271281, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 28068 + }, + { + "epoch": 0.28069, + "grad_norm": 0.884096687204083, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 28069 + }, + { + "epoch": 0.2807, + "grad_norm": 1.1076768044921277, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 28070 + }, + { + "epoch": 0.28071, + "grad_norm": 1.0747297559220055, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 28071 + }, + { + "epoch": 0.28072, + "grad_norm": 0.9271490189456614, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 28072 + }, + { + "epoch": 0.28073, + "grad_norm": 0.8986805297422971, + "learning_rate": 0.003, + "loss": 4.022, + "step": 28073 + }, + { + "epoch": 0.28074, + "grad_norm": 0.8471311387291113, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 28074 + }, + { + "epoch": 0.28075, + "grad_norm": 0.9196323075723072, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 28075 + }, + { + "epoch": 0.28076, + "grad_norm": 1.21950285579724, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 28076 + }, + { + "epoch": 0.28077, + "grad_norm": 1.0434121487062433, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 28077 + }, + { + "epoch": 0.28078, + "grad_norm": 0.9959024592713192, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 28078 + }, + { + "epoch": 0.28079, + "grad_norm": 1.0036524351507201, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 28079 + }, + { + "epoch": 0.2808, + "grad_norm": 0.9257760523030367, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 28080 + }, + { + "epoch": 0.28081, + "grad_norm": 0.911869163902302, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 28081 + }, + { + "epoch": 0.28082, + "grad_norm": 0.854474045736869, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 28082 + }, + { + "epoch": 0.28083, + "grad_norm": 0.7732663917305774, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 28083 + }, + { + "epoch": 0.28084, + "grad_norm": 0.7982154689464331, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 28084 + }, + { + "epoch": 0.28085, + "grad_norm": 0.7814169607075075, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 28085 + }, + { + "epoch": 0.28086, + "grad_norm": 0.7481340363832668, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 28086 + }, + { + "epoch": 0.28087, + "grad_norm": 0.742342668252692, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 28087 + }, + { + "epoch": 0.28088, + "grad_norm": 0.632289889319823, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 28088 + }, + { + "epoch": 0.28089, + "grad_norm": 0.6420516119624792, + "learning_rate": 0.003, + "loss": 4.049, + "step": 28089 + }, + { + "epoch": 0.2809, + "grad_norm": 0.6065177225097301, + "learning_rate": 0.003, + "loss": 4.02, + "step": 28090 + }, + { + "epoch": 0.28091, + "grad_norm": 0.709820383133188, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 28091 + }, + { + "epoch": 0.28092, + "grad_norm": 0.8682239534479389, + "learning_rate": 0.003, + "loss": 4.081, + "step": 28092 + }, + { + "epoch": 0.28093, + "grad_norm": 1.1145722448807889, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 28093 + }, + { + "epoch": 0.28094, + "grad_norm": 1.0995172327216474, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28094 + }, + { + "epoch": 0.28095, + "grad_norm": 0.7410414743703411, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 28095 + }, + { + "epoch": 0.28096, + "grad_norm": 0.6726409198350183, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 28096 + }, + { + "epoch": 0.28097, + "grad_norm": 0.7206661348289416, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 28097 + }, + { + "epoch": 0.28098, + "grad_norm": 0.7344826454859487, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 28098 + }, + { + "epoch": 0.28099, + "grad_norm": 0.8029258377630194, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 28099 + }, + { + "epoch": 0.281, + "grad_norm": 0.7625212298518022, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 28100 + }, + { + "epoch": 0.28101, + "grad_norm": 0.7694156933165289, + "learning_rate": 0.003, + "loss": 3.999, + "step": 28101 + }, + { + "epoch": 0.28102, + "grad_norm": 0.8013740263975098, + "learning_rate": 0.003, + "loss": 4.001, + "step": 28102 + }, + { + "epoch": 0.28103, + "grad_norm": 0.7970503553980725, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 28103 + }, + { + "epoch": 0.28104, + "grad_norm": 0.8255993888748747, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 28104 + }, + { + "epoch": 0.28105, + "grad_norm": 0.7573721900358236, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28105 + }, + { + "epoch": 0.28106, + "grad_norm": 0.6927581237302101, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 28106 + }, + { + "epoch": 0.28107, + "grad_norm": 0.8280779631509771, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 28107 + }, + { + "epoch": 0.28108, + "grad_norm": 0.9654121443579612, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 28108 + }, + { + "epoch": 0.28109, + "grad_norm": 0.9491568637569016, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 28109 + }, + { + "epoch": 0.2811, + "grad_norm": 0.9660593689368192, + "learning_rate": 0.003, + "loss": 4.035, + "step": 28110 + }, + { + "epoch": 0.28111, + "grad_norm": 1.1707432623966816, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 28111 + }, + { + "epoch": 0.28112, + "grad_norm": 0.9824792601050164, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 28112 + }, + { + "epoch": 0.28113, + "grad_norm": 1.1128808368651908, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 28113 + }, + { + "epoch": 0.28114, + "grad_norm": 0.9436636896398909, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 28114 + }, + { + "epoch": 0.28115, + "grad_norm": 0.9298411791672035, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 28115 + }, + { + "epoch": 0.28116, + "grad_norm": 0.8693170146225, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 28116 + }, + { + "epoch": 0.28117, + "grad_norm": 0.9782251863803261, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 28117 + }, + { + "epoch": 0.28118, + "grad_norm": 0.9319861230604518, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 28118 + }, + { + "epoch": 0.28119, + "grad_norm": 0.9771027245549708, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 28119 + }, + { + "epoch": 0.2812, + "grad_norm": 0.9522616902976506, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 28120 + }, + { + "epoch": 0.28121, + "grad_norm": 0.8871586590618997, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 28121 + }, + { + "epoch": 0.28122, + "grad_norm": 0.88123372453046, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 28122 + }, + { + "epoch": 0.28123, + "grad_norm": 1.0241777694491345, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 28123 + }, + { + "epoch": 0.28124, + "grad_norm": 1.0844433168667353, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 28124 + }, + { + "epoch": 0.28125, + "grad_norm": 1.067333950774928, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 28125 + }, + { + "epoch": 0.28126, + "grad_norm": 0.8204318245407727, + "learning_rate": 0.003, + "loss": 4.043, + "step": 28126 + }, + { + "epoch": 0.28127, + "grad_norm": 0.7195118346432481, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 28127 + }, + { + "epoch": 0.28128, + "grad_norm": 0.8820027913409202, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 28128 + }, + { + "epoch": 0.28129, + "grad_norm": 0.9604981629570484, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28129 + }, + { + "epoch": 0.2813, + "grad_norm": 1.0523989474778024, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 28130 + }, + { + "epoch": 0.28131, + "grad_norm": 0.9194506133545249, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 28131 + }, + { + "epoch": 0.28132, + "grad_norm": 0.8024699801436255, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 28132 + }, + { + "epoch": 0.28133, + "grad_norm": 0.7123743620998565, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 28133 + }, + { + "epoch": 0.28134, + "grad_norm": 0.6403913547271601, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 28134 + }, + { + "epoch": 0.28135, + "grad_norm": 0.6568282637167792, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 28135 + }, + { + "epoch": 0.28136, + "grad_norm": 0.6848772464866574, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 28136 + }, + { + "epoch": 0.28137, + "grad_norm": 0.7033363618862843, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 28137 + }, + { + "epoch": 0.28138, + "grad_norm": 0.6544817485644114, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 28138 + }, + { + "epoch": 0.28139, + "grad_norm": 0.7278087113513992, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 28139 + }, + { + "epoch": 0.2814, + "grad_norm": 0.8315153272810895, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 28140 + }, + { + "epoch": 0.28141, + "grad_norm": 0.9350560687958722, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 28141 + }, + { + "epoch": 0.28142, + "grad_norm": 0.9738023901716829, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 28142 + }, + { + "epoch": 0.28143, + "grad_norm": 0.9664464244538686, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 28143 + }, + { + "epoch": 0.28144, + "grad_norm": 0.9198284352573544, + "learning_rate": 0.003, + "loss": 4.1012, + "step": 28144 + }, + { + "epoch": 0.28145, + "grad_norm": 0.7828724661638736, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 28145 + }, + { + "epoch": 0.28146, + "grad_norm": 0.8358004096559631, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 28146 + }, + { + "epoch": 0.28147, + "grad_norm": 0.9843589798857394, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 28147 + }, + { + "epoch": 0.28148, + "grad_norm": 1.0062359503984668, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 28148 + }, + { + "epoch": 0.28149, + "grad_norm": 1.0323069374152105, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 28149 + }, + { + "epoch": 0.2815, + "grad_norm": 0.9347358269610015, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 28150 + }, + { + "epoch": 0.28151, + "grad_norm": 1.085372950065363, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 28151 + }, + { + "epoch": 0.28152, + "grad_norm": 0.9397812774363618, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 28152 + }, + { + "epoch": 0.28153, + "grad_norm": 0.8734882219180954, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 28153 + }, + { + "epoch": 0.28154, + "grad_norm": 0.7814307375143057, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 28154 + }, + { + "epoch": 0.28155, + "grad_norm": 0.6952285444470059, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 28155 + }, + { + "epoch": 0.28156, + "grad_norm": 0.8397622767134504, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 28156 + }, + { + "epoch": 0.28157, + "grad_norm": 0.8209586092541783, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 28157 + }, + { + "epoch": 0.28158, + "grad_norm": 0.7181336381325689, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 28158 + }, + { + "epoch": 0.28159, + "grad_norm": 0.8604883261568401, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 28159 + }, + { + "epoch": 0.2816, + "grad_norm": 0.964771171487784, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 28160 + }, + { + "epoch": 0.28161, + "grad_norm": 0.869864790671891, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28161 + }, + { + "epoch": 0.28162, + "grad_norm": 0.8426479989666904, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 28162 + }, + { + "epoch": 0.28163, + "grad_norm": 0.8869618603463989, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 28163 + }, + { + "epoch": 0.28164, + "grad_norm": 0.9151362014883557, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 28164 + }, + { + "epoch": 0.28165, + "grad_norm": 0.9156498708521942, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 28165 + }, + { + "epoch": 0.28166, + "grad_norm": 0.864796014710332, + "learning_rate": 0.003, + "loss": 4.07, + "step": 28166 + }, + { + "epoch": 0.28167, + "grad_norm": 0.815346099767291, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 28167 + }, + { + "epoch": 0.28168, + "grad_norm": 0.8111521259932593, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 28168 + }, + { + "epoch": 0.28169, + "grad_norm": 0.7791251075544184, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 28169 + }, + { + "epoch": 0.2817, + "grad_norm": 0.7531376211438611, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 28170 + }, + { + "epoch": 0.28171, + "grad_norm": 0.7158477732427472, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 28171 + }, + { + "epoch": 0.28172, + "grad_norm": 0.8639409042569773, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 28172 + }, + { + "epoch": 0.28173, + "grad_norm": 1.0584963011450212, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 28173 + }, + { + "epoch": 0.28174, + "grad_norm": 1.1056171315534873, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 28174 + }, + { + "epoch": 0.28175, + "grad_norm": 0.7954393135427869, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 28175 + }, + { + "epoch": 0.28176, + "grad_norm": 0.8117298131021036, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 28176 + }, + { + "epoch": 0.28177, + "grad_norm": 0.8639984696898506, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28177 + }, + { + "epoch": 0.28178, + "grad_norm": 1.0574065925156941, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 28178 + }, + { + "epoch": 0.28179, + "grad_norm": 1.107785625765744, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 28179 + }, + { + "epoch": 0.2818, + "grad_norm": 0.8713944371064622, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 28180 + }, + { + "epoch": 0.28181, + "grad_norm": 0.8465039011656147, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 28181 + }, + { + "epoch": 0.28182, + "grad_norm": 0.8505639817588961, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 28182 + }, + { + "epoch": 0.28183, + "grad_norm": 0.7935915049053812, + "learning_rate": 0.003, + "loss": 4.021, + "step": 28183 + }, + { + "epoch": 0.28184, + "grad_norm": 0.8767463555296504, + "learning_rate": 0.003, + "loss": 3.9896, + "step": 28184 + }, + { + "epoch": 0.28185, + "grad_norm": 1.0795249011036232, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 28185 + }, + { + "epoch": 0.28186, + "grad_norm": 0.9078598609073928, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 28186 + }, + { + "epoch": 0.28187, + "grad_norm": 0.8464772800526946, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 28187 + }, + { + "epoch": 0.28188, + "grad_norm": 1.05085158654088, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 28188 + }, + { + "epoch": 0.28189, + "grad_norm": 0.9509395939647651, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 28189 + }, + { + "epoch": 0.2819, + "grad_norm": 1.1626826335663998, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 28190 + }, + { + "epoch": 0.28191, + "grad_norm": 0.8639524891972427, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 28191 + }, + { + "epoch": 0.28192, + "grad_norm": 0.8413745019266091, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 28192 + }, + { + "epoch": 0.28193, + "grad_norm": 0.9762164480373203, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 28193 + }, + { + "epoch": 0.28194, + "grad_norm": 1.1505147751752778, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 28194 + }, + { + "epoch": 0.28195, + "grad_norm": 0.8714257123710107, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 28195 + }, + { + "epoch": 0.28196, + "grad_norm": 0.8190312405652552, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 28196 + }, + { + "epoch": 0.28197, + "grad_norm": 0.7083915120832939, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 28197 + }, + { + "epoch": 0.28198, + "grad_norm": 0.7118363356784513, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 28198 + }, + { + "epoch": 0.28199, + "grad_norm": 0.7576966264696798, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 28199 + }, + { + "epoch": 0.282, + "grad_norm": 0.7013751518112019, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 28200 + }, + { + "epoch": 0.28201, + "grad_norm": 0.6924670475129973, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 28201 + }, + { + "epoch": 0.28202, + "grad_norm": 0.6701044164200314, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 28202 + }, + { + "epoch": 0.28203, + "grad_norm": 0.7147755071820038, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 28203 + }, + { + "epoch": 0.28204, + "grad_norm": 0.8550476656120515, + "learning_rate": 0.003, + "loss": 4.057, + "step": 28204 + }, + { + "epoch": 0.28205, + "grad_norm": 0.7821505551737349, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 28205 + }, + { + "epoch": 0.28206, + "grad_norm": 0.8145687099711261, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 28206 + }, + { + "epoch": 0.28207, + "grad_norm": 0.9971051065289025, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 28207 + }, + { + "epoch": 0.28208, + "grad_norm": 1.3170040492329025, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 28208 + }, + { + "epoch": 0.28209, + "grad_norm": 0.6929366740383698, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 28209 + }, + { + "epoch": 0.2821, + "grad_norm": 0.6195448235850514, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 28210 + }, + { + "epoch": 0.28211, + "grad_norm": 0.6703323793680732, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 28211 + }, + { + "epoch": 0.28212, + "grad_norm": 0.7078195913303464, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28212 + }, + { + "epoch": 0.28213, + "grad_norm": 0.7259258607462531, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28213 + }, + { + "epoch": 0.28214, + "grad_norm": 0.7306179723273285, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28214 + }, + { + "epoch": 0.28215, + "grad_norm": 0.8062555283029527, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 28215 + }, + { + "epoch": 0.28216, + "grad_norm": 0.8679362047946177, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 28216 + }, + { + "epoch": 0.28217, + "grad_norm": 0.9991109212290636, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28217 + }, + { + "epoch": 0.28218, + "grad_norm": 1.103876658031368, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 28218 + }, + { + "epoch": 0.28219, + "grad_norm": 0.9490654348234014, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 28219 + }, + { + "epoch": 0.2822, + "grad_norm": 1.0531165941479594, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 28220 + }, + { + "epoch": 0.28221, + "grad_norm": 0.9663026850558428, + "learning_rate": 0.003, + "loss": 4.056, + "step": 28221 + }, + { + "epoch": 0.28222, + "grad_norm": 0.8305585626675177, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 28222 + }, + { + "epoch": 0.28223, + "grad_norm": 0.7256774548013621, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 28223 + }, + { + "epoch": 0.28224, + "grad_norm": 0.7157274554843289, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28224 + }, + { + "epoch": 0.28225, + "grad_norm": 0.7600991419009422, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 28225 + }, + { + "epoch": 0.28226, + "grad_norm": 0.5672567639339383, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 28226 + }, + { + "epoch": 0.28227, + "grad_norm": 0.6199196468163255, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 28227 + }, + { + "epoch": 0.28228, + "grad_norm": 0.7574550764533049, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 28228 + }, + { + "epoch": 0.28229, + "grad_norm": 0.8869521442159312, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 28229 + }, + { + "epoch": 0.2823, + "grad_norm": 1.1931566139936698, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 28230 + }, + { + "epoch": 0.28231, + "grad_norm": 0.956220342852974, + "learning_rate": 0.003, + "loss": 4.033, + "step": 28231 + }, + { + "epoch": 0.28232, + "grad_norm": 1.0249511084528655, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 28232 + }, + { + "epoch": 0.28233, + "grad_norm": 0.9722567028854423, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28233 + }, + { + "epoch": 0.28234, + "grad_norm": 0.8704189171986808, + "learning_rate": 0.003, + "loss": 4.051, + "step": 28234 + }, + { + "epoch": 0.28235, + "grad_norm": 0.772673969426097, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 28235 + }, + { + "epoch": 0.28236, + "grad_norm": 0.7172285002598241, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 28236 + }, + { + "epoch": 0.28237, + "grad_norm": 0.7880859565992895, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 28237 + }, + { + "epoch": 0.28238, + "grad_norm": 0.9259399436633314, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 28238 + }, + { + "epoch": 0.28239, + "grad_norm": 0.8752579024245218, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 28239 + }, + { + "epoch": 0.2824, + "grad_norm": 0.8052160750235269, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 28240 + }, + { + "epoch": 0.28241, + "grad_norm": 0.7766720364276999, + "learning_rate": 0.003, + "loss": 4.053, + "step": 28241 + }, + { + "epoch": 0.28242, + "grad_norm": 0.7218784776751477, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 28242 + }, + { + "epoch": 0.28243, + "grad_norm": 0.8005574858540452, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 28243 + }, + { + "epoch": 0.28244, + "grad_norm": 0.8391641274513753, + "learning_rate": 0.003, + "loss": 4.034, + "step": 28244 + }, + { + "epoch": 0.28245, + "grad_norm": 0.9624776705109975, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28245 + }, + { + "epoch": 0.28246, + "grad_norm": 1.1013495431357332, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 28246 + }, + { + "epoch": 0.28247, + "grad_norm": 0.9141613254983036, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28247 + }, + { + "epoch": 0.28248, + "grad_norm": 0.9213554421512912, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 28248 + }, + { + "epoch": 0.28249, + "grad_norm": 0.8863299804477717, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 28249 + }, + { + "epoch": 0.2825, + "grad_norm": 0.9017098164315972, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 28250 + }, + { + "epoch": 0.28251, + "grad_norm": 0.965237227832434, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28251 + }, + { + "epoch": 0.28252, + "grad_norm": 0.9150395331222423, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 28252 + }, + { + "epoch": 0.28253, + "grad_norm": 0.9923713089176346, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28253 + }, + { + "epoch": 0.28254, + "grad_norm": 1.1396821128647427, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 28254 + }, + { + "epoch": 0.28255, + "grad_norm": 1.0727447613033139, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28255 + }, + { + "epoch": 0.28256, + "grad_norm": 1.1050199187088274, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28256 + }, + { + "epoch": 0.28257, + "grad_norm": 0.954500085955581, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 28257 + }, + { + "epoch": 0.28258, + "grad_norm": 0.9814661462062373, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 28258 + }, + { + "epoch": 0.28259, + "grad_norm": 0.8586861676574926, + "learning_rate": 0.003, + "loss": 4.034, + "step": 28259 + }, + { + "epoch": 0.2826, + "grad_norm": 0.8021755481434575, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 28260 + }, + { + "epoch": 0.28261, + "grad_norm": 0.860195487074539, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 28261 + }, + { + "epoch": 0.28262, + "grad_norm": 0.8437195768085451, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 28262 + }, + { + "epoch": 0.28263, + "grad_norm": 0.8412309030797578, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 28263 + }, + { + "epoch": 0.28264, + "grad_norm": 0.8431538632819824, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 28264 + }, + { + "epoch": 0.28265, + "grad_norm": 0.7886352619561732, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 28265 + }, + { + "epoch": 0.28266, + "grad_norm": 0.8273477841989267, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 28266 + }, + { + "epoch": 0.28267, + "grad_norm": 0.8662850287550045, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 28267 + }, + { + "epoch": 0.28268, + "grad_norm": 0.9522480760006031, + "learning_rate": 0.003, + "loss": 4.06, + "step": 28268 + }, + { + "epoch": 0.28269, + "grad_norm": 1.0905538963420323, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 28269 + }, + { + "epoch": 0.2827, + "grad_norm": 0.8752338203179475, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 28270 + }, + { + "epoch": 0.28271, + "grad_norm": 0.8117612669645988, + "learning_rate": 0.003, + "loss": 4.0985, + "step": 28271 + }, + { + "epoch": 0.28272, + "grad_norm": 0.7811353198898225, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 28272 + }, + { + "epoch": 0.28273, + "grad_norm": 0.7178233659034161, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 28273 + }, + { + "epoch": 0.28274, + "grad_norm": 0.6523313412260994, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 28274 + }, + { + "epoch": 0.28275, + "grad_norm": 0.6967378029178256, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 28275 + }, + { + "epoch": 0.28276, + "grad_norm": 0.8548617523019947, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 28276 + }, + { + "epoch": 0.28277, + "grad_norm": 1.141492298654007, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 28277 + }, + { + "epoch": 0.28278, + "grad_norm": 0.9205699478385692, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 28278 + }, + { + "epoch": 0.28279, + "grad_norm": 0.7241236202250745, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 28279 + }, + { + "epoch": 0.2828, + "grad_norm": 0.7427962382256454, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 28280 + }, + { + "epoch": 0.28281, + "grad_norm": 0.8167945233820126, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28281 + }, + { + "epoch": 0.28282, + "grad_norm": 0.8409863288531024, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 28282 + }, + { + "epoch": 0.28283, + "grad_norm": 0.8137626030495895, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 28283 + }, + { + "epoch": 0.28284, + "grad_norm": 0.9548363661514379, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 28284 + }, + { + "epoch": 0.28285, + "grad_norm": 1.0559393581044432, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 28285 + }, + { + "epoch": 0.28286, + "grad_norm": 0.9710812085799814, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 28286 + }, + { + "epoch": 0.28287, + "grad_norm": 0.8126393619955529, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 28287 + }, + { + "epoch": 0.28288, + "grad_norm": 0.8400615669447016, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 28288 + }, + { + "epoch": 0.28289, + "grad_norm": 0.9667834512644995, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28289 + }, + { + "epoch": 0.2829, + "grad_norm": 0.9538188587627349, + "learning_rate": 0.003, + "loss": 4.023, + "step": 28290 + }, + { + "epoch": 0.28291, + "grad_norm": 0.9390098114524018, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 28291 + }, + { + "epoch": 0.28292, + "grad_norm": 0.9602841737724106, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 28292 + }, + { + "epoch": 0.28293, + "grad_norm": 0.986154003979366, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 28293 + }, + { + "epoch": 0.28294, + "grad_norm": 0.8705095940201407, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 28294 + }, + { + "epoch": 0.28295, + "grad_norm": 0.8293374347524015, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 28295 + }, + { + "epoch": 0.28296, + "grad_norm": 0.8280917798988214, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 28296 + }, + { + "epoch": 0.28297, + "grad_norm": 0.8716438947973035, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 28297 + }, + { + "epoch": 0.28298, + "grad_norm": 0.9844496754166078, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 28298 + }, + { + "epoch": 0.28299, + "grad_norm": 1.1097938424179985, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 28299 + }, + { + "epoch": 0.283, + "grad_norm": 0.938824106603746, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 28300 + }, + { + "epoch": 0.28301, + "grad_norm": 1.0744371513352338, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 28301 + }, + { + "epoch": 0.28302, + "grad_norm": 0.9635558870327866, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28302 + }, + { + "epoch": 0.28303, + "grad_norm": 0.7714292338849613, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 28303 + }, + { + "epoch": 0.28304, + "grad_norm": 0.6330090683663562, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 28304 + }, + { + "epoch": 0.28305, + "grad_norm": 0.6467654101024599, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 28305 + }, + { + "epoch": 0.28306, + "grad_norm": 0.6487490601895508, + "learning_rate": 0.003, + "loss": 4.031, + "step": 28306 + }, + { + "epoch": 0.28307, + "grad_norm": 0.7416176037728157, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 28307 + }, + { + "epoch": 0.28308, + "grad_norm": 0.7392811899959272, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 28308 + }, + { + "epoch": 0.28309, + "grad_norm": 0.6184831422020652, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 28309 + }, + { + "epoch": 0.2831, + "grad_norm": 0.5704995144190532, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 28310 + }, + { + "epoch": 0.28311, + "grad_norm": 0.565276742654718, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 28311 + }, + { + "epoch": 0.28312, + "grad_norm": 0.6092218961226599, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 28312 + }, + { + "epoch": 0.28313, + "grad_norm": 0.6239629745158332, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 28313 + }, + { + "epoch": 0.28314, + "grad_norm": 0.6013439743049495, + "learning_rate": 0.003, + "loss": 4.031, + "step": 28314 + }, + { + "epoch": 0.28315, + "grad_norm": 0.5901684515048417, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 28315 + }, + { + "epoch": 0.28316, + "grad_norm": 0.7407785552526108, + "learning_rate": 0.003, + "loss": 4.023, + "step": 28316 + }, + { + "epoch": 0.28317, + "grad_norm": 1.0835280701329584, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 28317 + }, + { + "epoch": 0.28318, + "grad_norm": 1.0547915463961788, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28318 + }, + { + "epoch": 0.28319, + "grad_norm": 0.8879614999663559, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 28319 + }, + { + "epoch": 0.2832, + "grad_norm": 0.8152934153885669, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 28320 + }, + { + "epoch": 0.28321, + "grad_norm": 0.7455218700188075, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 28321 + }, + { + "epoch": 0.28322, + "grad_norm": 0.7533929298474943, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 28322 + }, + { + "epoch": 0.28323, + "grad_norm": 0.8199852623212297, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 28323 + }, + { + "epoch": 0.28324, + "grad_norm": 0.8535768897844491, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 28324 + }, + { + "epoch": 0.28325, + "grad_norm": 0.8461385332323054, + "learning_rate": 0.003, + "loss": 4.0795, + "step": 28325 + }, + { + "epoch": 0.28326, + "grad_norm": 0.8737690888579969, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 28326 + }, + { + "epoch": 0.28327, + "grad_norm": 1.1527661745195132, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28327 + }, + { + "epoch": 0.28328, + "grad_norm": 0.9720401268452428, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 28328 + }, + { + "epoch": 0.28329, + "grad_norm": 0.9123470801570708, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28329 + }, + { + "epoch": 0.2833, + "grad_norm": 1.069056923336479, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 28330 + }, + { + "epoch": 0.28331, + "grad_norm": 1.111773329457238, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 28331 + }, + { + "epoch": 0.28332, + "grad_norm": 0.8694627906320761, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 28332 + }, + { + "epoch": 0.28333, + "grad_norm": 0.8590422866612256, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 28333 + }, + { + "epoch": 0.28334, + "grad_norm": 0.8624363527129874, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 28334 + }, + { + "epoch": 0.28335, + "grad_norm": 0.8164971170824165, + "learning_rate": 0.003, + "loss": 4.017, + "step": 28335 + }, + { + "epoch": 0.28336, + "grad_norm": 0.7178766988621266, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 28336 + }, + { + "epoch": 0.28337, + "grad_norm": 0.894691151745384, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28337 + }, + { + "epoch": 0.28338, + "grad_norm": 1.143484201048068, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28338 + }, + { + "epoch": 0.28339, + "grad_norm": 0.9543498556747936, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28339 + }, + { + "epoch": 0.2834, + "grad_norm": 0.8910555068110552, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 28340 + }, + { + "epoch": 0.28341, + "grad_norm": 0.7990980382175268, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 28341 + }, + { + "epoch": 0.28342, + "grad_norm": 0.7797125301228177, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 28342 + }, + { + "epoch": 0.28343, + "grad_norm": 0.7237847390391007, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 28343 + }, + { + "epoch": 0.28344, + "grad_norm": 0.7458500534930077, + "learning_rate": 0.003, + "loss": 4.051, + "step": 28344 + }, + { + "epoch": 0.28345, + "grad_norm": 0.6574466977527601, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 28345 + }, + { + "epoch": 0.28346, + "grad_norm": 0.6425277044691546, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 28346 + }, + { + "epoch": 0.28347, + "grad_norm": 0.6567406570521138, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 28347 + }, + { + "epoch": 0.28348, + "grad_norm": 0.6397604084409447, + "learning_rate": 0.003, + "loss": 4.026, + "step": 28348 + }, + { + "epoch": 0.28349, + "grad_norm": 0.678991479186897, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 28349 + }, + { + "epoch": 0.2835, + "grad_norm": 0.6580179951546377, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28350 + }, + { + "epoch": 0.28351, + "grad_norm": 0.6684480985719111, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 28351 + }, + { + "epoch": 0.28352, + "grad_norm": 0.7891825056351334, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28352 + }, + { + "epoch": 0.28353, + "grad_norm": 1.0495828815160515, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 28353 + }, + { + "epoch": 0.28354, + "grad_norm": 1.3209846672063401, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 28354 + }, + { + "epoch": 0.28355, + "grad_norm": 0.7136545067828034, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 28355 + }, + { + "epoch": 0.28356, + "grad_norm": 0.7744366821603748, + "learning_rate": 0.003, + "loss": 3.996, + "step": 28356 + }, + { + "epoch": 0.28357, + "grad_norm": 0.8365887140756004, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 28357 + }, + { + "epoch": 0.28358, + "grad_norm": 0.9383890589522561, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 28358 + }, + { + "epoch": 0.28359, + "grad_norm": 1.0762483131697052, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 28359 + }, + { + "epoch": 0.2836, + "grad_norm": 0.902178388705875, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28360 + }, + { + "epoch": 0.28361, + "grad_norm": 0.7989985039848234, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 28361 + }, + { + "epoch": 0.28362, + "grad_norm": 0.9083886401810168, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 28362 + }, + { + "epoch": 0.28363, + "grad_norm": 1.1221242671112592, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 28363 + }, + { + "epoch": 0.28364, + "grad_norm": 1.1566889445859418, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 28364 + }, + { + "epoch": 0.28365, + "grad_norm": 1.0571742698791344, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 28365 + }, + { + "epoch": 0.28366, + "grad_norm": 0.8309173852494756, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 28366 + }, + { + "epoch": 0.28367, + "grad_norm": 0.7683362983834483, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 28367 + }, + { + "epoch": 0.28368, + "grad_norm": 0.9174978562329252, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28368 + }, + { + "epoch": 0.28369, + "grad_norm": 0.9695409410455902, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 28369 + }, + { + "epoch": 0.2837, + "grad_norm": 0.9456042261756763, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 28370 + }, + { + "epoch": 0.28371, + "grad_norm": 0.9987139301760894, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 28371 + }, + { + "epoch": 0.28372, + "grad_norm": 1.0046369697478696, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 28372 + }, + { + "epoch": 0.28373, + "grad_norm": 1.0994078188059953, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 28373 + }, + { + "epoch": 0.28374, + "grad_norm": 0.9257704762740899, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 28374 + }, + { + "epoch": 0.28375, + "grad_norm": 1.0246734964479787, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 28375 + }, + { + "epoch": 0.28376, + "grad_norm": 1.2431859427491114, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 28376 + }, + { + "epoch": 0.28377, + "grad_norm": 0.9519009927485098, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 28377 + }, + { + "epoch": 0.28378, + "grad_norm": 1.018239886114814, + "learning_rate": 0.003, + "loss": 4.059, + "step": 28378 + }, + { + "epoch": 0.28379, + "grad_norm": 1.1245746836029091, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 28379 + }, + { + "epoch": 0.2838, + "grad_norm": 0.7257416121067121, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 28380 + }, + { + "epoch": 0.28381, + "grad_norm": 0.6320434353914891, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 28381 + }, + { + "epoch": 0.28382, + "grad_norm": 0.753020904458912, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 28382 + }, + { + "epoch": 0.28383, + "grad_norm": 0.7589551961477055, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 28383 + }, + { + "epoch": 0.28384, + "grad_norm": 0.7130353272980438, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28384 + }, + { + "epoch": 0.28385, + "grad_norm": 0.8514072880594915, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 28385 + }, + { + "epoch": 0.28386, + "grad_norm": 0.9048152898838031, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 28386 + }, + { + "epoch": 0.28387, + "grad_norm": 0.8696735484799988, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 28387 + }, + { + "epoch": 0.28388, + "grad_norm": 0.7677338977373028, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 28388 + }, + { + "epoch": 0.28389, + "grad_norm": 0.7447454685996496, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 28389 + }, + { + "epoch": 0.2839, + "grad_norm": 0.7552100659274816, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 28390 + }, + { + "epoch": 0.28391, + "grad_norm": 0.6248123484740445, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28391 + }, + { + "epoch": 0.28392, + "grad_norm": 0.6776257314014614, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 28392 + }, + { + "epoch": 0.28393, + "grad_norm": 0.7622657095356815, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 28393 + }, + { + "epoch": 0.28394, + "grad_norm": 0.9814916871219838, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 28394 + }, + { + "epoch": 0.28395, + "grad_norm": 1.1303138220155156, + "learning_rate": 0.003, + "loss": 4.0921, + "step": 28395 + }, + { + "epoch": 0.28396, + "grad_norm": 0.7871564755435672, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 28396 + }, + { + "epoch": 0.28397, + "grad_norm": 0.838521828857985, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 28397 + }, + { + "epoch": 0.28398, + "grad_norm": 0.8681297655485658, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 28398 + }, + { + "epoch": 0.28399, + "grad_norm": 0.8756869478337493, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 28399 + }, + { + "epoch": 0.284, + "grad_norm": 0.9643518966616424, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 28400 + }, + { + "epoch": 0.28401, + "grad_norm": 1.1302456786450705, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 28401 + }, + { + "epoch": 0.28402, + "grad_norm": 0.8666872990194784, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 28402 + }, + { + "epoch": 0.28403, + "grad_norm": 0.832245804372531, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 28403 + }, + { + "epoch": 0.28404, + "grad_norm": 0.8856896240636183, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 28404 + }, + { + "epoch": 0.28405, + "grad_norm": 0.9115384249373091, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 28405 + }, + { + "epoch": 0.28406, + "grad_norm": 0.8725193211098602, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28406 + }, + { + "epoch": 0.28407, + "grad_norm": 0.8819107892812704, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 28407 + }, + { + "epoch": 0.28408, + "grad_norm": 0.8455360368582739, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 28408 + }, + { + "epoch": 0.28409, + "grad_norm": 0.9698510361406761, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28409 + }, + { + "epoch": 0.2841, + "grad_norm": 1.07553812050799, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 28410 + }, + { + "epoch": 0.28411, + "grad_norm": 0.9722725243211752, + "learning_rate": 0.003, + "loss": 4.069, + "step": 28411 + }, + { + "epoch": 0.28412, + "grad_norm": 0.8192839057346518, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 28412 + }, + { + "epoch": 0.28413, + "grad_norm": 0.7363819485702107, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28413 + }, + { + "epoch": 0.28414, + "grad_norm": 0.723644647906333, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 28414 + }, + { + "epoch": 0.28415, + "grad_norm": 0.6395704231769905, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 28415 + }, + { + "epoch": 0.28416, + "grad_norm": 0.6155111558086961, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28416 + }, + { + "epoch": 0.28417, + "grad_norm": 0.6449462020411405, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 28417 + }, + { + "epoch": 0.28418, + "grad_norm": 0.7329202444339759, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 28418 + }, + { + "epoch": 0.28419, + "grad_norm": 0.8501990402549656, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 28419 + }, + { + "epoch": 0.2842, + "grad_norm": 0.9581448385656717, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 28420 + }, + { + "epoch": 0.28421, + "grad_norm": 1.0191191598691747, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28421 + }, + { + "epoch": 0.28422, + "grad_norm": 1.041331940603941, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 28422 + }, + { + "epoch": 0.28423, + "grad_norm": 0.9855758585026174, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 28423 + }, + { + "epoch": 0.28424, + "grad_norm": 0.9544220851668925, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 28424 + }, + { + "epoch": 0.28425, + "grad_norm": 0.9550008305004369, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 28425 + }, + { + "epoch": 0.28426, + "grad_norm": 0.9124848880008065, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 28426 + }, + { + "epoch": 0.28427, + "grad_norm": 0.8803979741572936, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28427 + }, + { + "epoch": 0.28428, + "grad_norm": 0.7432629967963432, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 28428 + }, + { + "epoch": 0.28429, + "grad_norm": 0.7653633826344757, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 28429 + }, + { + "epoch": 0.2843, + "grad_norm": 0.9041638067837511, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 28430 + }, + { + "epoch": 0.28431, + "grad_norm": 0.908361844160174, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28431 + }, + { + "epoch": 0.28432, + "grad_norm": 0.9150668784272112, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 28432 + }, + { + "epoch": 0.28433, + "grad_norm": 1.1063176773773953, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 28433 + }, + { + "epoch": 0.28434, + "grad_norm": 0.9984726618916955, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 28434 + }, + { + "epoch": 0.28435, + "grad_norm": 0.8926580982598674, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28435 + }, + { + "epoch": 0.28436, + "grad_norm": 0.8419541420336488, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 28436 + }, + { + "epoch": 0.28437, + "grad_norm": 0.8834821940910866, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 28437 + }, + { + "epoch": 0.28438, + "grad_norm": 0.895215617353213, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 28438 + }, + { + "epoch": 0.28439, + "grad_norm": 0.8920475660278828, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 28439 + }, + { + "epoch": 0.2844, + "grad_norm": 0.8791568520933343, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 28440 + }, + { + "epoch": 0.28441, + "grad_norm": 0.7321723227516553, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 28441 + }, + { + "epoch": 0.28442, + "grad_norm": 0.6664770963616079, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 28442 + }, + { + "epoch": 0.28443, + "grad_norm": 0.6884860828476279, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 28443 + }, + { + "epoch": 0.28444, + "grad_norm": 0.7655128874371786, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28444 + }, + { + "epoch": 0.28445, + "grad_norm": 0.8723799617328127, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 28445 + }, + { + "epoch": 0.28446, + "grad_norm": 0.9314716918099466, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 28446 + }, + { + "epoch": 0.28447, + "grad_norm": 0.8947768682975082, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 28447 + }, + { + "epoch": 0.28448, + "grad_norm": 0.8231683459094062, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 28448 + }, + { + "epoch": 0.28449, + "grad_norm": 0.73304671125213, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28449 + }, + { + "epoch": 0.2845, + "grad_norm": 0.7703633254935931, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 28450 + }, + { + "epoch": 0.28451, + "grad_norm": 0.7048151986281054, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28451 + }, + { + "epoch": 0.28452, + "grad_norm": 0.7136474784723533, + "learning_rate": 0.003, + "loss": 4.021, + "step": 28452 + }, + { + "epoch": 0.28453, + "grad_norm": 0.7750902793466988, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 28453 + }, + { + "epoch": 0.28454, + "grad_norm": 0.872546538204124, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 28454 + }, + { + "epoch": 0.28455, + "grad_norm": 1.1291460484687137, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 28455 + }, + { + "epoch": 0.28456, + "grad_norm": 0.9980106032398537, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 28456 + }, + { + "epoch": 0.28457, + "grad_norm": 0.9105613849000609, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 28457 + }, + { + "epoch": 0.28458, + "grad_norm": 0.7479972195739863, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 28458 + }, + { + "epoch": 0.28459, + "grad_norm": 0.644284996295813, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 28459 + }, + { + "epoch": 0.2846, + "grad_norm": 0.6237223931527431, + "learning_rate": 0.003, + "loss": 4.046, + "step": 28460 + }, + { + "epoch": 0.28461, + "grad_norm": 0.6348820492434564, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 28461 + }, + { + "epoch": 0.28462, + "grad_norm": 0.6213392631291724, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 28462 + }, + { + "epoch": 0.28463, + "grad_norm": 0.6357174779111046, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 28463 + }, + { + "epoch": 0.28464, + "grad_norm": 0.5665670620071153, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 28464 + }, + { + "epoch": 0.28465, + "grad_norm": 0.6161838047603717, + "learning_rate": 0.003, + "loss": 3.9898, + "step": 28465 + }, + { + "epoch": 0.28466, + "grad_norm": 0.7964675794733751, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 28466 + }, + { + "epoch": 0.28467, + "grad_norm": 0.8814555301712172, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 28467 + }, + { + "epoch": 0.28468, + "grad_norm": 0.7596096370030949, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 28468 + }, + { + "epoch": 0.28469, + "grad_norm": 0.8004701181170579, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 28469 + }, + { + "epoch": 0.2847, + "grad_norm": 0.915412304902652, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 28470 + }, + { + "epoch": 0.28471, + "grad_norm": 0.9588664697622711, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 28471 + }, + { + "epoch": 0.28472, + "grad_norm": 0.9430994918592408, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 28472 + }, + { + "epoch": 0.28473, + "grad_norm": 0.9233894079109843, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 28473 + }, + { + "epoch": 0.28474, + "grad_norm": 0.8512179476191793, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 28474 + }, + { + "epoch": 0.28475, + "grad_norm": 0.8581245809766811, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 28475 + }, + { + "epoch": 0.28476, + "grad_norm": 0.9955910625421436, + "learning_rate": 0.003, + "loss": 4.049, + "step": 28476 + }, + { + "epoch": 0.28477, + "grad_norm": 0.9735494175072673, + "learning_rate": 0.003, + "loss": 4.066, + "step": 28477 + }, + { + "epoch": 0.28478, + "grad_norm": 0.8730166403331033, + "learning_rate": 0.003, + "loss": 4.042, + "step": 28478 + }, + { + "epoch": 0.28479, + "grad_norm": 0.8394401101563326, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 28479 + }, + { + "epoch": 0.2848, + "grad_norm": 0.8209927185750898, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 28480 + }, + { + "epoch": 0.28481, + "grad_norm": 0.6898575542034411, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 28481 + }, + { + "epoch": 0.28482, + "grad_norm": 0.6655976153341645, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 28482 + }, + { + "epoch": 0.28483, + "grad_norm": 0.6694593539182925, + "learning_rate": 0.003, + "loss": 4.03, + "step": 28483 + }, + { + "epoch": 0.28484, + "grad_norm": 0.7080972119612121, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28484 + }, + { + "epoch": 0.28485, + "grad_norm": 0.7409143657379968, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 28485 + }, + { + "epoch": 0.28486, + "grad_norm": 1.0944928294965413, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28486 + }, + { + "epoch": 0.28487, + "grad_norm": 1.3724663561888903, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 28487 + }, + { + "epoch": 0.28488, + "grad_norm": 0.7315683852221001, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 28488 + }, + { + "epoch": 0.28489, + "grad_norm": 0.7818253941184138, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 28489 + }, + { + "epoch": 0.2849, + "grad_norm": 0.8341947128369528, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 28490 + }, + { + "epoch": 0.28491, + "grad_norm": 0.725608886284378, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 28491 + }, + { + "epoch": 0.28492, + "grad_norm": 0.6613121276164556, + "learning_rate": 0.003, + "loss": 4.046, + "step": 28492 + }, + { + "epoch": 0.28493, + "grad_norm": 0.7222776491795632, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 28493 + }, + { + "epoch": 0.28494, + "grad_norm": 0.7651053271137258, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 28494 + }, + { + "epoch": 0.28495, + "grad_norm": 0.8241155947178603, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 28495 + }, + { + "epoch": 0.28496, + "grad_norm": 0.9300505671250392, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 28496 + }, + { + "epoch": 0.28497, + "grad_norm": 1.197555813064701, + "learning_rate": 0.003, + "loss": 4.059, + "step": 28497 + }, + { + "epoch": 0.28498, + "grad_norm": 0.9496102676826441, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 28498 + }, + { + "epoch": 0.28499, + "grad_norm": 0.8920110158130717, + "learning_rate": 0.003, + "loss": 4.085, + "step": 28499 + }, + { + "epoch": 0.285, + "grad_norm": 1.0094113342480808, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 28500 + }, + { + "epoch": 0.28501, + "grad_norm": 1.3202217504599303, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 28501 + }, + { + "epoch": 0.28502, + "grad_norm": 0.714701401976666, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 28502 + }, + { + "epoch": 0.28503, + "grad_norm": 0.6773396062090474, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 28503 + }, + { + "epoch": 0.28504, + "grad_norm": 0.7034994891811566, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 28504 + }, + { + "epoch": 0.28505, + "grad_norm": 0.7965898897489029, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 28505 + }, + { + "epoch": 0.28506, + "grad_norm": 0.8545325143962685, + "learning_rate": 0.003, + "loss": 4.024, + "step": 28506 + }, + { + "epoch": 0.28507, + "grad_norm": 0.9317711920157477, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 28507 + }, + { + "epoch": 0.28508, + "grad_norm": 1.0434979388102015, + "learning_rate": 0.003, + "loss": 4.05, + "step": 28508 + }, + { + "epoch": 0.28509, + "grad_norm": 1.0218177268241753, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 28509 + }, + { + "epoch": 0.2851, + "grad_norm": 0.9543296747827839, + "learning_rate": 0.003, + "loss": 4.038, + "step": 28510 + }, + { + "epoch": 0.28511, + "grad_norm": 0.9461991779710117, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28511 + }, + { + "epoch": 0.28512, + "grad_norm": 1.078060467860929, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 28512 + }, + { + "epoch": 0.28513, + "grad_norm": 1.0543500472295795, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28513 + }, + { + "epoch": 0.28514, + "grad_norm": 0.8399092739365148, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 28514 + }, + { + "epoch": 0.28515, + "grad_norm": 0.7193185953118837, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 28515 + }, + { + "epoch": 0.28516, + "grad_norm": 0.6916142275440277, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 28516 + }, + { + "epoch": 0.28517, + "grad_norm": 0.649379086817287, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 28517 + }, + { + "epoch": 0.28518, + "grad_norm": 0.6712537822577079, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 28518 + }, + { + "epoch": 0.28519, + "grad_norm": 0.699236670919987, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 28519 + }, + { + "epoch": 0.2852, + "grad_norm": 0.7303392710594824, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28520 + }, + { + "epoch": 0.28521, + "grad_norm": 0.854304461741675, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 28521 + }, + { + "epoch": 0.28522, + "grad_norm": 0.9879579694865874, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 28522 + }, + { + "epoch": 0.28523, + "grad_norm": 1.1572212914390956, + "learning_rate": 0.003, + "loss": 4.028, + "step": 28523 + }, + { + "epoch": 0.28524, + "grad_norm": 0.9435352112026604, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 28524 + }, + { + "epoch": 0.28525, + "grad_norm": 0.8135650918046783, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 28525 + }, + { + "epoch": 0.28526, + "grad_norm": 0.5954292318616392, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 28526 + }, + { + "epoch": 0.28527, + "grad_norm": 0.5550034604070853, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 28527 + }, + { + "epoch": 0.28528, + "grad_norm": 0.5823415912901907, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 28528 + }, + { + "epoch": 0.28529, + "grad_norm": 0.69477055144928, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 28529 + }, + { + "epoch": 0.2853, + "grad_norm": 0.8610315300373312, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 28530 + }, + { + "epoch": 0.28531, + "grad_norm": 0.9474318016661692, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 28531 + }, + { + "epoch": 0.28532, + "grad_norm": 0.9545779027009803, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 28532 + }, + { + "epoch": 0.28533, + "grad_norm": 1.0388622216645689, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28533 + }, + { + "epoch": 0.28534, + "grad_norm": 1.0689835997213255, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 28534 + }, + { + "epoch": 0.28535, + "grad_norm": 0.8960524614549431, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 28535 + }, + { + "epoch": 0.28536, + "grad_norm": 0.8710658925915614, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 28536 + }, + { + "epoch": 0.28537, + "grad_norm": 0.8686052293673259, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 28537 + }, + { + "epoch": 0.28538, + "grad_norm": 0.9228534648871559, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 28538 + }, + { + "epoch": 0.28539, + "grad_norm": 0.8042315973586255, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 28539 + }, + { + "epoch": 0.2854, + "grad_norm": 0.7959157240856279, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 28540 + }, + { + "epoch": 0.28541, + "grad_norm": 0.8606723491854665, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 28541 + }, + { + "epoch": 0.28542, + "grad_norm": 1.0683246095857424, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 28542 + }, + { + "epoch": 0.28543, + "grad_norm": 0.9758308279913178, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 28543 + }, + { + "epoch": 0.28544, + "grad_norm": 1.122155327128848, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 28544 + }, + { + "epoch": 0.28545, + "grad_norm": 0.9750212439586787, + "learning_rate": 0.003, + "loss": 4.054, + "step": 28545 + }, + { + "epoch": 0.28546, + "grad_norm": 1.015453367392447, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 28546 + }, + { + "epoch": 0.28547, + "grad_norm": 0.7229826685672138, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 28547 + }, + { + "epoch": 0.28548, + "grad_norm": 0.6207732667918442, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 28548 + }, + { + "epoch": 0.28549, + "grad_norm": 0.6833740728352132, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 28549 + }, + { + "epoch": 0.2855, + "grad_norm": 0.8193464028543707, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 28550 + }, + { + "epoch": 0.28551, + "grad_norm": 0.9663075532351895, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 28551 + }, + { + "epoch": 0.28552, + "grad_norm": 1.116074638181051, + "learning_rate": 0.003, + "loss": 4.068, + "step": 28552 + }, + { + "epoch": 0.28553, + "grad_norm": 1.1182803694070165, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 28553 + }, + { + "epoch": 0.28554, + "grad_norm": 0.8670533934733222, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 28554 + }, + { + "epoch": 0.28555, + "grad_norm": 0.7196127856963327, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28555 + }, + { + "epoch": 0.28556, + "grad_norm": 0.7661445983433341, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 28556 + }, + { + "epoch": 0.28557, + "grad_norm": 0.8298300962898633, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 28557 + }, + { + "epoch": 0.28558, + "grad_norm": 0.8582227660318542, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 28558 + }, + { + "epoch": 0.28559, + "grad_norm": 0.8062986892552619, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 28559 + }, + { + "epoch": 0.2856, + "grad_norm": 0.9717933107357863, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 28560 + }, + { + "epoch": 0.28561, + "grad_norm": 1.1986161870703396, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 28561 + }, + { + "epoch": 0.28562, + "grad_norm": 0.9836828523483168, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 28562 + }, + { + "epoch": 0.28563, + "grad_norm": 0.9435272210316958, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 28563 + }, + { + "epoch": 0.28564, + "grad_norm": 0.920647301952589, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 28564 + }, + { + "epoch": 0.28565, + "grad_norm": 0.9271613864532717, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 28565 + }, + { + "epoch": 0.28566, + "grad_norm": 0.8921556479899426, + "learning_rate": 0.003, + "loss": 4.03, + "step": 28566 + }, + { + "epoch": 0.28567, + "grad_norm": 0.7857607501125209, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28567 + }, + { + "epoch": 0.28568, + "grad_norm": 0.8438862192460418, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28568 + }, + { + "epoch": 0.28569, + "grad_norm": 0.8807621010920424, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 28569 + }, + { + "epoch": 0.2857, + "grad_norm": 0.8931263682673422, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 28570 + }, + { + "epoch": 0.28571, + "grad_norm": 0.8330816110416649, + "learning_rate": 0.003, + "loss": 4.069, + "step": 28571 + }, + { + "epoch": 0.28572, + "grad_norm": 0.8753928920061944, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 28572 + }, + { + "epoch": 0.28573, + "grad_norm": 1.063417726032065, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 28573 + }, + { + "epoch": 0.28574, + "grad_norm": 0.9913479555119775, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 28574 + }, + { + "epoch": 0.28575, + "grad_norm": 0.9321792951268351, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 28575 + }, + { + "epoch": 0.28576, + "grad_norm": 0.877538111941257, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 28576 + }, + { + "epoch": 0.28577, + "grad_norm": 0.8808790612965013, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 28577 + }, + { + "epoch": 0.28578, + "grad_norm": 0.9720485046907056, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 28578 + }, + { + "epoch": 0.28579, + "grad_norm": 1.0955368974720248, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 28579 + }, + { + "epoch": 0.2858, + "grad_norm": 0.9721233750913911, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 28580 + }, + { + "epoch": 0.28581, + "grad_norm": 0.7744423709794399, + "learning_rate": 0.003, + "loss": 4.017, + "step": 28581 + }, + { + "epoch": 0.28582, + "grad_norm": 0.7837454356095888, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 28582 + }, + { + "epoch": 0.28583, + "grad_norm": 0.834694152480912, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 28583 + }, + { + "epoch": 0.28584, + "grad_norm": 0.8257888028663101, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 28584 + }, + { + "epoch": 0.28585, + "grad_norm": 0.8028856031538308, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 28585 + }, + { + "epoch": 0.28586, + "grad_norm": 0.7632926029396118, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 28586 + }, + { + "epoch": 0.28587, + "grad_norm": 0.765855807489956, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 28587 + }, + { + "epoch": 0.28588, + "grad_norm": 0.8003726641525769, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 28588 + }, + { + "epoch": 0.28589, + "grad_norm": 0.8119539242130261, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 28589 + }, + { + "epoch": 0.2859, + "grad_norm": 0.8543732068372208, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 28590 + }, + { + "epoch": 0.28591, + "grad_norm": 0.8966118337674691, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 28591 + }, + { + "epoch": 0.28592, + "grad_norm": 0.8583802156790714, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 28592 + }, + { + "epoch": 0.28593, + "grad_norm": 0.8114882664944365, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 28593 + }, + { + "epoch": 0.28594, + "grad_norm": 0.8680560360455098, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 28594 + }, + { + "epoch": 0.28595, + "grad_norm": 0.9149883128882348, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 28595 + }, + { + "epoch": 0.28596, + "grad_norm": 0.7677318609263302, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 28596 + }, + { + "epoch": 0.28597, + "grad_norm": 0.6630941055664152, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 28597 + }, + { + "epoch": 0.28598, + "grad_norm": 0.6380367561684493, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 28598 + }, + { + "epoch": 0.28599, + "grad_norm": 0.6046570836883874, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 28599 + }, + { + "epoch": 0.286, + "grad_norm": 0.76722479702395, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28600 + }, + { + "epoch": 0.28601, + "grad_norm": 1.0413249864512195, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 28601 + }, + { + "epoch": 0.28602, + "grad_norm": 1.3718316146930083, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 28602 + }, + { + "epoch": 0.28603, + "grad_norm": 0.647513863970598, + "learning_rate": 0.003, + "loss": 4.044, + "step": 28603 + }, + { + "epoch": 0.28604, + "grad_norm": 0.6694479910393238, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 28604 + }, + { + "epoch": 0.28605, + "grad_norm": 0.7162310994372082, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 28605 + }, + { + "epoch": 0.28606, + "grad_norm": 0.7907453917272956, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 28606 + }, + { + "epoch": 0.28607, + "grad_norm": 0.9015311342643925, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 28607 + }, + { + "epoch": 0.28608, + "grad_norm": 0.9565272242600616, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 28608 + }, + { + "epoch": 0.28609, + "grad_norm": 0.9015540131307266, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 28609 + }, + { + "epoch": 0.2861, + "grad_norm": 0.8197512709206659, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28610 + }, + { + "epoch": 0.28611, + "grad_norm": 0.7474938293075504, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 28611 + }, + { + "epoch": 0.28612, + "grad_norm": 0.8078006431592246, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 28612 + }, + { + "epoch": 0.28613, + "grad_norm": 0.776324046700105, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 28613 + }, + { + "epoch": 0.28614, + "grad_norm": 0.8021374395950123, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 28614 + }, + { + "epoch": 0.28615, + "grad_norm": 1.0001348768999727, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 28615 + }, + { + "epoch": 0.28616, + "grad_norm": 1.224230843035204, + "learning_rate": 0.003, + "loss": 3.9937, + "step": 28616 + }, + { + "epoch": 0.28617, + "grad_norm": 0.7579693786774793, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 28617 + }, + { + "epoch": 0.28618, + "grad_norm": 0.6532911366541334, + "learning_rate": 0.003, + "loss": 4.036, + "step": 28618 + }, + { + "epoch": 0.28619, + "grad_norm": 0.607815815802335, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 28619 + }, + { + "epoch": 0.2862, + "grad_norm": 0.589361678870917, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 28620 + }, + { + "epoch": 0.28621, + "grad_norm": 0.6785879696406384, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28621 + }, + { + "epoch": 0.28622, + "grad_norm": 0.7637479860788927, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 28622 + }, + { + "epoch": 0.28623, + "grad_norm": 0.8794926946523457, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 28623 + }, + { + "epoch": 0.28624, + "grad_norm": 0.9969838901943914, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 28624 + }, + { + "epoch": 0.28625, + "grad_norm": 1.327868933509164, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 28625 + }, + { + "epoch": 0.28626, + "grad_norm": 0.7638980944749479, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 28626 + }, + { + "epoch": 0.28627, + "grad_norm": 0.8072625180813008, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 28627 + }, + { + "epoch": 0.28628, + "grad_norm": 0.8667211698176158, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 28628 + }, + { + "epoch": 0.28629, + "grad_norm": 1.067362063753034, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 28629 + }, + { + "epoch": 0.2863, + "grad_norm": 0.9901283746064967, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28630 + }, + { + "epoch": 0.28631, + "grad_norm": 0.8102771853845896, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 28631 + }, + { + "epoch": 0.28632, + "grad_norm": 0.721959790454203, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 28632 + }, + { + "epoch": 0.28633, + "grad_norm": 0.7532524916197352, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 28633 + }, + { + "epoch": 0.28634, + "grad_norm": 0.8044392442393213, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 28634 + }, + { + "epoch": 0.28635, + "grad_norm": 0.7483603571713031, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 28635 + }, + { + "epoch": 0.28636, + "grad_norm": 0.7298254490320449, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 28636 + }, + { + "epoch": 0.28637, + "grad_norm": 0.7614227112349682, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 28637 + }, + { + "epoch": 0.28638, + "grad_norm": 0.9044979061465064, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 28638 + }, + { + "epoch": 0.28639, + "grad_norm": 1.0577665845310897, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 28639 + }, + { + "epoch": 0.2864, + "grad_norm": 1.1633235569633706, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28640 + }, + { + "epoch": 0.28641, + "grad_norm": 0.8753405851494542, + "learning_rate": 0.003, + "loss": 4.053, + "step": 28641 + }, + { + "epoch": 0.28642, + "grad_norm": 0.9183848659716629, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 28642 + }, + { + "epoch": 0.28643, + "grad_norm": 0.8418929349304822, + "learning_rate": 0.003, + "loss": 4.029, + "step": 28643 + }, + { + "epoch": 0.28644, + "grad_norm": 0.8033380734837967, + "learning_rate": 0.003, + "loss": 4.05, + "step": 28644 + }, + { + "epoch": 0.28645, + "grad_norm": 0.8029906141253343, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 28645 + }, + { + "epoch": 0.28646, + "grad_norm": 0.7514034302201833, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 28646 + }, + { + "epoch": 0.28647, + "grad_norm": 0.8380963080702498, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28647 + }, + { + "epoch": 0.28648, + "grad_norm": 0.9552567650120333, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 28648 + }, + { + "epoch": 0.28649, + "grad_norm": 1.0962255447468026, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 28649 + }, + { + "epoch": 0.2865, + "grad_norm": 1.1013136835734934, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 28650 + }, + { + "epoch": 0.28651, + "grad_norm": 1.0179291792508771, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 28651 + }, + { + "epoch": 0.28652, + "grad_norm": 0.9057586051898991, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 28652 + }, + { + "epoch": 0.28653, + "grad_norm": 0.7950679078148759, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 28653 + }, + { + "epoch": 0.28654, + "grad_norm": 0.7555567030866315, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 28654 + }, + { + "epoch": 0.28655, + "grad_norm": 0.759306410472044, + "learning_rate": 0.003, + "loss": 4.014, + "step": 28655 + }, + { + "epoch": 0.28656, + "grad_norm": 0.7461651297571248, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 28656 + }, + { + "epoch": 0.28657, + "grad_norm": 0.7297003445907126, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 28657 + }, + { + "epoch": 0.28658, + "grad_norm": 0.8060513672962927, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 28658 + }, + { + "epoch": 0.28659, + "grad_norm": 0.9370072654042834, + "learning_rate": 0.003, + "loss": 4.059, + "step": 28659 + }, + { + "epoch": 0.2866, + "grad_norm": 1.0319254570205272, + "learning_rate": 0.003, + "loss": 3.9939, + "step": 28660 + }, + { + "epoch": 0.28661, + "grad_norm": 0.959529483172278, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 28661 + }, + { + "epoch": 0.28662, + "grad_norm": 0.9049348164694172, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 28662 + }, + { + "epoch": 0.28663, + "grad_norm": 0.9407665629700497, + "learning_rate": 0.003, + "loss": 4.0847, + "step": 28663 + }, + { + "epoch": 0.28664, + "grad_norm": 1.1415550483819394, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 28664 + }, + { + "epoch": 0.28665, + "grad_norm": 1.0271410783947623, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 28665 + }, + { + "epoch": 0.28666, + "grad_norm": 1.0855898539370348, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 28666 + }, + { + "epoch": 0.28667, + "grad_norm": 0.9559186613458154, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 28667 + }, + { + "epoch": 0.28668, + "grad_norm": 0.8321087329983451, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 28668 + }, + { + "epoch": 0.28669, + "grad_norm": 0.81674593971458, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 28669 + }, + { + "epoch": 0.2867, + "grad_norm": 0.898659393278337, + "learning_rate": 0.003, + "loss": 4.033, + "step": 28670 + }, + { + "epoch": 0.28671, + "grad_norm": 0.8258728722454893, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 28671 + }, + { + "epoch": 0.28672, + "grad_norm": 0.7601349614760261, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28672 + }, + { + "epoch": 0.28673, + "grad_norm": 0.7376150996390509, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 28673 + }, + { + "epoch": 0.28674, + "grad_norm": 0.7635617471339808, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28674 + }, + { + "epoch": 0.28675, + "grad_norm": 0.7483350875269462, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 28675 + }, + { + "epoch": 0.28676, + "grad_norm": 0.8020892072543045, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 28676 + }, + { + "epoch": 0.28677, + "grad_norm": 0.9414117627556049, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 28677 + }, + { + "epoch": 0.28678, + "grad_norm": 1.2307854652514987, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 28678 + }, + { + "epoch": 0.28679, + "grad_norm": 0.6755227858832771, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 28679 + }, + { + "epoch": 0.2868, + "grad_norm": 0.6376590984908987, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 28680 + }, + { + "epoch": 0.28681, + "grad_norm": 0.7023829520388835, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 28681 + }, + { + "epoch": 0.28682, + "grad_norm": 0.7487917300890066, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 28682 + }, + { + "epoch": 0.28683, + "grad_norm": 0.8087027166528684, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 28683 + }, + { + "epoch": 0.28684, + "grad_norm": 0.8709890369655098, + "learning_rate": 0.003, + "loss": 4.051, + "step": 28684 + }, + { + "epoch": 0.28685, + "grad_norm": 1.0119458085120403, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 28685 + }, + { + "epoch": 0.28686, + "grad_norm": 0.9936016035354587, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 28686 + }, + { + "epoch": 0.28687, + "grad_norm": 0.9684957849152305, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 28687 + }, + { + "epoch": 0.28688, + "grad_norm": 0.9873728898311748, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28688 + }, + { + "epoch": 0.28689, + "grad_norm": 1.0965230949983917, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 28689 + }, + { + "epoch": 0.2869, + "grad_norm": 1.0044953846872462, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 28690 + }, + { + "epoch": 0.28691, + "grad_norm": 1.0788124342285026, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 28691 + }, + { + "epoch": 0.28692, + "grad_norm": 0.9866010364597724, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 28692 + }, + { + "epoch": 0.28693, + "grad_norm": 0.9531341576077421, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 28693 + }, + { + "epoch": 0.28694, + "grad_norm": 0.9289584530079585, + "learning_rate": 0.003, + "loss": 4.063, + "step": 28694 + }, + { + "epoch": 0.28695, + "grad_norm": 1.008529812332349, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 28695 + }, + { + "epoch": 0.28696, + "grad_norm": 1.199599042504214, + "learning_rate": 0.003, + "loss": 4.06, + "step": 28696 + }, + { + "epoch": 0.28697, + "grad_norm": 1.0406176220560317, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28697 + }, + { + "epoch": 0.28698, + "grad_norm": 1.0664840432856348, + "learning_rate": 0.003, + "loss": 4.079, + "step": 28698 + }, + { + "epoch": 0.28699, + "grad_norm": 0.9749718717655039, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 28699 + }, + { + "epoch": 0.287, + "grad_norm": 0.9742398814310319, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 28700 + }, + { + "epoch": 0.28701, + "grad_norm": 0.9160562374714015, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 28701 + }, + { + "epoch": 0.28702, + "grad_norm": 0.9336203803684406, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 28702 + }, + { + "epoch": 0.28703, + "grad_norm": 1.0323900847739385, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 28703 + }, + { + "epoch": 0.28704, + "grad_norm": 1.045266676005433, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 28704 + }, + { + "epoch": 0.28705, + "grad_norm": 0.7458410017829225, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 28705 + }, + { + "epoch": 0.28706, + "grad_norm": 0.5410785406428957, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 28706 + }, + { + "epoch": 0.28707, + "grad_norm": 0.5228453310856197, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 28707 + }, + { + "epoch": 0.28708, + "grad_norm": 0.6608791603326084, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 28708 + }, + { + "epoch": 0.28709, + "grad_norm": 0.678162447287557, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28709 + }, + { + "epoch": 0.2871, + "grad_norm": 0.6816561773919839, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 28710 + }, + { + "epoch": 0.28711, + "grad_norm": 0.6578772151230273, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 28711 + }, + { + "epoch": 0.28712, + "grad_norm": 0.6367433544764254, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 28712 + }, + { + "epoch": 0.28713, + "grad_norm": 0.6895836421817106, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 28713 + }, + { + "epoch": 0.28714, + "grad_norm": 0.8043269558426925, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28714 + }, + { + "epoch": 0.28715, + "grad_norm": 0.8894859494955192, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 28715 + }, + { + "epoch": 0.28716, + "grad_norm": 0.9297689490711322, + "learning_rate": 0.003, + "loss": 4.022, + "step": 28716 + }, + { + "epoch": 0.28717, + "grad_norm": 0.8500603200789149, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28717 + }, + { + "epoch": 0.28718, + "grad_norm": 0.8222017380156561, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 28718 + }, + { + "epoch": 0.28719, + "grad_norm": 0.7544456531885075, + "learning_rate": 0.003, + "loss": 4.054, + "step": 28719 + }, + { + "epoch": 0.2872, + "grad_norm": 0.743072546748044, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 28720 + }, + { + "epoch": 0.28721, + "grad_norm": 0.7478512798035548, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 28721 + }, + { + "epoch": 0.28722, + "grad_norm": 0.7275563534846363, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 28722 + }, + { + "epoch": 0.28723, + "grad_norm": 0.7003112390131832, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28723 + }, + { + "epoch": 0.28724, + "grad_norm": 0.626165082003098, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 28724 + }, + { + "epoch": 0.28725, + "grad_norm": 0.6369236092179669, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 28725 + }, + { + "epoch": 0.28726, + "grad_norm": 0.6749614811922979, + "learning_rate": 0.003, + "loss": 4.037, + "step": 28726 + }, + { + "epoch": 0.28727, + "grad_norm": 0.7721855282634369, + "learning_rate": 0.003, + "loss": 4.025, + "step": 28727 + }, + { + "epoch": 0.28728, + "grad_norm": 1.0819574725746266, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 28728 + }, + { + "epoch": 0.28729, + "grad_norm": 1.0922023903011702, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 28729 + }, + { + "epoch": 0.2873, + "grad_norm": 0.7990870203089259, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 28730 + }, + { + "epoch": 0.28731, + "grad_norm": 0.7862977174274907, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28731 + }, + { + "epoch": 0.28732, + "grad_norm": 0.9726626359080396, + "learning_rate": 0.003, + "loss": 4.048, + "step": 28732 + }, + { + "epoch": 0.28733, + "grad_norm": 0.9407940799156077, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 28733 + }, + { + "epoch": 0.28734, + "grad_norm": 0.7893996055339209, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28734 + }, + { + "epoch": 0.28735, + "grad_norm": 0.8885659286168502, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 28735 + }, + { + "epoch": 0.28736, + "grad_norm": 0.9024200508878604, + "learning_rate": 0.003, + "loss": 4.019, + "step": 28736 + }, + { + "epoch": 0.28737, + "grad_norm": 1.0081483264653155, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 28737 + }, + { + "epoch": 0.28738, + "grad_norm": 1.105446664767159, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 28738 + }, + { + "epoch": 0.28739, + "grad_norm": 1.0604018609584864, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 28739 + }, + { + "epoch": 0.2874, + "grad_norm": 0.8987620176287859, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 28740 + }, + { + "epoch": 0.28741, + "grad_norm": 0.8639899383418974, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 28741 + }, + { + "epoch": 0.28742, + "grad_norm": 0.9162535230069342, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 28742 + }, + { + "epoch": 0.28743, + "grad_norm": 0.9799438023227245, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 28743 + }, + { + "epoch": 0.28744, + "grad_norm": 0.7935329609065511, + "learning_rate": 0.003, + "loss": 3.995, + "step": 28744 + }, + { + "epoch": 0.28745, + "grad_norm": 0.8338727445704894, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 28745 + }, + { + "epoch": 0.28746, + "grad_norm": 0.905266957909698, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 28746 + }, + { + "epoch": 0.28747, + "grad_norm": 0.8770940628299769, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 28747 + }, + { + "epoch": 0.28748, + "grad_norm": 0.8432510573101485, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 28748 + }, + { + "epoch": 0.28749, + "grad_norm": 0.7797301630800301, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28749 + }, + { + "epoch": 0.2875, + "grad_norm": 0.7270976706839277, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 28750 + }, + { + "epoch": 0.28751, + "grad_norm": 0.6731775501500047, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28751 + }, + { + "epoch": 0.28752, + "grad_norm": 0.6621028070033257, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 28752 + }, + { + "epoch": 0.28753, + "grad_norm": 0.7033534492577809, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 28753 + }, + { + "epoch": 0.28754, + "grad_norm": 0.8058519496340879, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 28754 + }, + { + "epoch": 0.28755, + "grad_norm": 1.0755681891079643, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28755 + }, + { + "epoch": 0.28756, + "grad_norm": 1.1819136179865433, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 28756 + }, + { + "epoch": 0.28757, + "grad_norm": 0.8585584032443118, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 28757 + }, + { + "epoch": 0.28758, + "grad_norm": 0.8194575693359871, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 28758 + }, + { + "epoch": 0.28759, + "grad_norm": 0.7758675257039371, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 28759 + }, + { + "epoch": 0.2876, + "grad_norm": 0.8804873978770608, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 28760 + }, + { + "epoch": 0.28761, + "grad_norm": 1.0207675112557233, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 28761 + }, + { + "epoch": 0.28762, + "grad_norm": 1.0070142682175276, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 28762 + }, + { + "epoch": 0.28763, + "grad_norm": 0.9032608936747407, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28763 + }, + { + "epoch": 0.28764, + "grad_norm": 0.7944451678710528, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 28764 + }, + { + "epoch": 0.28765, + "grad_norm": 0.9757005427019756, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 28765 + }, + { + "epoch": 0.28766, + "grad_norm": 1.0567364362593248, + "learning_rate": 0.003, + "loss": 4.084, + "step": 28766 + }, + { + "epoch": 0.28767, + "grad_norm": 1.1374987464847945, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 28767 + }, + { + "epoch": 0.28768, + "grad_norm": 0.6875343980740054, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 28768 + }, + { + "epoch": 0.28769, + "grad_norm": 0.5438746668219597, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 28769 + }, + { + "epoch": 0.2877, + "grad_norm": 0.5723042393513118, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28770 + }, + { + "epoch": 0.28771, + "grad_norm": 0.5323460551822498, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 28771 + }, + { + "epoch": 0.28772, + "grad_norm": 0.5883430515516334, + "learning_rate": 0.003, + "loss": 4.048, + "step": 28772 + }, + { + "epoch": 0.28773, + "grad_norm": 0.6394326994428765, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 28773 + }, + { + "epoch": 0.28774, + "grad_norm": 0.7189978536918776, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 28774 + }, + { + "epoch": 0.28775, + "grad_norm": 0.7939359549607259, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28775 + }, + { + "epoch": 0.28776, + "grad_norm": 0.9370543226277266, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 28776 + }, + { + "epoch": 0.28777, + "grad_norm": 1.0926709991177488, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 28777 + }, + { + "epoch": 0.28778, + "grad_norm": 0.849676799590963, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 28778 + }, + { + "epoch": 0.28779, + "grad_norm": 0.858728108204247, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 28779 + }, + { + "epoch": 0.2878, + "grad_norm": 0.8931529320563203, + "learning_rate": 0.003, + "loss": 4.03, + "step": 28780 + }, + { + "epoch": 0.28781, + "grad_norm": 0.9567121839009043, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 28781 + }, + { + "epoch": 0.28782, + "grad_norm": 1.2009268673973026, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 28782 + }, + { + "epoch": 0.28783, + "grad_norm": 0.6718118412740363, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 28783 + }, + { + "epoch": 0.28784, + "grad_norm": 0.6821660674523115, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 28784 + }, + { + "epoch": 0.28785, + "grad_norm": 0.7593966336022011, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 28785 + }, + { + "epoch": 0.28786, + "grad_norm": 0.8053404883518945, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 28786 + }, + { + "epoch": 0.28787, + "grad_norm": 0.8054525415212919, + "learning_rate": 0.003, + "loss": 4.021, + "step": 28787 + }, + { + "epoch": 0.28788, + "grad_norm": 0.7578412395228789, + "learning_rate": 0.003, + "loss": 4.029, + "step": 28788 + }, + { + "epoch": 0.28789, + "grad_norm": 0.7403241891017145, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 28789 + }, + { + "epoch": 0.2879, + "grad_norm": 0.6962959568470731, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 28790 + }, + { + "epoch": 0.28791, + "grad_norm": 0.6811919069561606, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 28791 + }, + { + "epoch": 0.28792, + "grad_norm": 0.7109143023297189, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28792 + }, + { + "epoch": 0.28793, + "grad_norm": 0.7334447919163939, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 28793 + }, + { + "epoch": 0.28794, + "grad_norm": 0.8498749477540435, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 28794 + }, + { + "epoch": 0.28795, + "grad_norm": 1.0756288389505617, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 28795 + }, + { + "epoch": 0.28796, + "grad_norm": 1.0032295788374341, + "learning_rate": 0.003, + "loss": 4.039, + "step": 28796 + }, + { + "epoch": 0.28797, + "grad_norm": 0.9748250333262553, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 28797 + }, + { + "epoch": 0.28798, + "grad_norm": 1.0208362015596955, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 28798 + }, + { + "epoch": 0.28799, + "grad_norm": 0.9137221224484092, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 28799 + }, + { + "epoch": 0.288, + "grad_norm": 0.8194446088536961, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 28800 + }, + { + "epoch": 0.28801, + "grad_norm": 0.8639125685223914, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 28801 + }, + { + "epoch": 0.28802, + "grad_norm": 0.9747055878039127, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 28802 + }, + { + "epoch": 0.28803, + "grad_norm": 1.1328249528977736, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 28803 + }, + { + "epoch": 0.28804, + "grad_norm": 0.8552082138315015, + "learning_rate": 0.003, + "loss": 4.048, + "step": 28804 + }, + { + "epoch": 0.28805, + "grad_norm": 0.8190036674994713, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 28805 + }, + { + "epoch": 0.28806, + "grad_norm": 0.8373972133324258, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 28806 + }, + { + "epoch": 0.28807, + "grad_norm": 0.9410445186854962, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 28807 + }, + { + "epoch": 0.28808, + "grad_norm": 0.9707539856951962, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 28808 + }, + { + "epoch": 0.28809, + "grad_norm": 0.923905654628016, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 28809 + }, + { + "epoch": 0.2881, + "grad_norm": 1.0151648182286128, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28810 + }, + { + "epoch": 0.28811, + "grad_norm": 1.0835985748612, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 28811 + }, + { + "epoch": 0.28812, + "grad_norm": 0.9070135643145147, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 28812 + }, + { + "epoch": 0.28813, + "grad_norm": 0.8059434748656653, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 28813 + }, + { + "epoch": 0.28814, + "grad_norm": 0.7204358888281435, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 28814 + }, + { + "epoch": 0.28815, + "grad_norm": 0.6784356433417426, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 28815 + }, + { + "epoch": 0.28816, + "grad_norm": 0.7295146138667735, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 28816 + }, + { + "epoch": 0.28817, + "grad_norm": 0.8349732600018718, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 28817 + }, + { + "epoch": 0.28818, + "grad_norm": 0.9393081716033626, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 28818 + }, + { + "epoch": 0.28819, + "grad_norm": 1.0553506114743156, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 28819 + }, + { + "epoch": 0.2882, + "grad_norm": 1.0593675142873102, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 28820 + }, + { + "epoch": 0.28821, + "grad_norm": 1.0392547297037715, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 28821 + }, + { + "epoch": 0.28822, + "grad_norm": 1.0008005915660343, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 28822 + }, + { + "epoch": 0.28823, + "grad_norm": 0.9938080828193561, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 28823 + }, + { + "epoch": 0.28824, + "grad_norm": 0.992620849940634, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 28824 + }, + { + "epoch": 0.28825, + "grad_norm": 0.9725729850849965, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 28825 + }, + { + "epoch": 0.28826, + "grad_norm": 0.9224416538826993, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 28826 + }, + { + "epoch": 0.28827, + "grad_norm": 1.001226553359419, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 28827 + }, + { + "epoch": 0.28828, + "grad_norm": 0.9684047594386208, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 28828 + }, + { + "epoch": 0.28829, + "grad_norm": 0.9621480717888037, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 28829 + }, + { + "epoch": 0.2883, + "grad_norm": 1.0187155697994454, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 28830 + }, + { + "epoch": 0.28831, + "grad_norm": 1.05891288616295, + "learning_rate": 0.003, + "loss": 4.057, + "step": 28831 + }, + { + "epoch": 0.28832, + "grad_norm": 0.9899475657299306, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 28832 + }, + { + "epoch": 0.28833, + "grad_norm": 0.9037773444226085, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 28833 + }, + { + "epoch": 0.28834, + "grad_norm": 0.8439236963182871, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 28834 + }, + { + "epoch": 0.28835, + "grad_norm": 0.8056800710415504, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 28835 + }, + { + "epoch": 0.28836, + "grad_norm": 0.6498983608735496, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 28836 + }, + { + "epoch": 0.28837, + "grad_norm": 0.5384200235370281, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 28837 + }, + { + "epoch": 0.28838, + "grad_norm": 0.5717915710151493, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 28838 + }, + { + "epoch": 0.28839, + "grad_norm": 0.6113261571751487, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 28839 + }, + { + "epoch": 0.2884, + "grad_norm": 0.6704764870226773, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28840 + }, + { + "epoch": 0.28841, + "grad_norm": 0.7472050478435659, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 28841 + }, + { + "epoch": 0.28842, + "grad_norm": 0.7147134736800776, + "learning_rate": 0.003, + "loss": 4.048, + "step": 28842 + }, + { + "epoch": 0.28843, + "grad_norm": 0.625772791293347, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 28843 + }, + { + "epoch": 0.28844, + "grad_norm": 0.6552903800807355, + "learning_rate": 0.003, + "loss": 4.036, + "step": 28844 + }, + { + "epoch": 0.28845, + "grad_norm": 0.7106876907646548, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 28845 + }, + { + "epoch": 0.28846, + "grad_norm": 0.842710154032099, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 28846 + }, + { + "epoch": 0.28847, + "grad_norm": 0.9798976966727766, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 28847 + }, + { + "epoch": 0.28848, + "grad_norm": 1.1207163970792315, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 28848 + }, + { + "epoch": 0.28849, + "grad_norm": 0.9405878686179455, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 28849 + }, + { + "epoch": 0.2885, + "grad_norm": 0.8478092083046985, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 28850 + }, + { + "epoch": 0.28851, + "grad_norm": 0.8610464189253911, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 28851 + }, + { + "epoch": 0.28852, + "grad_norm": 0.9114224049556959, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 28852 + }, + { + "epoch": 0.28853, + "grad_norm": 0.7872326957994091, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 28853 + }, + { + "epoch": 0.28854, + "grad_norm": 0.6969657206543184, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 28854 + }, + { + "epoch": 0.28855, + "grad_norm": 0.6824396135455695, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 28855 + }, + { + "epoch": 0.28856, + "grad_norm": 0.7226855467866632, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 28856 + }, + { + "epoch": 0.28857, + "grad_norm": 0.7081791347421768, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 28857 + }, + { + "epoch": 0.28858, + "grad_norm": 0.647868338067922, + "learning_rate": 0.003, + "loss": 4.029, + "step": 28858 + }, + { + "epoch": 0.28859, + "grad_norm": 0.5894469718862403, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 28859 + }, + { + "epoch": 0.2886, + "grad_norm": 0.5572159828049006, + "learning_rate": 0.003, + "loss": 4.052, + "step": 28860 + }, + { + "epoch": 0.28861, + "grad_norm": 0.7162841389792863, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28861 + }, + { + "epoch": 0.28862, + "grad_norm": 0.9450464260186382, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28862 + }, + { + "epoch": 0.28863, + "grad_norm": 1.1407901747760987, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 28863 + }, + { + "epoch": 0.28864, + "grad_norm": 0.9403691103361915, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 28864 + }, + { + "epoch": 0.28865, + "grad_norm": 0.9196587668469819, + "learning_rate": 0.003, + "loss": 4.03, + "step": 28865 + }, + { + "epoch": 0.28866, + "grad_norm": 0.7787395306860375, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 28866 + }, + { + "epoch": 0.28867, + "grad_norm": 0.8920242004885823, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 28867 + }, + { + "epoch": 0.28868, + "grad_norm": 0.9270328947617662, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 28868 + }, + { + "epoch": 0.28869, + "grad_norm": 0.9506191054783535, + "learning_rate": 0.003, + "loss": 4.038, + "step": 28869 + }, + { + "epoch": 0.2887, + "grad_norm": 1.0320246789051546, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 28870 + }, + { + "epoch": 0.28871, + "grad_norm": 0.9396029047915674, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 28871 + }, + { + "epoch": 0.28872, + "grad_norm": 1.0876543182870668, + "learning_rate": 0.003, + "loss": 4.0784, + "step": 28872 + }, + { + "epoch": 0.28873, + "grad_norm": 1.145933539183094, + "learning_rate": 0.003, + "loss": 4.039, + "step": 28873 + }, + { + "epoch": 0.28874, + "grad_norm": 1.0098045356186232, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 28874 + }, + { + "epoch": 0.28875, + "grad_norm": 0.9141701900183897, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 28875 + }, + { + "epoch": 0.28876, + "grad_norm": 0.9290129530760866, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 28876 + }, + { + "epoch": 0.28877, + "grad_norm": 0.8900875184167762, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 28877 + }, + { + "epoch": 0.28878, + "grad_norm": 0.9064701353647765, + "learning_rate": 0.003, + "loss": 4.044, + "step": 28878 + }, + { + "epoch": 0.28879, + "grad_norm": 0.8710248324815222, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 28879 + }, + { + "epoch": 0.2888, + "grad_norm": 0.8722735598674553, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 28880 + }, + { + "epoch": 0.28881, + "grad_norm": 0.8454826902457475, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 28881 + }, + { + "epoch": 0.28882, + "grad_norm": 0.9500881071323944, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 28882 + }, + { + "epoch": 0.28883, + "grad_norm": 0.9746945926448343, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 28883 + }, + { + "epoch": 0.28884, + "grad_norm": 1.142877424621866, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 28884 + }, + { + "epoch": 0.28885, + "grad_norm": 0.9559126363643763, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 28885 + }, + { + "epoch": 0.28886, + "grad_norm": 0.851044036397433, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 28886 + }, + { + "epoch": 0.28887, + "grad_norm": 0.8085349619339791, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 28887 + }, + { + "epoch": 0.28888, + "grad_norm": 0.8599276643573764, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 28888 + }, + { + "epoch": 0.28889, + "grad_norm": 0.7579893657603481, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 28889 + }, + { + "epoch": 0.2889, + "grad_norm": 0.7261995900233258, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 28890 + }, + { + "epoch": 0.28891, + "grad_norm": 0.8164054520380887, + "learning_rate": 0.003, + "loss": 4.027, + "step": 28891 + }, + { + "epoch": 0.28892, + "grad_norm": 0.9074819684848089, + "learning_rate": 0.003, + "loss": 4.034, + "step": 28892 + }, + { + "epoch": 0.28893, + "grad_norm": 0.9588242370465906, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 28893 + }, + { + "epoch": 0.28894, + "grad_norm": 0.8573618589972752, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 28894 + }, + { + "epoch": 0.28895, + "grad_norm": 0.7146283171071297, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 28895 + }, + { + "epoch": 0.28896, + "grad_norm": 0.6214239937521602, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 28896 + }, + { + "epoch": 0.28897, + "grad_norm": 0.5434195094438631, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 28897 + }, + { + "epoch": 0.28898, + "grad_norm": 0.5603548809203606, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 28898 + }, + { + "epoch": 0.28899, + "grad_norm": 0.6039629738600372, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 28899 + }, + { + "epoch": 0.289, + "grad_norm": 0.7245867621656004, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 28900 + }, + { + "epoch": 0.28901, + "grad_norm": 0.8821930362899142, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 28901 + }, + { + "epoch": 0.28902, + "grad_norm": 1.113326275254241, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 28902 + }, + { + "epoch": 0.28903, + "grad_norm": 0.8840333424071184, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 28903 + }, + { + "epoch": 0.28904, + "grad_norm": 0.9515853769265549, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28904 + }, + { + "epoch": 0.28905, + "grad_norm": 0.9406130391232622, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 28905 + }, + { + "epoch": 0.28906, + "grad_norm": 0.8085226066854387, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 28906 + }, + { + "epoch": 0.28907, + "grad_norm": 0.8497668464161044, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 28907 + }, + { + "epoch": 0.28908, + "grad_norm": 0.7258700497289223, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28908 + }, + { + "epoch": 0.28909, + "grad_norm": 0.5796652716136772, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 28909 + }, + { + "epoch": 0.2891, + "grad_norm": 0.5773452600720476, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 28910 + }, + { + "epoch": 0.28911, + "grad_norm": 0.6489792879303007, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 28911 + }, + { + "epoch": 0.28912, + "grad_norm": 0.7451795230437973, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 28912 + }, + { + "epoch": 0.28913, + "grad_norm": 0.8879992894495066, + "learning_rate": 0.003, + "loss": 4.018, + "step": 28913 + }, + { + "epoch": 0.28914, + "grad_norm": 0.9263451341884663, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 28914 + }, + { + "epoch": 0.28915, + "grad_norm": 0.8917579206926248, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 28915 + }, + { + "epoch": 0.28916, + "grad_norm": 0.8607799072119376, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28916 + }, + { + "epoch": 0.28917, + "grad_norm": 0.8118260330090631, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 28917 + }, + { + "epoch": 0.28918, + "grad_norm": 0.767832759979061, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 28918 + }, + { + "epoch": 0.28919, + "grad_norm": 0.766954785577315, + "learning_rate": 0.003, + "loss": 4.01, + "step": 28919 + }, + { + "epoch": 0.2892, + "grad_norm": 0.9121939162221603, + "learning_rate": 0.003, + "loss": 4.04, + "step": 28920 + }, + { + "epoch": 0.28921, + "grad_norm": 1.1709053490028063, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 28921 + }, + { + "epoch": 0.28922, + "grad_norm": 0.8883039525212179, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 28922 + }, + { + "epoch": 0.28923, + "grad_norm": 0.7980417516190044, + "learning_rate": 0.003, + "loss": 4.014, + "step": 28923 + }, + { + "epoch": 0.28924, + "grad_norm": 0.8725842407269302, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 28924 + }, + { + "epoch": 0.28925, + "grad_norm": 0.941083131137281, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 28925 + }, + { + "epoch": 0.28926, + "grad_norm": 0.8662753961081622, + "learning_rate": 0.003, + "loss": 4.006, + "step": 28926 + }, + { + "epoch": 0.28927, + "grad_norm": 0.8161538545027825, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 28927 + }, + { + "epoch": 0.28928, + "grad_norm": 0.8914212846765761, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 28928 + }, + { + "epoch": 0.28929, + "grad_norm": 1.058540640912842, + "learning_rate": 0.003, + "loss": 4.063, + "step": 28929 + }, + { + "epoch": 0.2893, + "grad_norm": 1.2074767942989324, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 28930 + }, + { + "epoch": 0.28931, + "grad_norm": 0.8109176845733301, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 28931 + }, + { + "epoch": 0.28932, + "grad_norm": 0.7664234781087832, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 28932 + }, + { + "epoch": 0.28933, + "grad_norm": 0.8489470995187731, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 28933 + }, + { + "epoch": 0.28934, + "grad_norm": 0.8087517299385847, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 28934 + }, + { + "epoch": 0.28935, + "grad_norm": 0.8193664487211005, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 28935 + }, + { + "epoch": 0.28936, + "grad_norm": 0.8405605650285187, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 28936 + }, + { + "epoch": 0.28937, + "grad_norm": 0.7667862875405236, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 28937 + }, + { + "epoch": 0.28938, + "grad_norm": 0.7362073526187615, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 28938 + }, + { + "epoch": 0.28939, + "grad_norm": 0.8217980027767611, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 28939 + }, + { + "epoch": 0.2894, + "grad_norm": 0.8919069308650239, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 28940 + }, + { + "epoch": 0.28941, + "grad_norm": 1.0021654430890485, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 28941 + }, + { + "epoch": 0.28942, + "grad_norm": 1.1888096855212755, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 28942 + }, + { + "epoch": 0.28943, + "grad_norm": 0.9588812091279149, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 28943 + }, + { + "epoch": 0.28944, + "grad_norm": 0.96319800565272, + "learning_rate": 0.003, + "loss": 4.041, + "step": 28944 + }, + { + "epoch": 0.28945, + "grad_norm": 0.8708084918092808, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 28945 + }, + { + "epoch": 0.28946, + "grad_norm": 0.8233991791945643, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 28946 + }, + { + "epoch": 0.28947, + "grad_norm": 0.7912094511926252, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 28947 + }, + { + "epoch": 0.28948, + "grad_norm": 0.7349432140474855, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 28948 + }, + { + "epoch": 0.28949, + "grad_norm": 0.7882877328977578, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 28949 + }, + { + "epoch": 0.2895, + "grad_norm": 0.8479933097718767, + "learning_rate": 0.003, + "loss": 4.033, + "step": 28950 + }, + { + "epoch": 0.28951, + "grad_norm": 0.8238371714602934, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 28951 + }, + { + "epoch": 0.28952, + "grad_norm": 0.7833981751328564, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 28952 + }, + { + "epoch": 0.28953, + "grad_norm": 0.7779321286986476, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 28953 + }, + { + "epoch": 0.28954, + "grad_norm": 0.9222481273354509, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 28954 + }, + { + "epoch": 0.28955, + "grad_norm": 1.0553150709426524, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 28955 + }, + { + "epoch": 0.28956, + "grad_norm": 0.9296024386697446, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 28956 + }, + { + "epoch": 0.28957, + "grad_norm": 0.8057004486816919, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 28957 + }, + { + "epoch": 0.28958, + "grad_norm": 0.711702985987495, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 28958 + }, + { + "epoch": 0.28959, + "grad_norm": 0.7211715957752926, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 28959 + }, + { + "epoch": 0.2896, + "grad_norm": 0.819124571215337, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 28960 + }, + { + "epoch": 0.28961, + "grad_norm": 0.7952229814352482, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 28961 + }, + { + "epoch": 0.28962, + "grad_norm": 0.7533518860602568, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 28962 + }, + { + "epoch": 0.28963, + "grad_norm": 0.8246125360983851, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 28963 + }, + { + "epoch": 0.28964, + "grad_norm": 0.8897105814362145, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 28964 + }, + { + "epoch": 0.28965, + "grad_norm": 1.0294537201267124, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 28965 + }, + { + "epoch": 0.28966, + "grad_norm": 1.0502585848701835, + "learning_rate": 0.003, + "loss": 4.058, + "step": 28966 + }, + { + "epoch": 0.28967, + "grad_norm": 0.92284934628861, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 28967 + }, + { + "epoch": 0.28968, + "grad_norm": 0.9363594262704179, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 28968 + }, + { + "epoch": 0.28969, + "grad_norm": 0.8690761993032236, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 28969 + }, + { + "epoch": 0.2897, + "grad_norm": 0.824409475588958, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 28970 + }, + { + "epoch": 0.28971, + "grad_norm": 0.8035164095434366, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 28971 + }, + { + "epoch": 0.28972, + "grad_norm": 0.7589006996790664, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 28972 + }, + { + "epoch": 0.28973, + "grad_norm": 0.7615179657641672, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 28973 + }, + { + "epoch": 0.28974, + "grad_norm": 0.8884397920607304, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 28974 + }, + { + "epoch": 0.28975, + "grad_norm": 1.1910551675567682, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 28975 + }, + { + "epoch": 0.28976, + "grad_norm": 0.8726550101565395, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 28976 + }, + { + "epoch": 0.28977, + "grad_norm": 0.8365635196242915, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 28977 + }, + { + "epoch": 0.28978, + "grad_norm": 0.903801751634053, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 28978 + }, + { + "epoch": 0.28979, + "grad_norm": 0.9267446222623441, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 28979 + }, + { + "epoch": 0.2898, + "grad_norm": 0.9326200584049709, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 28980 + }, + { + "epoch": 0.28981, + "grad_norm": 0.9669692812758987, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 28981 + }, + { + "epoch": 0.28982, + "grad_norm": 0.966563366586481, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 28982 + }, + { + "epoch": 0.28983, + "grad_norm": 0.8688769261203574, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 28983 + }, + { + "epoch": 0.28984, + "grad_norm": 0.7425761664345544, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 28984 + }, + { + "epoch": 0.28985, + "grad_norm": 0.719820241920268, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 28985 + }, + { + "epoch": 0.28986, + "grad_norm": 0.7084364008142771, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 28986 + }, + { + "epoch": 0.28987, + "grad_norm": 0.6363119903990807, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 28987 + }, + { + "epoch": 0.28988, + "grad_norm": 0.6195040204692277, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 28988 + }, + { + "epoch": 0.28989, + "grad_norm": 0.6076246697955769, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 28989 + }, + { + "epoch": 0.2899, + "grad_norm": 0.7139172986381505, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28990 + }, + { + "epoch": 0.28991, + "grad_norm": 0.9037942607649624, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 28991 + }, + { + "epoch": 0.28992, + "grad_norm": 1.0507100135052365, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 28992 + }, + { + "epoch": 0.28993, + "grad_norm": 0.8792692268518675, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 28993 + }, + { + "epoch": 0.28994, + "grad_norm": 0.8223857944248871, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 28994 + }, + { + "epoch": 0.28995, + "grad_norm": 0.9360105325980802, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 28995 + }, + { + "epoch": 0.28996, + "grad_norm": 0.9247845156632187, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 28996 + }, + { + "epoch": 0.28997, + "grad_norm": 0.9075118864410737, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 28997 + }, + { + "epoch": 0.28998, + "grad_norm": 0.9274920322665163, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 28998 + }, + { + "epoch": 0.28999, + "grad_norm": 0.9301193002309273, + "learning_rate": 0.003, + "loss": 4.062, + "step": 28999 + }, + { + "epoch": 0.29, + "grad_norm": 0.8063186387535685, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 29000 + }, + { + "epoch": 0.29001, + "grad_norm": 0.6998469161690216, + "learning_rate": 0.003, + "loss": 4.0788, + "step": 29001 + }, + { + "epoch": 0.29002, + "grad_norm": 0.6628655320968777, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 29002 + }, + { + "epoch": 0.29003, + "grad_norm": 0.7055446184701589, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 29003 + }, + { + "epoch": 0.29004, + "grad_norm": 0.7977392001401715, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 29004 + }, + { + "epoch": 0.29005, + "grad_norm": 0.9663136620822266, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 29005 + }, + { + "epoch": 0.29006, + "grad_norm": 1.0732239970082185, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 29006 + }, + { + "epoch": 0.29007, + "grad_norm": 0.8839605849640741, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 29007 + }, + { + "epoch": 0.29008, + "grad_norm": 1.0081742265128457, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 29008 + }, + { + "epoch": 0.29009, + "grad_norm": 1.1669634331025756, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 29009 + }, + { + "epoch": 0.2901, + "grad_norm": 0.8496385840039626, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 29010 + }, + { + "epoch": 0.29011, + "grad_norm": 0.7928554066794617, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 29011 + }, + { + "epoch": 0.29012, + "grad_norm": 0.8540558027110825, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29012 + }, + { + "epoch": 0.29013, + "grad_norm": 0.7824705400205778, + "learning_rate": 0.003, + "loss": 4.07, + "step": 29013 + }, + { + "epoch": 0.29014, + "grad_norm": 0.7508384438289558, + "learning_rate": 0.003, + "loss": 4.066, + "step": 29014 + }, + { + "epoch": 0.29015, + "grad_norm": 0.849971212923941, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 29015 + }, + { + "epoch": 0.29016, + "grad_norm": 0.975196621925759, + "learning_rate": 0.003, + "loss": 4.048, + "step": 29016 + }, + { + "epoch": 0.29017, + "grad_norm": 1.1610772629896862, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29017 + }, + { + "epoch": 0.29018, + "grad_norm": 0.9337849370046681, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 29018 + }, + { + "epoch": 0.29019, + "grad_norm": 0.7789384036323321, + "learning_rate": 0.003, + "loss": 4.048, + "step": 29019 + }, + { + "epoch": 0.2902, + "grad_norm": 0.7247002327345203, + "learning_rate": 0.003, + "loss": 4.03, + "step": 29020 + }, + { + "epoch": 0.29021, + "grad_norm": 0.8343945083180758, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 29021 + }, + { + "epoch": 0.29022, + "grad_norm": 0.9794451664189109, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 29022 + }, + { + "epoch": 0.29023, + "grad_norm": 1.1109540449905884, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29023 + }, + { + "epoch": 0.29024, + "grad_norm": 0.9064800008464814, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 29024 + }, + { + "epoch": 0.29025, + "grad_norm": 0.8810147389983761, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 29025 + }, + { + "epoch": 0.29026, + "grad_norm": 1.0028645438064123, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 29026 + }, + { + "epoch": 0.29027, + "grad_norm": 1.0417299244975535, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 29027 + }, + { + "epoch": 0.29028, + "grad_norm": 0.9347711329717023, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 29028 + }, + { + "epoch": 0.29029, + "grad_norm": 0.855309585154985, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 29029 + }, + { + "epoch": 0.2903, + "grad_norm": 0.88988367402162, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 29030 + }, + { + "epoch": 0.29031, + "grad_norm": 1.0074415556997198, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 29031 + }, + { + "epoch": 0.29032, + "grad_norm": 1.0013748834042207, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 29032 + }, + { + "epoch": 0.29033, + "grad_norm": 1.0032936374874766, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 29033 + }, + { + "epoch": 0.29034, + "grad_norm": 1.0440594839172814, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 29034 + }, + { + "epoch": 0.29035, + "grad_norm": 0.9621889667334785, + "learning_rate": 0.003, + "loss": 4.055, + "step": 29035 + }, + { + "epoch": 0.29036, + "grad_norm": 0.8260609841561933, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 29036 + }, + { + "epoch": 0.29037, + "grad_norm": 0.6968933938847688, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 29037 + }, + { + "epoch": 0.29038, + "grad_norm": 0.82885146563968, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 29038 + }, + { + "epoch": 0.29039, + "grad_norm": 0.9718326696076595, + "learning_rate": 0.003, + "loss": 4.087, + "step": 29039 + }, + { + "epoch": 0.2904, + "grad_norm": 0.9866022752244584, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 29040 + }, + { + "epoch": 0.29041, + "grad_norm": 0.958379521653482, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 29041 + }, + { + "epoch": 0.29042, + "grad_norm": 0.7910850266352244, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 29042 + }, + { + "epoch": 0.29043, + "grad_norm": 0.9021784771549576, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 29043 + }, + { + "epoch": 0.29044, + "grad_norm": 1.0413643594825606, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 29044 + }, + { + "epoch": 0.29045, + "grad_norm": 1.0311816862456207, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 29045 + }, + { + "epoch": 0.29046, + "grad_norm": 1.0162135162183141, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 29046 + }, + { + "epoch": 0.29047, + "grad_norm": 1.030962499576916, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 29047 + }, + { + "epoch": 0.29048, + "grad_norm": 1.1003512882897302, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 29048 + }, + { + "epoch": 0.29049, + "grad_norm": 0.7823182390929905, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 29049 + }, + { + "epoch": 0.2905, + "grad_norm": 0.6700577133750076, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 29050 + }, + { + "epoch": 0.29051, + "grad_norm": 0.5724679014614362, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 29051 + }, + { + "epoch": 0.29052, + "grad_norm": 0.5446329882275273, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 29052 + }, + { + "epoch": 0.29053, + "grad_norm": 0.5767621905010737, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29053 + }, + { + "epoch": 0.29054, + "grad_norm": 0.6393633539014837, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 29054 + }, + { + "epoch": 0.29055, + "grad_norm": 0.7523622361464737, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 29055 + }, + { + "epoch": 0.29056, + "grad_norm": 0.8856001813867815, + "learning_rate": 0.003, + "loss": 4.039, + "step": 29056 + }, + { + "epoch": 0.29057, + "grad_norm": 1.0676416978946066, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 29057 + }, + { + "epoch": 0.29058, + "grad_norm": 0.882066029945932, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 29058 + }, + { + "epoch": 0.29059, + "grad_norm": 0.641676822051498, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 29059 + }, + { + "epoch": 0.2906, + "grad_norm": 0.5952505024829797, + "learning_rate": 0.003, + "loss": 4.015, + "step": 29060 + }, + { + "epoch": 0.29061, + "grad_norm": 0.7347464262698876, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 29061 + }, + { + "epoch": 0.29062, + "grad_norm": 0.8049741919398687, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 29062 + }, + { + "epoch": 0.29063, + "grad_norm": 0.7292550614254075, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 29063 + }, + { + "epoch": 0.29064, + "grad_norm": 0.64751892168709, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 29064 + }, + { + "epoch": 0.29065, + "grad_norm": 0.6988684266375307, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 29065 + }, + { + "epoch": 0.29066, + "grad_norm": 0.7879739710066137, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 29066 + }, + { + "epoch": 0.29067, + "grad_norm": 0.7912355083608571, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 29067 + }, + { + "epoch": 0.29068, + "grad_norm": 0.7466234972113204, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 29068 + }, + { + "epoch": 0.29069, + "grad_norm": 0.7467073765721259, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29069 + }, + { + "epoch": 0.2907, + "grad_norm": 0.7522997760474645, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29070 + }, + { + "epoch": 0.29071, + "grad_norm": 0.7945991428872144, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 29071 + }, + { + "epoch": 0.29072, + "grad_norm": 0.9778435310244655, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 29072 + }, + { + "epoch": 0.29073, + "grad_norm": 1.2424869038419357, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 29073 + }, + { + "epoch": 0.29074, + "grad_norm": 0.6971897608627028, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 29074 + }, + { + "epoch": 0.29075, + "grad_norm": 0.6244573819422945, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 29075 + }, + { + "epoch": 0.29076, + "grad_norm": 0.5621536073217575, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 29076 + }, + { + "epoch": 0.29077, + "grad_norm": 0.5955133103667601, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 29077 + }, + { + "epoch": 0.29078, + "grad_norm": 0.636876995375685, + "learning_rate": 0.003, + "loss": 4.01, + "step": 29078 + }, + { + "epoch": 0.29079, + "grad_norm": 0.7145822512966056, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29079 + }, + { + "epoch": 0.2908, + "grad_norm": 0.8587816043339325, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 29080 + }, + { + "epoch": 0.29081, + "grad_norm": 0.9886681826339178, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 29081 + }, + { + "epoch": 0.29082, + "grad_norm": 1.1663563796101664, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 29082 + }, + { + "epoch": 0.29083, + "grad_norm": 1.0939174611670122, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 29083 + }, + { + "epoch": 0.29084, + "grad_norm": 0.9684962154354666, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 29084 + }, + { + "epoch": 0.29085, + "grad_norm": 0.9636114080243234, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 29085 + }, + { + "epoch": 0.29086, + "grad_norm": 1.0003433449468262, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 29086 + }, + { + "epoch": 0.29087, + "grad_norm": 0.9987590940573388, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 29087 + }, + { + "epoch": 0.29088, + "grad_norm": 0.9677320768125557, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 29088 + }, + { + "epoch": 0.29089, + "grad_norm": 0.9013250872209218, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 29089 + }, + { + "epoch": 0.2909, + "grad_norm": 0.8541432778093003, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 29090 + }, + { + "epoch": 0.29091, + "grad_norm": 0.7774036175780287, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 29091 + }, + { + "epoch": 0.29092, + "grad_norm": 0.7504335544785947, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29092 + }, + { + "epoch": 0.29093, + "grad_norm": 0.7248556266488336, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 29093 + }, + { + "epoch": 0.29094, + "grad_norm": 0.7611453350552821, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 29094 + }, + { + "epoch": 0.29095, + "grad_norm": 0.9036233775356234, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 29095 + }, + { + "epoch": 0.29096, + "grad_norm": 0.9608494345301011, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 29096 + }, + { + "epoch": 0.29097, + "grad_norm": 1.0865538487596067, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 29097 + }, + { + "epoch": 0.29098, + "grad_norm": 1.0088182214699868, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29098 + }, + { + "epoch": 0.29099, + "grad_norm": 0.8686632910774595, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 29099 + }, + { + "epoch": 0.291, + "grad_norm": 0.7632585279792381, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 29100 + }, + { + "epoch": 0.29101, + "grad_norm": 0.9137215478277559, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 29101 + }, + { + "epoch": 0.29102, + "grad_norm": 1.0876449179019083, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 29102 + }, + { + "epoch": 0.29103, + "grad_norm": 1.091843808926389, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 29103 + }, + { + "epoch": 0.29104, + "grad_norm": 1.1068651830643574, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 29104 + }, + { + "epoch": 0.29105, + "grad_norm": 1.0828130287017224, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 29105 + }, + { + "epoch": 0.29106, + "grad_norm": 0.8604116004067545, + "learning_rate": 0.003, + "loss": 4.055, + "step": 29106 + }, + { + "epoch": 0.29107, + "grad_norm": 0.7828077053221636, + "learning_rate": 0.003, + "loss": 4.02, + "step": 29107 + }, + { + "epoch": 0.29108, + "grad_norm": 0.7767082333441759, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 29108 + }, + { + "epoch": 0.29109, + "grad_norm": 0.7946975472712052, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 29109 + }, + { + "epoch": 0.2911, + "grad_norm": 0.7326015559378439, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 29110 + }, + { + "epoch": 0.29111, + "grad_norm": 0.6481010880823052, + "learning_rate": 0.003, + "loss": 4.052, + "step": 29111 + }, + { + "epoch": 0.29112, + "grad_norm": 0.5679321995182853, + "learning_rate": 0.003, + "loss": 4.053, + "step": 29112 + }, + { + "epoch": 0.29113, + "grad_norm": 0.6982882691454518, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 29113 + }, + { + "epoch": 0.29114, + "grad_norm": 0.8768824511204043, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 29114 + }, + { + "epoch": 0.29115, + "grad_norm": 1.009020915345648, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29115 + }, + { + "epoch": 0.29116, + "grad_norm": 1.1124340972151603, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 29116 + }, + { + "epoch": 0.29117, + "grad_norm": 0.8486503157805139, + "learning_rate": 0.003, + "loss": 4.061, + "step": 29117 + }, + { + "epoch": 0.29118, + "grad_norm": 0.7301274138635538, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 29118 + }, + { + "epoch": 0.29119, + "grad_norm": 0.677504566770362, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29119 + }, + { + "epoch": 0.2912, + "grad_norm": 0.7040692663735684, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 29120 + }, + { + "epoch": 0.29121, + "grad_norm": 0.7662274462026148, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29121 + }, + { + "epoch": 0.29122, + "grad_norm": 0.8316691644135649, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29122 + }, + { + "epoch": 0.29123, + "grad_norm": 0.8281910237430045, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 29123 + }, + { + "epoch": 0.29124, + "grad_norm": 0.7926456141834706, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29124 + }, + { + "epoch": 0.29125, + "grad_norm": 0.7986394671081941, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 29125 + }, + { + "epoch": 0.29126, + "grad_norm": 0.8069511561975887, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29126 + }, + { + "epoch": 0.29127, + "grad_norm": 0.8303218114562186, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 29127 + }, + { + "epoch": 0.29128, + "grad_norm": 0.8826488097190619, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29128 + }, + { + "epoch": 0.29129, + "grad_norm": 0.7946248949147684, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 29129 + }, + { + "epoch": 0.2913, + "grad_norm": 0.7271279939730684, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 29130 + }, + { + "epoch": 0.29131, + "grad_norm": 0.7644075406714997, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 29131 + }, + { + "epoch": 0.29132, + "grad_norm": 0.8223171611208211, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 29132 + }, + { + "epoch": 0.29133, + "grad_norm": 0.9724050191743714, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 29133 + }, + { + "epoch": 0.29134, + "grad_norm": 1.0140588538320097, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 29134 + }, + { + "epoch": 0.29135, + "grad_norm": 0.8474992815444606, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 29135 + }, + { + "epoch": 0.29136, + "grad_norm": 0.7710624274013632, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 29136 + }, + { + "epoch": 0.29137, + "grad_norm": 0.8816974042118865, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 29137 + }, + { + "epoch": 0.29138, + "grad_norm": 0.9964430355257407, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29138 + }, + { + "epoch": 0.29139, + "grad_norm": 0.8346197104328477, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 29139 + }, + { + "epoch": 0.2914, + "grad_norm": 0.8363763717419509, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 29140 + }, + { + "epoch": 0.29141, + "grad_norm": 0.9938523638387123, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 29141 + }, + { + "epoch": 0.29142, + "grad_norm": 1.031964046885035, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 29142 + }, + { + "epoch": 0.29143, + "grad_norm": 1.0924702721225699, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 29143 + }, + { + "epoch": 0.29144, + "grad_norm": 1.082177194103632, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 29144 + }, + { + "epoch": 0.29145, + "grad_norm": 1.137026832348986, + "learning_rate": 0.003, + "loss": 4.086, + "step": 29145 + }, + { + "epoch": 0.29146, + "grad_norm": 0.9801174501113451, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 29146 + }, + { + "epoch": 0.29147, + "grad_norm": 1.105314278321518, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 29147 + }, + { + "epoch": 0.29148, + "grad_norm": 1.029273748969058, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 29148 + }, + { + "epoch": 0.29149, + "grad_norm": 1.0235732170606477, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 29149 + }, + { + "epoch": 0.2915, + "grad_norm": 0.8434851133467528, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29150 + }, + { + "epoch": 0.29151, + "grad_norm": 0.7950055688953716, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 29151 + }, + { + "epoch": 0.29152, + "grad_norm": 0.8142024325311407, + "learning_rate": 0.003, + "loss": 4.06, + "step": 29152 + }, + { + "epoch": 0.29153, + "grad_norm": 0.8334174790882151, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 29153 + }, + { + "epoch": 0.29154, + "grad_norm": 0.9762462029590928, + "learning_rate": 0.003, + "loss": 4.062, + "step": 29154 + }, + { + "epoch": 0.29155, + "grad_norm": 0.9922185794266971, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 29155 + }, + { + "epoch": 0.29156, + "grad_norm": 0.8895402332661667, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 29156 + }, + { + "epoch": 0.29157, + "grad_norm": 0.7973012672503906, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 29157 + }, + { + "epoch": 0.29158, + "grad_norm": 0.7274269102993839, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 29158 + }, + { + "epoch": 0.29159, + "grad_norm": 0.6531038225079099, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 29159 + }, + { + "epoch": 0.2916, + "grad_norm": 0.6493453009214776, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 29160 + }, + { + "epoch": 0.29161, + "grad_norm": 0.7864708137416616, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 29161 + }, + { + "epoch": 0.29162, + "grad_norm": 0.8975987020304729, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 29162 + }, + { + "epoch": 0.29163, + "grad_norm": 1.0543012784670018, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 29163 + }, + { + "epoch": 0.29164, + "grad_norm": 0.8735621383671371, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29164 + }, + { + "epoch": 0.29165, + "grad_norm": 0.7138285495075147, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 29165 + }, + { + "epoch": 0.29166, + "grad_norm": 0.6870998365700985, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 29166 + }, + { + "epoch": 0.29167, + "grad_norm": 0.7753553410597777, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29167 + }, + { + "epoch": 0.29168, + "grad_norm": 0.9323123392932248, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 29168 + }, + { + "epoch": 0.29169, + "grad_norm": 0.9055132492437714, + "learning_rate": 0.003, + "loss": 4.042, + "step": 29169 + }, + { + "epoch": 0.2917, + "grad_norm": 0.8682574314297339, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 29170 + }, + { + "epoch": 0.29171, + "grad_norm": 0.8597638417454839, + "learning_rate": 0.003, + "loss": 4.05, + "step": 29171 + }, + { + "epoch": 0.29172, + "grad_norm": 0.8230271795583927, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 29172 + }, + { + "epoch": 0.29173, + "grad_norm": 0.7930967870319119, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 29173 + }, + { + "epoch": 0.29174, + "grad_norm": 0.831662052335709, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 29174 + }, + { + "epoch": 0.29175, + "grad_norm": 1.0465677821426422, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 29175 + }, + { + "epoch": 0.29176, + "grad_norm": 1.1632088759592274, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 29176 + }, + { + "epoch": 0.29177, + "grad_norm": 0.8445603997645509, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 29177 + }, + { + "epoch": 0.29178, + "grad_norm": 0.8052434933886632, + "learning_rate": 0.003, + "loss": 4.036, + "step": 29178 + }, + { + "epoch": 0.29179, + "grad_norm": 0.7763277938392388, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 29179 + }, + { + "epoch": 0.2918, + "grad_norm": 0.8580652102447227, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 29180 + }, + { + "epoch": 0.29181, + "grad_norm": 0.9614079622945705, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 29181 + }, + { + "epoch": 0.29182, + "grad_norm": 0.9743996187795096, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29182 + }, + { + "epoch": 0.29183, + "grad_norm": 1.0054756520846415, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 29183 + }, + { + "epoch": 0.29184, + "grad_norm": 0.932837054364316, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 29184 + }, + { + "epoch": 0.29185, + "grad_norm": 0.788973956754135, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29185 + }, + { + "epoch": 0.29186, + "grad_norm": 0.6747679546183112, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 29186 + }, + { + "epoch": 0.29187, + "grad_norm": 0.6987433246156082, + "learning_rate": 0.003, + "loss": 4.029, + "step": 29187 + }, + { + "epoch": 0.29188, + "grad_norm": 0.657936457458693, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29188 + }, + { + "epoch": 0.29189, + "grad_norm": 0.575536929539275, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 29189 + }, + { + "epoch": 0.2919, + "grad_norm": 0.7164055895233915, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 29190 + }, + { + "epoch": 0.29191, + "grad_norm": 0.8941036088036134, + "learning_rate": 0.003, + "loss": 4.03, + "step": 29191 + }, + { + "epoch": 0.29192, + "grad_norm": 0.8920879001504314, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 29192 + }, + { + "epoch": 0.29193, + "grad_norm": 0.8214872567383538, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29193 + }, + { + "epoch": 0.29194, + "grad_norm": 0.9175091876800733, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 29194 + }, + { + "epoch": 0.29195, + "grad_norm": 0.9872106304650676, + "learning_rate": 0.003, + "loss": 4.016, + "step": 29195 + }, + { + "epoch": 0.29196, + "grad_norm": 1.1176477508342733, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 29196 + }, + { + "epoch": 0.29197, + "grad_norm": 0.9619487907506075, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 29197 + }, + { + "epoch": 0.29198, + "grad_norm": 0.9776565126326301, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 29198 + }, + { + "epoch": 0.29199, + "grad_norm": 0.9949532674466596, + "learning_rate": 0.003, + "loss": 4.081, + "step": 29199 + }, + { + "epoch": 0.292, + "grad_norm": 1.0320563204471962, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 29200 + }, + { + "epoch": 0.29201, + "grad_norm": 0.9345325605721327, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 29201 + }, + { + "epoch": 0.29202, + "grad_norm": 1.0742316754594423, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 29202 + }, + { + "epoch": 0.29203, + "grad_norm": 1.0027997059284104, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 29203 + }, + { + "epoch": 0.29204, + "grad_norm": 0.928447195293581, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 29204 + }, + { + "epoch": 0.29205, + "grad_norm": 0.8456000591072352, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 29205 + }, + { + "epoch": 0.29206, + "grad_norm": 0.9844721961679435, + "learning_rate": 0.003, + "loss": 4.051, + "step": 29206 + }, + { + "epoch": 0.29207, + "grad_norm": 1.1335750333174595, + "learning_rate": 0.003, + "loss": 4.041, + "step": 29207 + }, + { + "epoch": 0.29208, + "grad_norm": 0.9075446818762009, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 29208 + }, + { + "epoch": 0.29209, + "grad_norm": 0.9809588662354749, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 29209 + }, + { + "epoch": 0.2921, + "grad_norm": 1.029200134176531, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 29210 + }, + { + "epoch": 0.29211, + "grad_norm": 0.868850439967874, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29211 + }, + { + "epoch": 0.29212, + "grad_norm": 0.873708167758466, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 29212 + }, + { + "epoch": 0.29213, + "grad_norm": 0.8158447183498174, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 29213 + }, + { + "epoch": 0.29214, + "grad_norm": 0.6507164469894889, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 29214 + }, + { + "epoch": 0.29215, + "grad_norm": 0.6861763273810364, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 29215 + }, + { + "epoch": 0.29216, + "grad_norm": 0.7143574097076757, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 29216 + }, + { + "epoch": 0.29217, + "grad_norm": 0.7327039244228825, + "learning_rate": 0.003, + "loss": 4.048, + "step": 29217 + }, + { + "epoch": 0.29218, + "grad_norm": 0.6485960094681973, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 29218 + }, + { + "epoch": 0.29219, + "grad_norm": 0.6981930432351805, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 29219 + }, + { + "epoch": 0.2922, + "grad_norm": 0.6992820644232114, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 29220 + }, + { + "epoch": 0.29221, + "grad_norm": 0.6962057632360914, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 29221 + }, + { + "epoch": 0.29222, + "grad_norm": 0.6977666355584855, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 29222 + }, + { + "epoch": 0.29223, + "grad_norm": 0.7465474077780243, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 29223 + }, + { + "epoch": 0.29224, + "grad_norm": 0.9517453483707403, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 29224 + }, + { + "epoch": 0.29225, + "grad_norm": 1.1218142744214015, + "learning_rate": 0.003, + "loss": 4.009, + "step": 29225 + }, + { + "epoch": 0.29226, + "grad_norm": 0.9471998580493735, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 29226 + }, + { + "epoch": 0.29227, + "grad_norm": 0.853073856561557, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 29227 + }, + { + "epoch": 0.29228, + "grad_norm": 0.7099725582606511, + "learning_rate": 0.003, + "loss": 4.032, + "step": 29228 + }, + { + "epoch": 0.29229, + "grad_norm": 0.5629308853205258, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 29229 + }, + { + "epoch": 0.2923, + "grad_norm": 0.6419144848869232, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 29230 + }, + { + "epoch": 0.29231, + "grad_norm": 0.7551077407406445, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 29231 + }, + { + "epoch": 0.29232, + "grad_norm": 0.7626643978592605, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 29232 + }, + { + "epoch": 0.29233, + "grad_norm": 0.8590399794655413, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 29233 + }, + { + "epoch": 0.29234, + "grad_norm": 0.9664789537192814, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 29234 + }, + { + "epoch": 0.29235, + "grad_norm": 0.9508569048809632, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 29235 + }, + { + "epoch": 0.29236, + "grad_norm": 0.8572733667249588, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 29236 + }, + { + "epoch": 0.29237, + "grad_norm": 0.8374413998244363, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29237 + }, + { + "epoch": 0.29238, + "grad_norm": 0.8735640718375238, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 29238 + }, + { + "epoch": 0.29239, + "grad_norm": 0.841671237794719, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 29239 + }, + { + "epoch": 0.2924, + "grad_norm": 0.7575902159207949, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29240 + }, + { + "epoch": 0.29241, + "grad_norm": 0.7566043571661052, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 29241 + }, + { + "epoch": 0.29242, + "grad_norm": 0.79825792028567, + "learning_rate": 0.003, + "loss": 4.017, + "step": 29242 + }, + { + "epoch": 0.29243, + "grad_norm": 0.8159087056042108, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 29243 + }, + { + "epoch": 0.29244, + "grad_norm": 0.9249957025315871, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 29244 + }, + { + "epoch": 0.29245, + "grad_norm": 0.9701232118893561, + "learning_rate": 0.003, + "loss": 4.06, + "step": 29245 + }, + { + "epoch": 0.29246, + "grad_norm": 0.8872888400042878, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 29246 + }, + { + "epoch": 0.29247, + "grad_norm": 0.8944752668375463, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29247 + }, + { + "epoch": 0.29248, + "grad_norm": 0.9139806627538845, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 29248 + }, + { + "epoch": 0.29249, + "grad_norm": 0.9963871026578371, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 29249 + }, + { + "epoch": 0.2925, + "grad_norm": 1.0543277604006016, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 29250 + }, + { + "epoch": 0.29251, + "grad_norm": 0.8844306037002795, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 29251 + }, + { + "epoch": 0.29252, + "grad_norm": 0.9673732563168388, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 29252 + }, + { + "epoch": 0.29253, + "grad_norm": 1.1085691494239849, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 29253 + }, + { + "epoch": 0.29254, + "grad_norm": 0.956399186563831, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 29254 + }, + { + "epoch": 0.29255, + "grad_norm": 0.981087886241805, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29255 + }, + { + "epoch": 0.29256, + "grad_norm": 0.994106705551306, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 29256 + }, + { + "epoch": 0.29257, + "grad_norm": 0.9929782391556476, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 29257 + }, + { + "epoch": 0.29258, + "grad_norm": 1.0645426371806397, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 29258 + }, + { + "epoch": 0.29259, + "grad_norm": 1.049779212907228, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 29259 + }, + { + "epoch": 0.2926, + "grad_norm": 0.9297195795311273, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 29260 + }, + { + "epoch": 0.29261, + "grad_norm": 0.7790024594330531, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 29261 + }, + { + "epoch": 0.29262, + "grad_norm": 0.7555906336654221, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 29262 + }, + { + "epoch": 0.29263, + "grad_norm": 0.7760064421155114, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 29263 + }, + { + "epoch": 0.29264, + "grad_norm": 0.8353561432244543, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 29264 + }, + { + "epoch": 0.29265, + "grad_norm": 0.8007325510098213, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 29265 + }, + { + "epoch": 0.29266, + "grad_norm": 0.8022486526402812, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 29266 + }, + { + "epoch": 0.29267, + "grad_norm": 0.8601438121014888, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 29267 + }, + { + "epoch": 0.29268, + "grad_norm": 1.0729872349764427, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 29268 + }, + { + "epoch": 0.29269, + "grad_norm": 0.9725287227646507, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29269 + }, + { + "epoch": 0.2927, + "grad_norm": 0.9554655999488532, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 29270 + }, + { + "epoch": 0.29271, + "grad_norm": 0.9392125538291688, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 29271 + }, + { + "epoch": 0.29272, + "grad_norm": 0.9184674691401653, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 29272 + }, + { + "epoch": 0.29273, + "grad_norm": 0.8498709279276783, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 29273 + }, + { + "epoch": 0.29274, + "grad_norm": 0.7657970703606856, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 29274 + }, + { + "epoch": 0.29275, + "grad_norm": 0.7396758246519036, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 29275 + }, + { + "epoch": 0.29276, + "grad_norm": 0.6328088429878861, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29276 + }, + { + "epoch": 0.29277, + "grad_norm": 0.6582458836254869, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29277 + }, + { + "epoch": 0.29278, + "grad_norm": 0.6649561771400734, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 29278 + }, + { + "epoch": 0.29279, + "grad_norm": 0.6536574785301233, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 29279 + }, + { + "epoch": 0.2928, + "grad_norm": 0.7517055026256518, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29280 + }, + { + "epoch": 0.29281, + "grad_norm": 0.8912560338568013, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29281 + }, + { + "epoch": 0.29282, + "grad_norm": 0.9959478073835422, + "learning_rate": 0.003, + "loss": 4.022, + "step": 29282 + }, + { + "epoch": 0.29283, + "grad_norm": 0.9046294925219313, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 29283 + }, + { + "epoch": 0.29284, + "grad_norm": 0.7797624875132041, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 29284 + }, + { + "epoch": 0.29285, + "grad_norm": 0.8077708132175788, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 29285 + }, + { + "epoch": 0.29286, + "grad_norm": 0.8695378672606314, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 29286 + }, + { + "epoch": 0.29287, + "grad_norm": 0.8987457946756661, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 29287 + }, + { + "epoch": 0.29288, + "grad_norm": 0.7780386503779303, + "learning_rate": 0.003, + "loss": 4.027, + "step": 29288 + }, + { + "epoch": 0.29289, + "grad_norm": 0.8709870923387596, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 29289 + }, + { + "epoch": 0.2929, + "grad_norm": 1.0456277059453967, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 29290 + }, + { + "epoch": 0.29291, + "grad_norm": 1.1723855708269368, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 29291 + }, + { + "epoch": 0.29292, + "grad_norm": 0.8104458824260034, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 29292 + }, + { + "epoch": 0.29293, + "grad_norm": 0.7300959197410518, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 29293 + }, + { + "epoch": 0.29294, + "grad_norm": 0.8325433941364638, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 29294 + }, + { + "epoch": 0.29295, + "grad_norm": 0.7752263757912774, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 29295 + }, + { + "epoch": 0.29296, + "grad_norm": 0.7330061442429696, + "learning_rate": 0.003, + "loss": 4.036, + "step": 29296 + }, + { + "epoch": 0.29297, + "grad_norm": 0.7606785513405115, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29297 + }, + { + "epoch": 0.29298, + "grad_norm": 0.7752443979213537, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 29298 + }, + { + "epoch": 0.29299, + "grad_norm": 0.7163303971309438, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 29299 + }, + { + "epoch": 0.293, + "grad_norm": 0.7078724538567072, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 29300 + }, + { + "epoch": 0.29301, + "grad_norm": 0.7400479243784089, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 29301 + }, + { + "epoch": 0.29302, + "grad_norm": 0.614657902074472, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 29302 + }, + { + "epoch": 0.29303, + "grad_norm": 0.592817221258346, + "learning_rate": 0.003, + "loss": 4.0023, + "step": 29303 + }, + { + "epoch": 0.29304, + "grad_norm": 0.586983309838342, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 29304 + }, + { + "epoch": 0.29305, + "grad_norm": 0.672445884429573, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 29305 + }, + { + "epoch": 0.29306, + "grad_norm": 0.8363999158206532, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 29306 + }, + { + "epoch": 0.29307, + "grad_norm": 0.9472930103111694, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 29307 + }, + { + "epoch": 0.29308, + "grad_norm": 1.1250665591571334, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29308 + }, + { + "epoch": 0.29309, + "grad_norm": 0.891751435054052, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 29309 + }, + { + "epoch": 0.2931, + "grad_norm": 0.8581268495889038, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 29310 + }, + { + "epoch": 0.29311, + "grad_norm": 0.8691667252222027, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 29311 + }, + { + "epoch": 0.29312, + "grad_norm": 0.9536017613377106, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 29312 + }, + { + "epoch": 0.29313, + "grad_norm": 1.0694797475805728, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 29313 + }, + { + "epoch": 0.29314, + "grad_norm": 1.0938337418640525, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 29314 + }, + { + "epoch": 0.29315, + "grad_norm": 0.8457552211769384, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 29315 + }, + { + "epoch": 0.29316, + "grad_norm": 0.6826360826222817, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 29316 + }, + { + "epoch": 0.29317, + "grad_norm": 0.6417864421668424, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 29317 + }, + { + "epoch": 0.29318, + "grad_norm": 0.6124779443548072, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 29318 + }, + { + "epoch": 0.29319, + "grad_norm": 0.6206130616811728, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 29319 + }, + { + "epoch": 0.2932, + "grad_norm": 0.7004393581641024, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 29320 + }, + { + "epoch": 0.29321, + "grad_norm": 0.8542966359168971, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29321 + }, + { + "epoch": 0.29322, + "grad_norm": 0.893787775535247, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 29322 + }, + { + "epoch": 0.29323, + "grad_norm": 1.036254246874217, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 29323 + }, + { + "epoch": 0.29324, + "grad_norm": 1.1285848478345955, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 29324 + }, + { + "epoch": 0.29325, + "grad_norm": 1.0624341873589678, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 29325 + }, + { + "epoch": 0.29326, + "grad_norm": 1.1898100802304177, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 29326 + }, + { + "epoch": 0.29327, + "grad_norm": 0.9155271668904038, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 29327 + }, + { + "epoch": 0.29328, + "grad_norm": 0.7832101885107661, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 29328 + }, + { + "epoch": 0.29329, + "grad_norm": 0.8357660496198622, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 29329 + }, + { + "epoch": 0.2933, + "grad_norm": 0.8180783038957523, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 29330 + }, + { + "epoch": 0.29331, + "grad_norm": 0.766684286497782, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 29331 + }, + { + "epoch": 0.29332, + "grad_norm": 0.7822793183487627, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 29332 + }, + { + "epoch": 0.29333, + "grad_norm": 0.9624853512287003, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 29333 + }, + { + "epoch": 0.29334, + "grad_norm": 1.1986404088821028, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29334 + }, + { + "epoch": 0.29335, + "grad_norm": 1.0079024707435384, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 29335 + }, + { + "epoch": 0.29336, + "grad_norm": 0.9257803558030068, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 29336 + }, + { + "epoch": 0.29337, + "grad_norm": 0.8121587682686504, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 29337 + }, + { + "epoch": 0.29338, + "grad_norm": 0.6169157464172522, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 29338 + }, + { + "epoch": 0.29339, + "grad_norm": 0.6479845919171563, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 29339 + }, + { + "epoch": 0.2934, + "grad_norm": 0.6429919865765577, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 29340 + }, + { + "epoch": 0.29341, + "grad_norm": 0.7489589063393629, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 29341 + }, + { + "epoch": 0.29342, + "grad_norm": 0.7818093763508773, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 29342 + }, + { + "epoch": 0.29343, + "grad_norm": 0.8723346775893264, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 29343 + }, + { + "epoch": 0.29344, + "grad_norm": 1.0294862172880013, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 29344 + }, + { + "epoch": 0.29345, + "grad_norm": 0.9509432984652021, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 29345 + }, + { + "epoch": 0.29346, + "grad_norm": 1.0070727284803271, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 29346 + }, + { + "epoch": 0.29347, + "grad_norm": 1.0465082248263042, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 29347 + }, + { + "epoch": 0.29348, + "grad_norm": 0.9302677982521874, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 29348 + }, + { + "epoch": 0.29349, + "grad_norm": 0.7910710682233116, + "learning_rate": 0.003, + "loss": 4.056, + "step": 29349 + }, + { + "epoch": 0.2935, + "grad_norm": 0.9211929392420392, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 29350 + }, + { + "epoch": 0.29351, + "grad_norm": 1.136217527111664, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 29351 + }, + { + "epoch": 0.29352, + "grad_norm": 0.9763175957886535, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 29352 + }, + { + "epoch": 0.29353, + "grad_norm": 1.0550535061560031, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29353 + }, + { + "epoch": 0.29354, + "grad_norm": 1.1804677918962205, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 29354 + }, + { + "epoch": 0.29355, + "grad_norm": 0.8818082137386197, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 29355 + }, + { + "epoch": 0.29356, + "grad_norm": 0.8432497120911839, + "learning_rate": 0.003, + "loss": 4.067, + "step": 29356 + }, + { + "epoch": 0.29357, + "grad_norm": 0.8183324006555233, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 29357 + }, + { + "epoch": 0.29358, + "grad_norm": 0.8253824736379026, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 29358 + }, + { + "epoch": 0.29359, + "grad_norm": 0.7472111524234776, + "learning_rate": 0.003, + "loss": 4.0936, + "step": 29359 + }, + { + "epoch": 0.2936, + "grad_norm": 0.6769350553960137, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 29360 + }, + { + "epoch": 0.29361, + "grad_norm": 0.8636802864298198, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 29361 + }, + { + "epoch": 0.29362, + "grad_norm": 0.8750581057134919, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 29362 + }, + { + "epoch": 0.29363, + "grad_norm": 0.700701774055599, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 29363 + }, + { + "epoch": 0.29364, + "grad_norm": 0.6499345477004443, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 29364 + }, + { + "epoch": 0.29365, + "grad_norm": 0.7373235784125733, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 29365 + }, + { + "epoch": 0.29366, + "grad_norm": 0.9383187433769475, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 29366 + }, + { + "epoch": 0.29367, + "grad_norm": 1.2180201920740534, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 29367 + }, + { + "epoch": 0.29368, + "grad_norm": 0.8603715946271335, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 29368 + }, + { + "epoch": 0.29369, + "grad_norm": 0.7783481370551975, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 29369 + }, + { + "epoch": 0.2937, + "grad_norm": 0.7053504760861421, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 29370 + }, + { + "epoch": 0.29371, + "grad_norm": 0.6381069701125877, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29371 + }, + { + "epoch": 0.29372, + "grad_norm": 0.5681433003024388, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 29372 + }, + { + "epoch": 0.29373, + "grad_norm": 0.5562354470133768, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29373 + }, + { + "epoch": 0.29374, + "grad_norm": 0.5521219069834646, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 29374 + }, + { + "epoch": 0.29375, + "grad_norm": 0.6518573580635623, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 29375 + }, + { + "epoch": 0.29376, + "grad_norm": 0.8445787062573374, + "learning_rate": 0.003, + "loss": 3.9676, + "step": 29376 + }, + { + "epoch": 0.29377, + "grad_norm": 1.0716315636604676, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 29377 + }, + { + "epoch": 0.29378, + "grad_norm": 0.8855734739817278, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 29378 + }, + { + "epoch": 0.29379, + "grad_norm": 0.7040999874748098, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 29379 + }, + { + "epoch": 0.2938, + "grad_norm": 0.7885734862809363, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 29380 + }, + { + "epoch": 0.29381, + "grad_norm": 0.8772073173017452, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 29381 + }, + { + "epoch": 0.29382, + "grad_norm": 1.1070383687332594, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 29382 + }, + { + "epoch": 0.29383, + "grad_norm": 1.1823486821085294, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 29383 + }, + { + "epoch": 0.29384, + "grad_norm": 0.824068428310071, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 29384 + }, + { + "epoch": 0.29385, + "grad_norm": 0.7665301938662346, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29385 + }, + { + "epoch": 0.29386, + "grad_norm": 0.7195913168185638, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29386 + }, + { + "epoch": 0.29387, + "grad_norm": 0.7467475163534808, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 29387 + }, + { + "epoch": 0.29388, + "grad_norm": 0.7378043214796863, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 29388 + }, + { + "epoch": 0.29389, + "grad_norm": 0.7729985333158655, + "learning_rate": 0.003, + "loss": 4.012, + "step": 29389 + }, + { + "epoch": 0.2939, + "grad_norm": 0.8485866748452497, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 29390 + }, + { + "epoch": 0.29391, + "grad_norm": 0.8846622982264443, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 29391 + }, + { + "epoch": 0.29392, + "grad_norm": 1.0386675904125229, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 29392 + }, + { + "epoch": 0.29393, + "grad_norm": 1.143344713470598, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 29393 + }, + { + "epoch": 0.29394, + "grad_norm": 0.8325357363399948, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 29394 + }, + { + "epoch": 0.29395, + "grad_norm": 0.8474904371237587, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 29395 + }, + { + "epoch": 0.29396, + "grad_norm": 0.9194393410525915, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 29396 + }, + { + "epoch": 0.29397, + "grad_norm": 1.1119271052996582, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 29397 + }, + { + "epoch": 0.29398, + "grad_norm": 0.890142378599785, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 29398 + }, + { + "epoch": 0.29399, + "grad_norm": 0.775042444165704, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 29399 + }, + { + "epoch": 0.294, + "grad_norm": 0.6977583946766327, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29400 + }, + { + "epoch": 0.29401, + "grad_norm": 0.7195391011420256, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 29401 + }, + { + "epoch": 0.29402, + "grad_norm": 0.7237915768477647, + "learning_rate": 0.003, + "loss": 4.057, + "step": 29402 + }, + { + "epoch": 0.29403, + "grad_norm": 0.8111438671199099, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 29403 + }, + { + "epoch": 0.29404, + "grad_norm": 0.9547673336666775, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 29404 + }, + { + "epoch": 0.29405, + "grad_norm": 1.0508424483800807, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29405 + }, + { + "epoch": 0.29406, + "grad_norm": 1.0430394877156017, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 29406 + }, + { + "epoch": 0.29407, + "grad_norm": 1.0257595835759235, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 29407 + }, + { + "epoch": 0.29408, + "grad_norm": 0.8901802891087471, + "learning_rate": 0.003, + "loss": 4.045, + "step": 29408 + }, + { + "epoch": 0.29409, + "grad_norm": 1.0341414318962376, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 29409 + }, + { + "epoch": 0.2941, + "grad_norm": 0.9075300764837801, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 29410 + }, + { + "epoch": 0.29411, + "grad_norm": 0.8329102074623471, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 29411 + }, + { + "epoch": 0.29412, + "grad_norm": 0.8968233295213194, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 29412 + }, + { + "epoch": 0.29413, + "grad_norm": 0.8382196147532294, + "learning_rate": 0.003, + "loss": 4.037, + "step": 29413 + }, + { + "epoch": 0.29414, + "grad_norm": 0.7916301118266401, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 29414 + }, + { + "epoch": 0.29415, + "grad_norm": 0.907773529494859, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 29415 + }, + { + "epoch": 0.29416, + "grad_norm": 1.091731432805627, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 29416 + }, + { + "epoch": 0.29417, + "grad_norm": 1.2279355011694542, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 29417 + }, + { + "epoch": 0.29418, + "grad_norm": 1.0364428282659894, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 29418 + }, + { + "epoch": 0.29419, + "grad_norm": 0.9712972652451504, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 29419 + }, + { + "epoch": 0.2942, + "grad_norm": 0.9781110015034683, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 29420 + }, + { + "epoch": 0.29421, + "grad_norm": 0.7647066938784287, + "learning_rate": 0.003, + "loss": 4.026, + "step": 29421 + }, + { + "epoch": 0.29422, + "grad_norm": 0.5502451154897172, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29422 + }, + { + "epoch": 0.29423, + "grad_norm": 0.6075951211892019, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 29423 + }, + { + "epoch": 0.29424, + "grad_norm": 0.6464675368552271, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 29424 + }, + { + "epoch": 0.29425, + "grad_norm": 0.6627867712028837, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 29425 + }, + { + "epoch": 0.29426, + "grad_norm": 0.6846854367081188, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 29426 + }, + { + "epoch": 0.29427, + "grad_norm": 0.7681286337363664, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 29427 + }, + { + "epoch": 0.29428, + "grad_norm": 1.0320443371701227, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 29428 + }, + { + "epoch": 0.29429, + "grad_norm": 1.1669986666989423, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29429 + }, + { + "epoch": 0.2943, + "grad_norm": 0.7351435156755731, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 29430 + }, + { + "epoch": 0.29431, + "grad_norm": 0.574233678938655, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 29431 + }, + { + "epoch": 0.29432, + "grad_norm": 0.5843227567201444, + "learning_rate": 0.003, + "loss": 4.019, + "step": 29432 + }, + { + "epoch": 0.29433, + "grad_norm": 0.6328376861445577, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29433 + }, + { + "epoch": 0.29434, + "grad_norm": 0.7474010091636214, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29434 + }, + { + "epoch": 0.29435, + "grad_norm": 0.8528713482052022, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 29435 + }, + { + "epoch": 0.29436, + "grad_norm": 0.8170653541615462, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 29436 + }, + { + "epoch": 0.29437, + "grad_norm": 0.7911538671197178, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 29437 + }, + { + "epoch": 0.29438, + "grad_norm": 0.8699160963102068, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 29438 + }, + { + "epoch": 0.29439, + "grad_norm": 0.9195430257919991, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 29439 + }, + { + "epoch": 0.2944, + "grad_norm": 0.8205631760201486, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 29440 + }, + { + "epoch": 0.29441, + "grad_norm": 0.771694743786183, + "learning_rate": 0.003, + "loss": 4.037, + "step": 29441 + }, + { + "epoch": 0.29442, + "grad_norm": 0.7973155023227254, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 29442 + }, + { + "epoch": 0.29443, + "grad_norm": 0.8943623151740967, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 29443 + }, + { + "epoch": 0.29444, + "grad_norm": 0.9399768421798038, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 29444 + }, + { + "epoch": 0.29445, + "grad_norm": 0.9923662002372997, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 29445 + }, + { + "epoch": 0.29446, + "grad_norm": 1.0405872044250775, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29446 + }, + { + "epoch": 0.29447, + "grad_norm": 1.0471919222228883, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 29447 + }, + { + "epoch": 0.29448, + "grad_norm": 0.846721521217031, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 29448 + }, + { + "epoch": 0.29449, + "grad_norm": 0.7551628261238632, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 29449 + }, + { + "epoch": 0.2945, + "grad_norm": 0.8234291243578339, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 29450 + }, + { + "epoch": 0.29451, + "grad_norm": 0.8695330127680944, + "learning_rate": 0.003, + "loss": 4.0857, + "step": 29451 + }, + { + "epoch": 0.29452, + "grad_norm": 0.9472059697091871, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 29452 + }, + { + "epoch": 0.29453, + "grad_norm": 1.1260293913305148, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 29453 + }, + { + "epoch": 0.29454, + "grad_norm": 1.040259203859668, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 29454 + }, + { + "epoch": 0.29455, + "grad_norm": 0.9312626050299808, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 29455 + }, + { + "epoch": 0.29456, + "grad_norm": 1.0108751514201586, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 29456 + }, + { + "epoch": 0.29457, + "grad_norm": 1.1150873160426766, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 29457 + }, + { + "epoch": 0.29458, + "grad_norm": 0.9282232771748145, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29458 + }, + { + "epoch": 0.29459, + "grad_norm": 0.810583068691548, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 29459 + }, + { + "epoch": 0.2946, + "grad_norm": 0.8181249956960069, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 29460 + }, + { + "epoch": 0.29461, + "grad_norm": 0.7990932822464166, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 29461 + }, + { + "epoch": 0.29462, + "grad_norm": 0.9409787560412682, + "learning_rate": 0.003, + "loss": 4.021, + "step": 29462 + }, + { + "epoch": 0.29463, + "grad_norm": 1.0878902263865704, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 29463 + }, + { + "epoch": 0.29464, + "grad_norm": 0.8516730948613528, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 29464 + }, + { + "epoch": 0.29465, + "grad_norm": 0.8710536265885893, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 29465 + }, + { + "epoch": 0.29466, + "grad_norm": 0.8915727174850789, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 29466 + }, + { + "epoch": 0.29467, + "grad_norm": 0.8705364756496882, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 29467 + }, + { + "epoch": 0.29468, + "grad_norm": 0.8304403580999421, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 29468 + }, + { + "epoch": 0.29469, + "grad_norm": 0.7785606132144904, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 29469 + }, + { + "epoch": 0.2947, + "grad_norm": 0.8008128384464949, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 29470 + }, + { + "epoch": 0.29471, + "grad_norm": 0.8130658489116273, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 29471 + }, + { + "epoch": 0.29472, + "grad_norm": 0.7643610907829264, + "learning_rate": 0.003, + "loss": 4.0048, + "step": 29472 + }, + { + "epoch": 0.29473, + "grad_norm": 0.7873625835649938, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 29473 + }, + { + "epoch": 0.29474, + "grad_norm": 0.9768499358461032, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 29474 + }, + { + "epoch": 0.29475, + "grad_norm": 1.0664920194339051, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 29475 + }, + { + "epoch": 0.29476, + "grad_norm": 0.8198234745307579, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 29476 + }, + { + "epoch": 0.29477, + "grad_norm": 0.6999768683716316, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 29477 + }, + { + "epoch": 0.29478, + "grad_norm": 0.6698242942409547, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 29478 + }, + { + "epoch": 0.29479, + "grad_norm": 0.6647049597418161, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 29479 + }, + { + "epoch": 0.2948, + "grad_norm": 0.6677754405325208, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 29480 + }, + { + "epoch": 0.29481, + "grad_norm": 0.6995143862658786, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 29481 + }, + { + "epoch": 0.29482, + "grad_norm": 0.794002711475159, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 29482 + }, + { + "epoch": 0.29483, + "grad_norm": 0.9434788602165429, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 29483 + }, + { + "epoch": 0.29484, + "grad_norm": 1.192686185090285, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 29484 + }, + { + "epoch": 0.29485, + "grad_norm": 0.8815242194906802, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 29485 + }, + { + "epoch": 0.29486, + "grad_norm": 0.7657186120971496, + "learning_rate": 0.003, + "loss": 4.021, + "step": 29486 + }, + { + "epoch": 0.29487, + "grad_norm": 0.764947774133333, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29487 + }, + { + "epoch": 0.29488, + "grad_norm": 0.7123030742452965, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 29488 + }, + { + "epoch": 0.29489, + "grad_norm": 0.6559101069447222, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 29489 + }, + { + "epoch": 0.2949, + "grad_norm": 0.5796461460955309, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 29490 + }, + { + "epoch": 0.29491, + "grad_norm": 0.6969441767256102, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 29491 + }, + { + "epoch": 0.29492, + "grad_norm": 0.6890163133223163, + "learning_rate": 0.003, + "loss": 4.008, + "step": 29492 + }, + { + "epoch": 0.29493, + "grad_norm": 0.807105101506135, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 29493 + }, + { + "epoch": 0.29494, + "grad_norm": 0.8991104524951704, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 29494 + }, + { + "epoch": 0.29495, + "grad_norm": 1.0338243230930393, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 29495 + }, + { + "epoch": 0.29496, + "grad_norm": 0.9642614448712948, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 29496 + }, + { + "epoch": 0.29497, + "grad_norm": 1.016237341079977, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 29497 + }, + { + "epoch": 0.29498, + "grad_norm": 0.9862010034761993, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 29498 + }, + { + "epoch": 0.29499, + "grad_norm": 0.9731648416861356, + "learning_rate": 0.003, + "loss": 4.021, + "step": 29499 + }, + { + "epoch": 0.295, + "grad_norm": 1.136399519482867, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 29500 + }, + { + "epoch": 0.29501, + "grad_norm": 0.942864178110427, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 29501 + }, + { + "epoch": 0.29502, + "grad_norm": 0.7623838200910764, + "learning_rate": 0.003, + "loss": 4.035, + "step": 29502 + }, + { + "epoch": 0.29503, + "grad_norm": 0.7160664731109688, + "learning_rate": 0.003, + "loss": 4.072, + "step": 29503 + }, + { + "epoch": 0.29504, + "grad_norm": 0.7421126285864131, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29504 + }, + { + "epoch": 0.29505, + "grad_norm": 0.7319073303619947, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 29505 + }, + { + "epoch": 0.29506, + "grad_norm": 0.7526278658149351, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 29506 + }, + { + "epoch": 0.29507, + "grad_norm": 0.8191453196250793, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 29507 + }, + { + "epoch": 0.29508, + "grad_norm": 1.0857025274887944, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 29508 + }, + { + "epoch": 0.29509, + "grad_norm": 1.2150819596816411, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 29509 + }, + { + "epoch": 0.2951, + "grad_norm": 0.8190918531372475, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 29510 + }, + { + "epoch": 0.29511, + "grad_norm": 0.77128110683923, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 29511 + }, + { + "epoch": 0.29512, + "grad_norm": 0.7612889148525646, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 29512 + }, + { + "epoch": 0.29513, + "grad_norm": 0.734835710813353, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 29513 + }, + { + "epoch": 0.29514, + "grad_norm": 0.7764659037603447, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 29514 + }, + { + "epoch": 0.29515, + "grad_norm": 0.9024195163548533, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 29515 + }, + { + "epoch": 0.29516, + "grad_norm": 0.987506889096575, + "learning_rate": 0.003, + "loss": 4.045, + "step": 29516 + }, + { + "epoch": 0.29517, + "grad_norm": 1.0641078262301504, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 29517 + }, + { + "epoch": 0.29518, + "grad_norm": 0.8511848700763053, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 29518 + }, + { + "epoch": 0.29519, + "grad_norm": 0.8649927794523451, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 29519 + }, + { + "epoch": 0.2952, + "grad_norm": 0.8756882864815512, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 29520 + }, + { + "epoch": 0.29521, + "grad_norm": 0.879019446662677, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 29521 + }, + { + "epoch": 0.29522, + "grad_norm": 0.9051351209559111, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 29522 + }, + { + "epoch": 0.29523, + "grad_norm": 0.9299109368632689, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 29523 + }, + { + "epoch": 0.29524, + "grad_norm": 0.9656001235730147, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 29524 + }, + { + "epoch": 0.29525, + "grad_norm": 1.0264509223988085, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29525 + }, + { + "epoch": 0.29526, + "grad_norm": 1.1808398670528966, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29526 + }, + { + "epoch": 0.29527, + "grad_norm": 0.8023245450306955, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 29527 + }, + { + "epoch": 0.29528, + "grad_norm": 0.7123463170048762, + "learning_rate": 0.003, + "loss": 4.06, + "step": 29528 + }, + { + "epoch": 0.29529, + "grad_norm": 0.7463955513675214, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 29529 + }, + { + "epoch": 0.2953, + "grad_norm": 0.9121723700323058, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 29530 + }, + { + "epoch": 0.29531, + "grad_norm": 1.1925961869679653, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 29531 + }, + { + "epoch": 0.29532, + "grad_norm": 0.762055476458954, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 29532 + }, + { + "epoch": 0.29533, + "grad_norm": 0.661944938134374, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 29533 + }, + { + "epoch": 0.29534, + "grad_norm": 0.575886306095579, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 29534 + }, + { + "epoch": 0.29535, + "grad_norm": 0.5472425238953568, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 29535 + }, + { + "epoch": 0.29536, + "grad_norm": 0.571062041663618, + "learning_rate": 0.003, + "loss": 3.9911, + "step": 29536 + }, + { + "epoch": 0.29537, + "grad_norm": 0.5921001628375261, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 29537 + }, + { + "epoch": 0.29538, + "grad_norm": 0.6551670610860285, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 29538 + }, + { + "epoch": 0.29539, + "grad_norm": 0.8374393810891719, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29539 + }, + { + "epoch": 0.2954, + "grad_norm": 1.0128539041783893, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 29540 + }, + { + "epoch": 0.29541, + "grad_norm": 1.1120568398715585, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 29541 + }, + { + "epoch": 0.29542, + "grad_norm": 0.7087566315141395, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 29542 + }, + { + "epoch": 0.29543, + "grad_norm": 0.6455174430036086, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 29543 + }, + { + "epoch": 0.29544, + "grad_norm": 0.730123729795281, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 29544 + }, + { + "epoch": 0.29545, + "grad_norm": 0.7695499029463173, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 29545 + }, + { + "epoch": 0.29546, + "grad_norm": 0.798957318269689, + "learning_rate": 0.003, + "loss": 4.034, + "step": 29546 + }, + { + "epoch": 0.29547, + "grad_norm": 0.8734950830632281, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 29547 + }, + { + "epoch": 0.29548, + "grad_norm": 0.9887024410143668, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 29548 + }, + { + "epoch": 0.29549, + "grad_norm": 1.1180277087182828, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 29549 + }, + { + "epoch": 0.2955, + "grad_norm": 0.7743161580059043, + "learning_rate": 0.003, + "loss": 4.072, + "step": 29550 + }, + { + "epoch": 0.29551, + "grad_norm": 0.645265233569196, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 29551 + }, + { + "epoch": 0.29552, + "grad_norm": 0.7195815093661289, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 29552 + }, + { + "epoch": 0.29553, + "grad_norm": 0.9345691445009946, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 29553 + }, + { + "epoch": 0.29554, + "grad_norm": 0.9851289588394291, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29554 + }, + { + "epoch": 0.29555, + "grad_norm": 0.8662162788211244, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 29555 + }, + { + "epoch": 0.29556, + "grad_norm": 1.0285263698127753, + "learning_rate": 0.003, + "loss": 4.052, + "step": 29556 + }, + { + "epoch": 0.29557, + "grad_norm": 0.8204976746681296, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 29557 + }, + { + "epoch": 0.29558, + "grad_norm": 0.7493046369735843, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 29558 + }, + { + "epoch": 0.29559, + "grad_norm": 0.7747280826427463, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 29559 + }, + { + "epoch": 0.2956, + "grad_norm": 0.8393597894478902, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 29560 + }, + { + "epoch": 0.29561, + "grad_norm": 0.9308004435293608, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 29561 + }, + { + "epoch": 0.29562, + "grad_norm": 0.9523914271200857, + "learning_rate": 0.003, + "loss": 4.054, + "step": 29562 + }, + { + "epoch": 0.29563, + "grad_norm": 0.9597397436633324, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29563 + }, + { + "epoch": 0.29564, + "grad_norm": 1.0260306861923776, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 29564 + }, + { + "epoch": 0.29565, + "grad_norm": 0.9137918689610293, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 29565 + }, + { + "epoch": 0.29566, + "grad_norm": 0.8630218318084596, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29566 + }, + { + "epoch": 0.29567, + "grad_norm": 0.9202201205435803, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 29567 + }, + { + "epoch": 0.29568, + "grad_norm": 0.9371638569610556, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 29568 + }, + { + "epoch": 0.29569, + "grad_norm": 1.0390643349735722, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 29569 + }, + { + "epoch": 0.2957, + "grad_norm": 1.1794776301400998, + "learning_rate": 0.003, + "loss": 4.0021, + "step": 29570 + }, + { + "epoch": 0.29571, + "grad_norm": 1.021249842684877, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 29571 + }, + { + "epoch": 0.29572, + "grad_norm": 1.0871462567103367, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 29572 + }, + { + "epoch": 0.29573, + "grad_norm": 1.0569978065315713, + "learning_rate": 0.003, + "loss": 4.048, + "step": 29573 + }, + { + "epoch": 0.29574, + "grad_norm": 0.926605531433748, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 29574 + }, + { + "epoch": 0.29575, + "grad_norm": 0.8473364328806434, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 29575 + }, + { + "epoch": 0.29576, + "grad_norm": 0.7467891036255305, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 29576 + }, + { + "epoch": 0.29577, + "grad_norm": 0.6888699641560188, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 29577 + }, + { + "epoch": 0.29578, + "grad_norm": 0.6611322621244413, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 29578 + }, + { + "epoch": 0.29579, + "grad_norm": 0.8588671885118704, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 29579 + }, + { + "epoch": 0.2958, + "grad_norm": 0.9365676143287244, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 29580 + }, + { + "epoch": 0.29581, + "grad_norm": 0.9759202804955065, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 29581 + }, + { + "epoch": 0.29582, + "grad_norm": 1.3194867649724762, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 29582 + }, + { + "epoch": 0.29583, + "grad_norm": 0.7109295383577998, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 29583 + }, + { + "epoch": 0.29584, + "grad_norm": 0.5887460743689457, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 29584 + }, + { + "epoch": 0.29585, + "grad_norm": 0.5219985540242765, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 29585 + }, + { + "epoch": 0.29586, + "grad_norm": 0.5503463955774311, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 29586 + }, + { + "epoch": 0.29587, + "grad_norm": 0.512112772950889, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29587 + }, + { + "epoch": 0.29588, + "grad_norm": 0.5399087861647854, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 29588 + }, + { + "epoch": 0.29589, + "grad_norm": 0.527214607148675, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 29589 + }, + { + "epoch": 0.2959, + "grad_norm": 0.5526671339848339, + "learning_rate": 0.003, + "loss": 3.998, + "step": 29590 + }, + { + "epoch": 0.29591, + "grad_norm": 0.566958697794794, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 29591 + }, + { + "epoch": 0.29592, + "grad_norm": 0.5832809965410394, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 29592 + }, + { + "epoch": 0.29593, + "grad_norm": 0.5750473992025271, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 29593 + }, + { + "epoch": 0.29594, + "grad_norm": 0.6220475341371999, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 29594 + }, + { + "epoch": 0.29595, + "grad_norm": 0.6728802995895984, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 29595 + }, + { + "epoch": 0.29596, + "grad_norm": 0.8310651183092241, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 29596 + }, + { + "epoch": 0.29597, + "grad_norm": 1.1012629577376085, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 29597 + }, + { + "epoch": 0.29598, + "grad_norm": 1.0535211463857392, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 29598 + }, + { + "epoch": 0.29599, + "grad_norm": 1.0212072643068095, + "learning_rate": 0.003, + "loss": 4.025, + "step": 29599 + }, + { + "epoch": 0.296, + "grad_norm": 1.1297903558209657, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 29600 + }, + { + "epoch": 0.29601, + "grad_norm": 0.8741276178791385, + "learning_rate": 0.003, + "loss": 4.072, + "step": 29601 + }, + { + "epoch": 0.29602, + "grad_norm": 0.9073951972046345, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 29602 + }, + { + "epoch": 0.29603, + "grad_norm": 0.8217460421860436, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 29603 + }, + { + "epoch": 0.29604, + "grad_norm": 0.7398378463036518, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 29604 + }, + { + "epoch": 0.29605, + "grad_norm": 0.7949452650483415, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 29605 + }, + { + "epoch": 0.29606, + "grad_norm": 0.7633065173124948, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 29606 + }, + { + "epoch": 0.29607, + "grad_norm": 0.7829140020877731, + "learning_rate": 0.003, + "loss": 4.023, + "step": 29607 + }, + { + "epoch": 0.29608, + "grad_norm": 0.7938670416348678, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 29608 + }, + { + "epoch": 0.29609, + "grad_norm": 0.8010392803898696, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 29609 + }, + { + "epoch": 0.2961, + "grad_norm": 0.9542051616240214, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29610 + }, + { + "epoch": 0.29611, + "grad_norm": 1.1626236340796248, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29611 + }, + { + "epoch": 0.29612, + "grad_norm": 0.8621593014942216, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29612 + }, + { + "epoch": 0.29613, + "grad_norm": 0.7819868846487147, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 29613 + }, + { + "epoch": 0.29614, + "grad_norm": 0.7932869759280725, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 29614 + }, + { + "epoch": 0.29615, + "grad_norm": 0.739249644562522, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 29615 + }, + { + "epoch": 0.29616, + "grad_norm": 0.7463235633943912, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 29616 + }, + { + "epoch": 0.29617, + "grad_norm": 0.6900108220946533, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29617 + }, + { + "epoch": 0.29618, + "grad_norm": 0.7575690106096097, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 29618 + }, + { + "epoch": 0.29619, + "grad_norm": 0.8397644291261247, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29619 + }, + { + "epoch": 0.2962, + "grad_norm": 0.9186349882590031, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 29620 + }, + { + "epoch": 0.29621, + "grad_norm": 1.0755308522755005, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 29621 + }, + { + "epoch": 0.29622, + "grad_norm": 1.1406148923172066, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 29622 + }, + { + "epoch": 0.29623, + "grad_norm": 0.9056012931374529, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 29623 + }, + { + "epoch": 0.29624, + "grad_norm": 0.8978287031207305, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 29624 + }, + { + "epoch": 0.29625, + "grad_norm": 0.8942775652892172, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 29625 + }, + { + "epoch": 0.29626, + "grad_norm": 0.8478470372000767, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 29626 + }, + { + "epoch": 0.29627, + "grad_norm": 0.8882974986771165, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 29627 + }, + { + "epoch": 0.29628, + "grad_norm": 1.0318189510158946, + "learning_rate": 0.003, + "loss": 4.043, + "step": 29628 + }, + { + "epoch": 0.29629, + "grad_norm": 0.9701998439781265, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 29629 + }, + { + "epoch": 0.2963, + "grad_norm": 1.0086135975062058, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 29630 + }, + { + "epoch": 0.29631, + "grad_norm": 1.0456520112074126, + "learning_rate": 0.003, + "loss": 4.0785, + "step": 29631 + }, + { + "epoch": 0.29632, + "grad_norm": 1.122121201150599, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 29632 + }, + { + "epoch": 0.29633, + "grad_norm": 0.8964923787783627, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29633 + }, + { + "epoch": 0.29634, + "grad_norm": 0.8502426772406847, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 29634 + }, + { + "epoch": 0.29635, + "grad_norm": 0.925780586940106, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 29635 + }, + { + "epoch": 0.29636, + "grad_norm": 1.0108082754823764, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 29636 + }, + { + "epoch": 0.29637, + "grad_norm": 1.152125571785256, + "learning_rate": 0.003, + "loss": 4.063, + "step": 29637 + }, + { + "epoch": 0.29638, + "grad_norm": 1.039164345116775, + "learning_rate": 0.003, + "loss": 4.053, + "step": 29638 + }, + { + "epoch": 0.29639, + "grad_norm": 0.955630958968111, + "learning_rate": 0.003, + "loss": 4.072, + "step": 29639 + }, + { + "epoch": 0.2964, + "grad_norm": 0.9457602089450506, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 29640 + }, + { + "epoch": 0.29641, + "grad_norm": 1.0631782794572842, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 29641 + }, + { + "epoch": 0.29642, + "grad_norm": 0.9835046624981021, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29642 + }, + { + "epoch": 0.29643, + "grad_norm": 0.8803066089019086, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 29643 + }, + { + "epoch": 0.29644, + "grad_norm": 0.8602293082260116, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 29644 + }, + { + "epoch": 0.29645, + "grad_norm": 0.8418993402895806, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 29645 + }, + { + "epoch": 0.29646, + "grad_norm": 0.7526301468369274, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 29646 + }, + { + "epoch": 0.29647, + "grad_norm": 0.829762735849243, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 29647 + }, + { + "epoch": 0.29648, + "grad_norm": 0.9371174195527552, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 29648 + }, + { + "epoch": 0.29649, + "grad_norm": 1.009484719954905, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 29649 + }, + { + "epoch": 0.2965, + "grad_norm": 0.9406803127506687, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 29650 + }, + { + "epoch": 0.29651, + "grad_norm": 0.8824418327237745, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 29651 + }, + { + "epoch": 0.29652, + "grad_norm": 0.7726539682395782, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 29652 + }, + { + "epoch": 0.29653, + "grad_norm": 0.6614874259634501, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 29653 + }, + { + "epoch": 0.29654, + "grad_norm": 0.5928611140891672, + "learning_rate": 0.003, + "loss": 3.9935, + "step": 29654 + }, + { + "epoch": 0.29655, + "grad_norm": 0.6241851822305493, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 29655 + }, + { + "epoch": 0.29656, + "grad_norm": 0.5681418179543076, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 29656 + }, + { + "epoch": 0.29657, + "grad_norm": 0.5364055499223117, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 29657 + }, + { + "epoch": 0.29658, + "grad_norm": 0.6370679706561624, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 29658 + }, + { + "epoch": 0.29659, + "grad_norm": 0.7268044673307017, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 29659 + }, + { + "epoch": 0.2966, + "grad_norm": 0.877045654676319, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29660 + }, + { + "epoch": 0.29661, + "grad_norm": 1.1645647816576228, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29661 + }, + { + "epoch": 0.29662, + "grad_norm": 1.1408081566642048, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 29662 + }, + { + "epoch": 0.29663, + "grad_norm": 0.8514601043194153, + "learning_rate": 0.003, + "loss": 4.045, + "step": 29663 + }, + { + "epoch": 0.29664, + "grad_norm": 0.7343051062249578, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 29664 + }, + { + "epoch": 0.29665, + "grad_norm": 0.6913645925867937, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 29665 + }, + { + "epoch": 0.29666, + "grad_norm": 0.7321238508139923, + "learning_rate": 0.003, + "loss": 4.047, + "step": 29666 + }, + { + "epoch": 0.29667, + "grad_norm": 0.6982298018406868, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 29667 + }, + { + "epoch": 0.29668, + "grad_norm": 0.7136694579138156, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 29668 + }, + { + "epoch": 0.29669, + "grad_norm": 0.6637359125409228, + "learning_rate": 0.003, + "loss": 4.045, + "step": 29669 + }, + { + "epoch": 0.2967, + "grad_norm": 0.6883563983666308, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29670 + }, + { + "epoch": 0.29671, + "grad_norm": 0.7136494169718673, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29671 + }, + { + "epoch": 0.29672, + "grad_norm": 0.7947545752396443, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 29672 + }, + { + "epoch": 0.29673, + "grad_norm": 0.7154622820917592, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 29673 + }, + { + "epoch": 0.29674, + "grad_norm": 0.7607427849331586, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 29674 + }, + { + "epoch": 0.29675, + "grad_norm": 0.8763483258759387, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 29675 + }, + { + "epoch": 0.29676, + "grad_norm": 1.2380102699713451, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 29676 + }, + { + "epoch": 0.29677, + "grad_norm": 0.7851799758030761, + "learning_rate": 0.003, + "loss": 4.023, + "step": 29677 + }, + { + "epoch": 0.29678, + "grad_norm": 0.6055775967261222, + "learning_rate": 0.003, + "loss": 3.9948, + "step": 29678 + }, + { + "epoch": 0.29679, + "grad_norm": 0.6331845486836994, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 29679 + }, + { + "epoch": 0.2968, + "grad_norm": 0.7430470795238474, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 29680 + }, + { + "epoch": 0.29681, + "grad_norm": 0.7624076017508807, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 29681 + }, + { + "epoch": 0.29682, + "grad_norm": 0.61983155564648, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 29682 + }, + { + "epoch": 0.29683, + "grad_norm": 0.6328813671032738, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 29683 + }, + { + "epoch": 0.29684, + "grad_norm": 0.6576067957676136, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 29684 + }, + { + "epoch": 0.29685, + "grad_norm": 0.8308586745323387, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 29685 + }, + { + "epoch": 0.29686, + "grad_norm": 1.0608540377470554, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 29686 + }, + { + "epoch": 0.29687, + "grad_norm": 1.1731593338542414, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 29687 + }, + { + "epoch": 0.29688, + "grad_norm": 0.8705715812090735, + "learning_rate": 0.003, + "loss": 4.03, + "step": 29688 + }, + { + "epoch": 0.29689, + "grad_norm": 0.8875891068838223, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 29689 + }, + { + "epoch": 0.2969, + "grad_norm": 0.9099328437632714, + "learning_rate": 0.003, + "loss": 4.033, + "step": 29690 + }, + { + "epoch": 0.29691, + "grad_norm": 0.9163390680700281, + "learning_rate": 0.003, + "loss": 4.039, + "step": 29691 + }, + { + "epoch": 0.29692, + "grad_norm": 1.1507738729704846, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29692 + }, + { + "epoch": 0.29693, + "grad_norm": 1.0673164314393269, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 29693 + }, + { + "epoch": 0.29694, + "grad_norm": 0.971512945628094, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 29694 + }, + { + "epoch": 0.29695, + "grad_norm": 1.109622578772811, + "learning_rate": 0.003, + "loss": 4.054, + "step": 29695 + }, + { + "epoch": 0.29696, + "grad_norm": 0.9342161206080827, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 29696 + }, + { + "epoch": 0.29697, + "grad_norm": 0.9385147288680579, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 29697 + }, + { + "epoch": 0.29698, + "grad_norm": 0.8946363528941353, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 29698 + }, + { + "epoch": 0.29699, + "grad_norm": 0.820235416769547, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 29699 + }, + { + "epoch": 0.297, + "grad_norm": 0.79023674161317, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 29700 + }, + { + "epoch": 0.29701, + "grad_norm": 0.8408153240941739, + "learning_rate": 0.003, + "loss": 4.057, + "step": 29701 + }, + { + "epoch": 0.29702, + "grad_norm": 0.9916062068131645, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 29702 + }, + { + "epoch": 0.29703, + "grad_norm": 1.2658707477408067, + "learning_rate": 0.003, + "loss": 4.0829, + "step": 29703 + }, + { + "epoch": 0.29704, + "grad_norm": 0.8569388924458262, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 29704 + }, + { + "epoch": 0.29705, + "grad_norm": 0.8709509263416317, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 29705 + }, + { + "epoch": 0.29706, + "grad_norm": 0.9274583416516791, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 29706 + }, + { + "epoch": 0.29707, + "grad_norm": 0.9953752983054379, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 29707 + }, + { + "epoch": 0.29708, + "grad_norm": 1.0154630634355237, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 29708 + }, + { + "epoch": 0.29709, + "grad_norm": 1.0324954467148522, + "learning_rate": 0.003, + "loss": 4.0796, + "step": 29709 + }, + { + "epoch": 0.2971, + "grad_norm": 0.9814300142787983, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 29710 + }, + { + "epoch": 0.29711, + "grad_norm": 0.9308114907413595, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 29711 + }, + { + "epoch": 0.29712, + "grad_norm": 0.8404837084850487, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 29712 + }, + { + "epoch": 0.29713, + "grad_norm": 0.8147618442565345, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 29713 + }, + { + "epoch": 0.29714, + "grad_norm": 0.7916539372622257, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29714 + }, + { + "epoch": 0.29715, + "grad_norm": 0.8010260484088131, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 29715 + }, + { + "epoch": 0.29716, + "grad_norm": 0.7744813003608569, + "learning_rate": 0.003, + "loss": 4.069, + "step": 29716 + }, + { + "epoch": 0.29717, + "grad_norm": 0.6615541597781733, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 29717 + }, + { + "epoch": 0.29718, + "grad_norm": 0.5891056875590933, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 29718 + }, + { + "epoch": 0.29719, + "grad_norm": 0.6109715601698277, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29719 + }, + { + "epoch": 0.2972, + "grad_norm": 0.6719435882520997, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29720 + }, + { + "epoch": 0.29721, + "grad_norm": 0.7399503064204112, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 29721 + }, + { + "epoch": 0.29722, + "grad_norm": 0.8881285731177666, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 29722 + }, + { + "epoch": 0.29723, + "grad_norm": 1.2063009855350686, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 29723 + }, + { + "epoch": 0.29724, + "grad_norm": 1.0564501177997567, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 29724 + }, + { + "epoch": 0.29725, + "grad_norm": 0.7714319217486869, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 29725 + }, + { + "epoch": 0.29726, + "grad_norm": 0.6979303292405438, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 29726 + }, + { + "epoch": 0.29727, + "grad_norm": 0.776882368729348, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 29727 + }, + { + "epoch": 0.29728, + "grad_norm": 0.8735100440246475, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 29728 + }, + { + "epoch": 0.29729, + "grad_norm": 0.884608600652728, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 29729 + }, + { + "epoch": 0.2973, + "grad_norm": 0.7983200596192327, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 29730 + }, + { + "epoch": 0.29731, + "grad_norm": 0.7306407924064392, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 29731 + }, + { + "epoch": 0.29732, + "grad_norm": 0.7587832174618953, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 29732 + }, + { + "epoch": 0.29733, + "grad_norm": 0.8929159716862467, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 29733 + }, + { + "epoch": 0.29734, + "grad_norm": 1.0030335108818575, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 29734 + }, + { + "epoch": 0.29735, + "grad_norm": 1.1285104505245875, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 29735 + }, + { + "epoch": 0.29736, + "grad_norm": 0.9399870544681409, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 29736 + }, + { + "epoch": 0.29737, + "grad_norm": 0.8652065996898423, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 29737 + }, + { + "epoch": 0.29738, + "grad_norm": 0.8237999093397274, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 29738 + }, + { + "epoch": 0.29739, + "grad_norm": 0.7739115685825558, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 29739 + }, + { + "epoch": 0.2974, + "grad_norm": 0.7212454050258579, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 29740 + }, + { + "epoch": 0.29741, + "grad_norm": 0.7450060867478219, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 29741 + }, + { + "epoch": 0.29742, + "grad_norm": 0.7510429669549749, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 29742 + }, + { + "epoch": 0.29743, + "grad_norm": 0.7324361765145662, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 29743 + }, + { + "epoch": 0.29744, + "grad_norm": 0.7823156763480396, + "learning_rate": 0.003, + "loss": 4.058, + "step": 29744 + }, + { + "epoch": 0.29745, + "grad_norm": 1.0275229981848362, + "learning_rate": 0.003, + "loss": 4.039, + "step": 29745 + }, + { + "epoch": 0.29746, + "grad_norm": 1.1153082681017585, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 29746 + }, + { + "epoch": 0.29747, + "grad_norm": 1.08832744829383, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 29747 + }, + { + "epoch": 0.29748, + "grad_norm": 1.0440713005434106, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 29748 + }, + { + "epoch": 0.29749, + "grad_norm": 0.9321793036595923, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 29749 + }, + { + "epoch": 0.2975, + "grad_norm": 0.8829255908689442, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 29750 + }, + { + "epoch": 0.29751, + "grad_norm": 0.8967871059237603, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 29751 + }, + { + "epoch": 0.29752, + "grad_norm": 0.753552348299697, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 29752 + }, + { + "epoch": 0.29753, + "grad_norm": 0.6062601552451126, + "learning_rate": 0.003, + "loss": 4.036, + "step": 29753 + }, + { + "epoch": 0.29754, + "grad_norm": 0.6339606128262646, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 29754 + }, + { + "epoch": 0.29755, + "grad_norm": 0.7230749238072391, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 29755 + }, + { + "epoch": 0.29756, + "grad_norm": 0.7657519147767696, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 29756 + }, + { + "epoch": 0.29757, + "grad_norm": 0.7666752713901773, + "learning_rate": 0.003, + "loss": 4.014, + "step": 29757 + }, + { + "epoch": 0.29758, + "grad_norm": 0.8096820214067828, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 29758 + }, + { + "epoch": 0.29759, + "grad_norm": 0.8480195564404693, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 29759 + }, + { + "epoch": 0.2976, + "grad_norm": 0.8755426884377995, + "learning_rate": 0.003, + "loss": 4.068, + "step": 29760 + }, + { + "epoch": 0.29761, + "grad_norm": 0.9025945835045597, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 29761 + }, + { + "epoch": 0.29762, + "grad_norm": 0.893742559643703, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 29762 + }, + { + "epoch": 0.29763, + "grad_norm": 0.7638140155680376, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 29763 + }, + { + "epoch": 0.29764, + "grad_norm": 0.6696205205087604, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 29764 + }, + { + "epoch": 0.29765, + "grad_norm": 0.6402113653291301, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 29765 + }, + { + "epoch": 0.29766, + "grad_norm": 0.6889426009158199, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 29766 + }, + { + "epoch": 0.29767, + "grad_norm": 0.7717369554544314, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 29767 + }, + { + "epoch": 0.29768, + "grad_norm": 0.936639939699774, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 29768 + }, + { + "epoch": 0.29769, + "grad_norm": 1.221681340990684, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 29769 + }, + { + "epoch": 0.2977, + "grad_norm": 1.0265064931081562, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29770 + }, + { + "epoch": 0.29771, + "grad_norm": 1.0054701270809359, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29771 + }, + { + "epoch": 0.29772, + "grad_norm": 1.002019850211728, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 29772 + }, + { + "epoch": 0.29773, + "grad_norm": 1.0561698708408154, + "learning_rate": 0.003, + "loss": 4.055, + "step": 29773 + }, + { + "epoch": 0.29774, + "grad_norm": 1.1316651053086748, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 29774 + }, + { + "epoch": 0.29775, + "grad_norm": 0.8368753114257649, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 29775 + }, + { + "epoch": 0.29776, + "grad_norm": 0.8055798512404035, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 29776 + }, + { + "epoch": 0.29777, + "grad_norm": 0.6899477198209971, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 29777 + }, + { + "epoch": 0.29778, + "grad_norm": 0.6523284086009201, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 29778 + }, + { + "epoch": 0.29779, + "grad_norm": 0.7239401618591073, + "learning_rate": 0.003, + "loss": 4.05, + "step": 29779 + }, + { + "epoch": 0.2978, + "grad_norm": 0.9664884900967454, + "learning_rate": 0.003, + "loss": 4.026, + "step": 29780 + }, + { + "epoch": 0.29781, + "grad_norm": 1.234128046077109, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 29781 + }, + { + "epoch": 0.29782, + "grad_norm": 0.6854677600327224, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 29782 + }, + { + "epoch": 0.29783, + "grad_norm": 0.6821932066191768, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 29783 + }, + { + "epoch": 0.29784, + "grad_norm": 0.7334846343839895, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 29784 + }, + { + "epoch": 0.29785, + "grad_norm": 0.8613948084461409, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 29785 + }, + { + "epoch": 0.29786, + "grad_norm": 0.985413525952653, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 29786 + }, + { + "epoch": 0.29787, + "grad_norm": 0.9268528271207865, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29787 + }, + { + "epoch": 0.29788, + "grad_norm": 0.8233064234641196, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 29788 + }, + { + "epoch": 0.29789, + "grad_norm": 0.6614358515098475, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 29789 + }, + { + "epoch": 0.2979, + "grad_norm": 0.5946823989392257, + "learning_rate": 0.003, + "loss": 3.9966, + "step": 29790 + }, + { + "epoch": 0.29791, + "grad_norm": 0.6311183531736815, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 29791 + }, + { + "epoch": 0.29792, + "grad_norm": 0.59783935842209, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 29792 + }, + { + "epoch": 0.29793, + "grad_norm": 0.6020949066879727, + "learning_rate": 0.003, + "loss": 4.031, + "step": 29793 + }, + { + "epoch": 0.29794, + "grad_norm": 0.7097508471024112, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 29794 + }, + { + "epoch": 0.29795, + "grad_norm": 0.7379849122948166, + "learning_rate": 0.003, + "loss": 4.075, + "step": 29795 + }, + { + "epoch": 0.29796, + "grad_norm": 0.670652202591052, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 29796 + }, + { + "epoch": 0.29797, + "grad_norm": 0.7749702787846947, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 29797 + }, + { + "epoch": 0.29798, + "grad_norm": 1.063582357180149, + "learning_rate": 0.003, + "loss": 4.016, + "step": 29798 + }, + { + "epoch": 0.29799, + "grad_norm": 1.292542836392054, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 29799 + }, + { + "epoch": 0.298, + "grad_norm": 0.9577123938686618, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 29800 + }, + { + "epoch": 0.29801, + "grad_norm": 0.8765144754295019, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 29801 + }, + { + "epoch": 0.29802, + "grad_norm": 0.8599844172808233, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 29802 + }, + { + "epoch": 0.29803, + "grad_norm": 0.8152042706777664, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 29803 + }, + { + "epoch": 0.29804, + "grad_norm": 0.8856433268858597, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 29804 + }, + { + "epoch": 0.29805, + "grad_norm": 0.8435965168644539, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 29805 + }, + { + "epoch": 0.29806, + "grad_norm": 0.8959856743660108, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 29806 + }, + { + "epoch": 0.29807, + "grad_norm": 1.0808917490144245, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 29807 + }, + { + "epoch": 0.29808, + "grad_norm": 1.1442659220970852, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 29808 + }, + { + "epoch": 0.29809, + "grad_norm": 0.8071600642590813, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 29809 + }, + { + "epoch": 0.2981, + "grad_norm": 0.7152612517749717, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 29810 + }, + { + "epoch": 0.29811, + "grad_norm": 0.6785983090888845, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 29811 + }, + { + "epoch": 0.29812, + "grad_norm": 0.7761444871738625, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 29812 + }, + { + "epoch": 0.29813, + "grad_norm": 0.8941966045957692, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 29813 + }, + { + "epoch": 0.29814, + "grad_norm": 0.9566636395768782, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 29814 + }, + { + "epoch": 0.29815, + "grad_norm": 0.8949460660896343, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 29815 + }, + { + "epoch": 0.29816, + "grad_norm": 0.8807018634127046, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 29816 + }, + { + "epoch": 0.29817, + "grad_norm": 0.9091484631748709, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 29817 + }, + { + "epoch": 0.29818, + "grad_norm": 0.8323951210594167, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 29818 + }, + { + "epoch": 0.29819, + "grad_norm": 0.78892133536652, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 29819 + }, + { + "epoch": 0.2982, + "grad_norm": 0.9387053370219199, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 29820 + }, + { + "epoch": 0.29821, + "grad_norm": 1.0921604932230289, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 29821 + }, + { + "epoch": 0.29822, + "grad_norm": 1.1616808463752424, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 29822 + }, + { + "epoch": 0.29823, + "grad_norm": 1.094287442163826, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 29823 + }, + { + "epoch": 0.29824, + "grad_norm": 0.920875056610523, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 29824 + }, + { + "epoch": 0.29825, + "grad_norm": 0.8919604620100988, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 29825 + }, + { + "epoch": 0.29826, + "grad_norm": 0.8167006147210444, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 29826 + }, + { + "epoch": 0.29827, + "grad_norm": 0.7773772443523552, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 29827 + }, + { + "epoch": 0.29828, + "grad_norm": 0.7473722054195873, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 29828 + }, + { + "epoch": 0.29829, + "grad_norm": 0.7392400462911206, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 29829 + }, + { + "epoch": 0.2983, + "grad_norm": 0.7231372275007345, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 29830 + }, + { + "epoch": 0.29831, + "grad_norm": 0.7199135791670837, + "learning_rate": 0.003, + "loss": 4.04, + "step": 29831 + }, + { + "epoch": 0.29832, + "grad_norm": 0.6646203814122272, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 29832 + }, + { + "epoch": 0.29833, + "grad_norm": 0.6631942943293005, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 29833 + }, + { + "epoch": 0.29834, + "grad_norm": 0.6975576911842695, + "learning_rate": 0.003, + "loss": 4.017, + "step": 29834 + }, + { + "epoch": 0.29835, + "grad_norm": 0.7875529936579908, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 29835 + }, + { + "epoch": 0.29836, + "grad_norm": 0.9741851736856987, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 29836 + }, + { + "epoch": 0.29837, + "grad_norm": 1.2564165716035691, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 29837 + }, + { + "epoch": 0.29838, + "grad_norm": 0.8070417678265821, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 29838 + }, + { + "epoch": 0.29839, + "grad_norm": 0.804747374111468, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 29839 + }, + { + "epoch": 0.2984, + "grad_norm": 0.8284540291126762, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 29840 + }, + { + "epoch": 0.29841, + "grad_norm": 0.8879353465508759, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 29841 + }, + { + "epoch": 0.29842, + "grad_norm": 0.9637084195746578, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29842 + }, + { + "epoch": 0.29843, + "grad_norm": 1.1791934203295928, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 29843 + }, + { + "epoch": 0.29844, + "grad_norm": 0.9962279758395227, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 29844 + }, + { + "epoch": 0.29845, + "grad_norm": 0.9919586575413125, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 29845 + }, + { + "epoch": 0.29846, + "grad_norm": 0.8406675494918255, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 29846 + }, + { + "epoch": 0.29847, + "grad_norm": 0.8924895216014683, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 29847 + }, + { + "epoch": 0.29848, + "grad_norm": 0.8365651684846891, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 29848 + }, + { + "epoch": 0.29849, + "grad_norm": 0.7551172971270768, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 29849 + }, + { + "epoch": 0.2985, + "grad_norm": 0.7054970422772004, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 29850 + }, + { + "epoch": 0.29851, + "grad_norm": 0.7740689053334437, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 29851 + }, + { + "epoch": 0.29852, + "grad_norm": 0.8214511195633418, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 29852 + }, + { + "epoch": 0.29853, + "grad_norm": 0.8886683379826029, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 29853 + }, + { + "epoch": 0.29854, + "grad_norm": 0.890682058220869, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 29854 + }, + { + "epoch": 0.29855, + "grad_norm": 0.878638618349842, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 29855 + }, + { + "epoch": 0.29856, + "grad_norm": 0.8556718474217165, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 29856 + }, + { + "epoch": 0.29857, + "grad_norm": 0.7865331634679883, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29857 + }, + { + "epoch": 0.29858, + "grad_norm": 0.6969737706676679, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 29858 + }, + { + "epoch": 0.29859, + "grad_norm": 0.7176812968847659, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 29859 + }, + { + "epoch": 0.2986, + "grad_norm": 0.7675759177330358, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 29860 + }, + { + "epoch": 0.29861, + "grad_norm": 0.8460140861729517, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 29861 + }, + { + "epoch": 0.29862, + "grad_norm": 0.9167122648219999, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 29862 + }, + { + "epoch": 0.29863, + "grad_norm": 0.9762692650823959, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 29863 + }, + { + "epoch": 0.29864, + "grad_norm": 0.9376137328640115, + "learning_rate": 0.003, + "loss": 4.047, + "step": 29864 + }, + { + "epoch": 0.29865, + "grad_norm": 0.8126793268141304, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 29865 + }, + { + "epoch": 0.29866, + "grad_norm": 0.8916374748439551, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 29866 + }, + { + "epoch": 0.29867, + "grad_norm": 1.0591532666196584, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 29867 + }, + { + "epoch": 0.29868, + "grad_norm": 0.8566645971235454, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 29868 + }, + { + "epoch": 0.29869, + "grad_norm": 0.9214805970588473, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 29869 + }, + { + "epoch": 0.2987, + "grad_norm": 0.9768597323913967, + "learning_rate": 0.003, + "loss": 4.049, + "step": 29870 + }, + { + "epoch": 0.29871, + "grad_norm": 1.2148625421734747, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 29871 + }, + { + "epoch": 0.29872, + "grad_norm": 0.8861620808704939, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 29872 + }, + { + "epoch": 0.29873, + "grad_norm": 1.0495765569360282, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 29873 + }, + { + "epoch": 0.29874, + "grad_norm": 1.2499065934797244, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 29874 + }, + { + "epoch": 0.29875, + "grad_norm": 0.8983482008505086, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 29875 + }, + { + "epoch": 0.29876, + "grad_norm": 0.7463817159816309, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 29876 + }, + { + "epoch": 0.29877, + "grad_norm": 0.7265966741137838, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 29877 + }, + { + "epoch": 0.29878, + "grad_norm": 0.7652893105424238, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 29878 + }, + { + "epoch": 0.29879, + "grad_norm": 0.7406606801530248, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 29879 + }, + { + "epoch": 0.2988, + "grad_norm": 0.7154487724549269, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 29880 + }, + { + "epoch": 0.29881, + "grad_norm": 0.6241907660807905, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 29881 + }, + { + "epoch": 0.29882, + "grad_norm": 0.579905717645663, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 29882 + }, + { + "epoch": 0.29883, + "grad_norm": 0.7079610043041767, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 29883 + }, + { + "epoch": 0.29884, + "grad_norm": 0.7990126907752872, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 29884 + }, + { + "epoch": 0.29885, + "grad_norm": 0.7710028849610616, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 29885 + }, + { + "epoch": 0.29886, + "grad_norm": 0.8454477443145967, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 29886 + }, + { + "epoch": 0.29887, + "grad_norm": 0.963456315302519, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 29887 + }, + { + "epoch": 0.29888, + "grad_norm": 1.1026455706808131, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 29888 + }, + { + "epoch": 0.29889, + "grad_norm": 0.9625173267258381, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29889 + }, + { + "epoch": 0.2989, + "grad_norm": 0.8687887031763422, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 29890 + }, + { + "epoch": 0.29891, + "grad_norm": 0.8327960752553026, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 29891 + }, + { + "epoch": 0.29892, + "grad_norm": 0.858909363576543, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 29892 + }, + { + "epoch": 0.29893, + "grad_norm": 0.9295240799826819, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 29893 + }, + { + "epoch": 0.29894, + "grad_norm": 0.9531787993857671, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 29894 + }, + { + "epoch": 0.29895, + "grad_norm": 0.866574533435851, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 29895 + }, + { + "epoch": 0.29896, + "grad_norm": 0.7907549954130869, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 29896 + }, + { + "epoch": 0.29897, + "grad_norm": 0.7368963353290511, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29897 + }, + { + "epoch": 0.29898, + "grad_norm": 0.7499534276875517, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 29898 + }, + { + "epoch": 0.29899, + "grad_norm": 0.7496316512911848, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 29899 + }, + { + "epoch": 0.299, + "grad_norm": 0.6656815949009053, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 29900 + }, + { + "epoch": 0.29901, + "grad_norm": 0.5736562769253083, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 29901 + }, + { + "epoch": 0.29902, + "grad_norm": 0.6805079792796459, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 29902 + }, + { + "epoch": 0.29903, + "grad_norm": 0.6792499144842693, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 29903 + }, + { + "epoch": 0.29904, + "grad_norm": 0.7372448998185561, + "learning_rate": 0.003, + "loss": 4.026, + "step": 29904 + }, + { + "epoch": 0.29905, + "grad_norm": 0.6928200517307427, + "learning_rate": 0.003, + "loss": 4.043, + "step": 29905 + }, + { + "epoch": 0.29906, + "grad_norm": 0.634728372704992, + "learning_rate": 0.003, + "loss": 4.042, + "step": 29906 + }, + { + "epoch": 0.29907, + "grad_norm": 0.6884889133685475, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 29907 + }, + { + "epoch": 0.29908, + "grad_norm": 1.0646283360060134, + "learning_rate": 0.003, + "loss": 4.029, + "step": 29908 + }, + { + "epoch": 0.29909, + "grad_norm": 1.3348607803611459, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 29909 + }, + { + "epoch": 0.2991, + "grad_norm": 0.8486129291997292, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 29910 + }, + { + "epoch": 0.29911, + "grad_norm": 0.8465249503174112, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 29911 + }, + { + "epoch": 0.29912, + "grad_norm": 0.752963220887893, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 29912 + }, + { + "epoch": 0.29913, + "grad_norm": 0.8543463648778149, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 29913 + }, + { + "epoch": 0.29914, + "grad_norm": 0.9941153775002424, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 29914 + }, + { + "epoch": 0.29915, + "grad_norm": 1.0720961998067813, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 29915 + }, + { + "epoch": 0.29916, + "grad_norm": 0.9556205503100462, + "learning_rate": 0.003, + "loss": 3.9936, + "step": 29916 + }, + { + "epoch": 0.29917, + "grad_norm": 1.0700270465746613, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 29917 + }, + { + "epoch": 0.29918, + "grad_norm": 0.9735479859008586, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 29918 + }, + { + "epoch": 0.29919, + "grad_norm": 1.0367887442185864, + "learning_rate": 0.003, + "loss": 4.0942, + "step": 29919 + }, + { + "epoch": 0.2992, + "grad_norm": 1.0273658836334438, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 29920 + }, + { + "epoch": 0.29921, + "grad_norm": 0.979507571105691, + "learning_rate": 0.003, + "loss": 4.059, + "step": 29921 + }, + { + "epoch": 0.29922, + "grad_norm": 0.9635805317346876, + "learning_rate": 0.003, + "loss": 4.054, + "step": 29922 + }, + { + "epoch": 0.29923, + "grad_norm": 0.9385501536787636, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 29923 + }, + { + "epoch": 0.29924, + "grad_norm": 1.0714777942234008, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 29924 + }, + { + "epoch": 0.29925, + "grad_norm": 0.9782301712148836, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 29925 + }, + { + "epoch": 0.29926, + "grad_norm": 1.0704170947388072, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 29926 + }, + { + "epoch": 0.29927, + "grad_norm": 1.1082807110806554, + "learning_rate": 0.003, + "loss": 4.042, + "step": 29927 + }, + { + "epoch": 0.29928, + "grad_norm": 0.9662183686340001, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 29928 + }, + { + "epoch": 0.29929, + "grad_norm": 0.9828129847523678, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 29929 + }, + { + "epoch": 0.2993, + "grad_norm": 1.0141080316087163, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 29930 + }, + { + "epoch": 0.29931, + "grad_norm": 1.0603941127250183, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 29931 + }, + { + "epoch": 0.29932, + "grad_norm": 0.9385482367956242, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 29932 + }, + { + "epoch": 0.29933, + "grad_norm": 0.9495843704594316, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 29933 + }, + { + "epoch": 0.29934, + "grad_norm": 0.7621890274308901, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 29934 + }, + { + "epoch": 0.29935, + "grad_norm": 0.670148891910428, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 29935 + }, + { + "epoch": 0.29936, + "grad_norm": 0.6918341289448267, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 29936 + }, + { + "epoch": 0.29937, + "grad_norm": 0.7300500019729539, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 29937 + }, + { + "epoch": 0.29938, + "grad_norm": 0.7996003949546515, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 29938 + }, + { + "epoch": 0.29939, + "grad_norm": 0.9009504036121483, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 29939 + }, + { + "epoch": 0.2994, + "grad_norm": 0.8967614383509817, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 29940 + }, + { + "epoch": 0.29941, + "grad_norm": 0.7486581461859741, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 29941 + }, + { + "epoch": 0.29942, + "grad_norm": 0.5436960414793199, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 29942 + }, + { + "epoch": 0.29943, + "grad_norm": 0.5433321346212, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 29943 + }, + { + "epoch": 0.29944, + "grad_norm": 0.6040885457355956, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 29944 + }, + { + "epoch": 0.29945, + "grad_norm": 0.6307588023026995, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 29945 + }, + { + "epoch": 0.29946, + "grad_norm": 0.8505343185913659, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 29946 + }, + { + "epoch": 0.29947, + "grad_norm": 1.0491865569042322, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 29947 + }, + { + "epoch": 0.29948, + "grad_norm": 1.0088624339509338, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 29948 + }, + { + "epoch": 0.29949, + "grad_norm": 0.9793667573466214, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 29949 + }, + { + "epoch": 0.2995, + "grad_norm": 0.9532289106721306, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 29950 + }, + { + "epoch": 0.29951, + "grad_norm": 0.8423690843277375, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 29951 + }, + { + "epoch": 0.29952, + "grad_norm": 0.8127863827334167, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 29952 + }, + { + "epoch": 0.29953, + "grad_norm": 0.7224316627746021, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 29953 + }, + { + "epoch": 0.29954, + "grad_norm": 0.7349568619746814, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 29954 + }, + { + "epoch": 0.29955, + "grad_norm": 0.7351471009496151, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 29955 + }, + { + "epoch": 0.29956, + "grad_norm": 0.9009173971834606, + "learning_rate": 0.003, + "loss": 4.035, + "step": 29956 + }, + { + "epoch": 0.29957, + "grad_norm": 1.0389833900958094, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 29957 + }, + { + "epoch": 0.29958, + "grad_norm": 1.1435333394755678, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 29958 + }, + { + "epoch": 0.29959, + "grad_norm": 0.8518454227039677, + "learning_rate": 0.003, + "loss": 4.044, + "step": 29959 + }, + { + "epoch": 0.2996, + "grad_norm": 0.7615978861368772, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 29960 + }, + { + "epoch": 0.29961, + "grad_norm": 0.7716144330057941, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 29961 + }, + { + "epoch": 0.29962, + "grad_norm": 0.731413426793074, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 29962 + }, + { + "epoch": 0.29963, + "grad_norm": 0.7031853443533272, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 29963 + }, + { + "epoch": 0.29964, + "grad_norm": 0.7450221290428228, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 29964 + }, + { + "epoch": 0.29965, + "grad_norm": 0.8064980273704023, + "learning_rate": 0.003, + "loss": 4.039, + "step": 29965 + }, + { + "epoch": 0.29966, + "grad_norm": 0.7292102000814651, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 29966 + }, + { + "epoch": 0.29967, + "grad_norm": 0.7208088492121769, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 29967 + }, + { + "epoch": 0.29968, + "grad_norm": 0.6357198280191361, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 29968 + }, + { + "epoch": 0.29969, + "grad_norm": 0.5511699932749419, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 29969 + }, + { + "epoch": 0.2997, + "grad_norm": 0.5153403811395589, + "learning_rate": 0.003, + "loss": 4.059, + "step": 29970 + }, + { + "epoch": 0.29971, + "grad_norm": 0.5528168609732497, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 29971 + }, + { + "epoch": 0.29972, + "grad_norm": 0.716017557646997, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 29972 + }, + { + "epoch": 0.29973, + "grad_norm": 1.0034460219563752, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 29973 + }, + { + "epoch": 0.29974, + "grad_norm": 1.4411631734064885, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 29974 + }, + { + "epoch": 0.29975, + "grad_norm": 0.6570003140002783, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 29975 + }, + { + "epoch": 0.29976, + "grad_norm": 0.8045359237880182, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 29976 + }, + { + "epoch": 0.29977, + "grad_norm": 0.8432241108520697, + "learning_rate": 0.003, + "loss": 4.02, + "step": 29977 + }, + { + "epoch": 0.29978, + "grad_norm": 0.8720170248075246, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 29978 + }, + { + "epoch": 0.29979, + "grad_norm": 0.9519789925672366, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 29979 + }, + { + "epoch": 0.2998, + "grad_norm": 0.9440240167604893, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 29980 + }, + { + "epoch": 0.29981, + "grad_norm": 1.0088462072322921, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 29981 + }, + { + "epoch": 0.29982, + "grad_norm": 0.9533193437388763, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 29982 + }, + { + "epoch": 0.29983, + "grad_norm": 0.843125832374633, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 29983 + }, + { + "epoch": 0.29984, + "grad_norm": 0.7335017039971817, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 29984 + }, + { + "epoch": 0.29985, + "grad_norm": 0.708453652750939, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 29985 + }, + { + "epoch": 0.29986, + "grad_norm": 0.697392187517962, + "learning_rate": 0.003, + "loss": 4.047, + "step": 29986 + }, + { + "epoch": 0.29987, + "grad_norm": 0.8052542261238657, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 29987 + }, + { + "epoch": 0.29988, + "grad_norm": 0.8563985358895021, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 29988 + }, + { + "epoch": 0.29989, + "grad_norm": 1.0290446469115981, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 29989 + }, + { + "epoch": 0.2999, + "grad_norm": 1.0788640441323787, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 29990 + }, + { + "epoch": 0.29991, + "grad_norm": 0.7971295929192805, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 29991 + }, + { + "epoch": 0.29992, + "grad_norm": 0.7549997091463418, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 29992 + }, + { + "epoch": 0.29993, + "grad_norm": 0.8524769593365019, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 29993 + }, + { + "epoch": 0.29994, + "grad_norm": 0.8710290558718577, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 29994 + }, + { + "epoch": 0.29995, + "grad_norm": 0.8760789689982463, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 29995 + }, + { + "epoch": 0.29996, + "grad_norm": 0.9096928474588732, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 29996 + }, + { + "epoch": 0.29997, + "grad_norm": 1.013286824238396, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 29997 + }, + { + "epoch": 0.29998, + "grad_norm": 0.9934113186633183, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 29998 + }, + { + "epoch": 0.29999, + "grad_norm": 0.9104835937467876, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 29999 + }, + { + "epoch": 0.3, + "grad_norm": 0.7609670676207376, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 30000 + }, + { + "epoch": 0.30001, + "grad_norm": 0.6768705519191272, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 30001 + }, + { + "epoch": 0.30002, + "grad_norm": 0.7216189318349105, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 30002 + }, + { + "epoch": 0.30003, + "grad_norm": 0.9337982162119006, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 30003 + }, + { + "epoch": 0.30004, + "grad_norm": 1.1581049278023678, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 30004 + }, + { + "epoch": 0.30005, + "grad_norm": 1.1340406382539954, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 30005 + }, + { + "epoch": 0.30006, + "grad_norm": 0.9686000265289199, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 30006 + }, + { + "epoch": 0.30007, + "grad_norm": 0.865684303729371, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 30007 + }, + { + "epoch": 0.30008, + "grad_norm": 0.8801501423520437, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 30008 + }, + { + "epoch": 0.30009, + "grad_norm": 0.8907726531299617, + "learning_rate": 0.003, + "loss": 4.046, + "step": 30009 + }, + { + "epoch": 0.3001, + "grad_norm": 0.9057286780090243, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 30010 + }, + { + "epoch": 0.30011, + "grad_norm": 0.9722334373874111, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 30011 + }, + { + "epoch": 0.30012, + "grad_norm": 1.146644988575812, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 30012 + }, + { + "epoch": 0.30013, + "grad_norm": 1.0382624693738378, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 30013 + }, + { + "epoch": 0.30014, + "grad_norm": 0.9624670762005234, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 30014 + }, + { + "epoch": 0.30015, + "grad_norm": 0.9467457493940971, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 30015 + }, + { + "epoch": 0.30016, + "grad_norm": 0.9637719141145777, + "learning_rate": 0.003, + "loss": 4.0895, + "step": 30016 + }, + { + "epoch": 0.30017, + "grad_norm": 0.8037783126999698, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 30017 + }, + { + "epoch": 0.30018, + "grad_norm": 0.6689509918660139, + "learning_rate": 0.003, + "loss": 4.059, + "step": 30018 + }, + { + "epoch": 0.30019, + "grad_norm": 0.6859254473787996, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 30019 + }, + { + "epoch": 0.3002, + "grad_norm": 0.6623547301896632, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 30020 + }, + { + "epoch": 0.30021, + "grad_norm": 0.7579636175985185, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 30021 + }, + { + "epoch": 0.30022, + "grad_norm": 0.9090252729828222, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 30022 + }, + { + "epoch": 0.30023, + "grad_norm": 0.8618629308045245, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 30023 + }, + { + "epoch": 0.30024, + "grad_norm": 0.7750880138993158, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 30024 + }, + { + "epoch": 0.30025, + "grad_norm": 0.7043935545754563, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 30025 + }, + { + "epoch": 0.30026, + "grad_norm": 0.5462093703042328, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30026 + }, + { + "epoch": 0.30027, + "grad_norm": 0.579464940457767, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 30027 + }, + { + "epoch": 0.30028, + "grad_norm": 0.5619803956517676, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 30028 + }, + { + "epoch": 0.30029, + "grad_norm": 0.683528488963088, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 30029 + }, + { + "epoch": 0.3003, + "grad_norm": 0.9005339668751994, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30030 + }, + { + "epoch": 0.30031, + "grad_norm": 1.043034063549366, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 30031 + }, + { + "epoch": 0.30032, + "grad_norm": 1.1044415073978013, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 30032 + }, + { + "epoch": 0.30033, + "grad_norm": 0.8515047874466991, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 30033 + }, + { + "epoch": 0.30034, + "grad_norm": 0.8079167024001153, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 30034 + }, + { + "epoch": 0.30035, + "grad_norm": 0.8223760540839545, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 30035 + }, + { + "epoch": 0.30036, + "grad_norm": 0.8243023067115568, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 30036 + }, + { + "epoch": 0.30037, + "grad_norm": 0.9364052224680736, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 30037 + }, + { + "epoch": 0.30038, + "grad_norm": 0.9859726026801926, + "learning_rate": 0.003, + "loss": 4.063, + "step": 30038 + }, + { + "epoch": 0.30039, + "grad_norm": 0.8828738888731624, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 30039 + }, + { + "epoch": 0.3004, + "grad_norm": 0.9749440159725465, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 30040 + }, + { + "epoch": 0.30041, + "grad_norm": 1.0180256552220674, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 30041 + }, + { + "epoch": 0.30042, + "grad_norm": 1.052450809911558, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30042 + }, + { + "epoch": 0.30043, + "grad_norm": 1.0022914870255615, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 30043 + }, + { + "epoch": 0.30044, + "grad_norm": 1.0583587673789436, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 30044 + }, + { + "epoch": 0.30045, + "grad_norm": 0.8797160954048673, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 30045 + }, + { + "epoch": 0.30046, + "grad_norm": 0.9060523224018905, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 30046 + }, + { + "epoch": 0.30047, + "grad_norm": 0.921091882592109, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 30047 + }, + { + "epoch": 0.30048, + "grad_norm": 0.8249637446581346, + "learning_rate": 0.003, + "loss": 4.075, + "step": 30048 + }, + { + "epoch": 0.30049, + "grad_norm": 0.6975417003859311, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 30049 + }, + { + "epoch": 0.3005, + "grad_norm": 0.7535937565190751, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 30050 + }, + { + "epoch": 0.30051, + "grad_norm": 0.8743596691093144, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 30051 + }, + { + "epoch": 0.30052, + "grad_norm": 0.9364444064689101, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30052 + }, + { + "epoch": 0.30053, + "grad_norm": 0.9282894316145495, + "learning_rate": 0.003, + "loss": 4.047, + "step": 30053 + }, + { + "epoch": 0.30054, + "grad_norm": 0.9510638857677045, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 30054 + }, + { + "epoch": 0.30055, + "grad_norm": 0.8523663704086185, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 30055 + }, + { + "epoch": 0.30056, + "grad_norm": 0.7916935711883694, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 30056 + }, + { + "epoch": 0.30057, + "grad_norm": 0.8037230615048561, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30057 + }, + { + "epoch": 0.30058, + "grad_norm": 0.8615380437480775, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 30058 + }, + { + "epoch": 0.30059, + "grad_norm": 0.9911226831859725, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 30059 + }, + { + "epoch": 0.3006, + "grad_norm": 0.9722316069959992, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 30060 + }, + { + "epoch": 0.30061, + "grad_norm": 1.0000912636679056, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30061 + }, + { + "epoch": 0.30062, + "grad_norm": 0.9951263532939444, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 30062 + }, + { + "epoch": 0.30063, + "grad_norm": 0.9800900643328759, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30063 + }, + { + "epoch": 0.30064, + "grad_norm": 0.918794953737637, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 30064 + }, + { + "epoch": 0.30065, + "grad_norm": 0.7944156457114094, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 30065 + }, + { + "epoch": 0.30066, + "grad_norm": 0.7729617448687248, + "learning_rate": 0.003, + "loss": 4.078, + "step": 30066 + }, + { + "epoch": 0.30067, + "grad_norm": 0.7213423191874977, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 30067 + }, + { + "epoch": 0.30068, + "grad_norm": 0.6726827151221186, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 30068 + }, + { + "epoch": 0.30069, + "grad_norm": 0.6141110949903028, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 30069 + }, + { + "epoch": 0.3007, + "grad_norm": 0.7229540208822735, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 30070 + }, + { + "epoch": 0.30071, + "grad_norm": 0.8984671497992637, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 30071 + }, + { + "epoch": 0.30072, + "grad_norm": 1.1410116392740988, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 30072 + }, + { + "epoch": 0.30073, + "grad_norm": 0.8385558605503886, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 30073 + }, + { + "epoch": 0.30074, + "grad_norm": 0.7145560590076628, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 30074 + }, + { + "epoch": 0.30075, + "grad_norm": 0.6364971444796947, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 30075 + }, + { + "epoch": 0.30076, + "grad_norm": 0.6066845628687378, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 30076 + }, + { + "epoch": 0.30077, + "grad_norm": 0.5408152101135294, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 30077 + }, + { + "epoch": 0.30078, + "grad_norm": 0.5071662683503553, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 30078 + }, + { + "epoch": 0.30079, + "grad_norm": 0.5615718819642903, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30079 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6540390855159591, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30080 + }, + { + "epoch": 0.30081, + "grad_norm": 0.8398230736417415, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 30081 + }, + { + "epoch": 0.30082, + "grad_norm": 1.0588878078917954, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30082 + }, + { + "epoch": 0.30083, + "grad_norm": 1.0024313744365674, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 30083 + }, + { + "epoch": 0.30084, + "grad_norm": 0.864016386433271, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 30084 + }, + { + "epoch": 0.30085, + "grad_norm": 0.7840385847514313, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 30085 + }, + { + "epoch": 0.30086, + "grad_norm": 0.8237240552770042, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 30086 + }, + { + "epoch": 0.30087, + "grad_norm": 0.79118066110452, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 30087 + }, + { + "epoch": 0.30088, + "grad_norm": 0.7528140503604116, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 30088 + }, + { + "epoch": 0.30089, + "grad_norm": 0.8292117519622972, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30089 + }, + { + "epoch": 0.3009, + "grad_norm": 0.8863435333810046, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 30090 + }, + { + "epoch": 0.30091, + "grad_norm": 0.9316574911148339, + "learning_rate": 0.003, + "loss": 4.065, + "step": 30091 + }, + { + "epoch": 0.30092, + "grad_norm": 0.913764352276469, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 30092 + }, + { + "epoch": 0.30093, + "grad_norm": 0.8692387064950584, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 30093 + }, + { + "epoch": 0.30094, + "grad_norm": 0.970055132569459, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 30094 + }, + { + "epoch": 0.30095, + "grad_norm": 1.0863418822648088, + "learning_rate": 0.003, + "loss": 4.034, + "step": 30095 + }, + { + "epoch": 0.30096, + "grad_norm": 1.0232357345895409, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 30096 + }, + { + "epoch": 0.30097, + "grad_norm": 0.8713014856312566, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 30097 + }, + { + "epoch": 0.30098, + "grad_norm": 0.8981817705233588, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 30098 + }, + { + "epoch": 0.30099, + "grad_norm": 1.0820389679329667, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 30099 + }, + { + "epoch": 0.301, + "grad_norm": 1.0762350415117965, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 30100 + }, + { + "epoch": 0.30101, + "grad_norm": 0.9973509471455523, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 30101 + }, + { + "epoch": 0.30102, + "grad_norm": 1.1506205516157868, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 30102 + }, + { + "epoch": 0.30103, + "grad_norm": 0.9283518604922325, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 30103 + }, + { + "epoch": 0.30104, + "grad_norm": 0.9400148061894855, + "learning_rate": 0.003, + "loss": 4.052, + "step": 30104 + }, + { + "epoch": 0.30105, + "grad_norm": 1.0016286824167613, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 30105 + }, + { + "epoch": 0.30106, + "grad_norm": 1.1977642133734585, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 30106 + }, + { + "epoch": 0.30107, + "grad_norm": 0.8362791462005004, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30107 + }, + { + "epoch": 0.30108, + "grad_norm": 0.7907209557495114, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 30108 + }, + { + "epoch": 0.30109, + "grad_norm": 0.6190337375909825, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 30109 + }, + { + "epoch": 0.3011, + "grad_norm": 0.6450194041014916, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 30110 + }, + { + "epoch": 0.30111, + "grad_norm": 0.6587941590689612, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 30111 + }, + { + "epoch": 0.30112, + "grad_norm": 0.7835331809286532, + "learning_rate": 0.003, + "loss": 4.034, + "step": 30112 + }, + { + "epoch": 0.30113, + "grad_norm": 0.8973886184932217, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 30113 + }, + { + "epoch": 0.30114, + "grad_norm": 0.9794868722981024, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 30114 + }, + { + "epoch": 0.30115, + "grad_norm": 0.9573664396017678, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 30115 + }, + { + "epoch": 0.30116, + "grad_norm": 0.8013979782937286, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30116 + }, + { + "epoch": 0.30117, + "grad_norm": 0.746306562452, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 30117 + }, + { + "epoch": 0.30118, + "grad_norm": 0.7345993729588842, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 30118 + }, + { + "epoch": 0.30119, + "grad_norm": 0.6604456319685357, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30119 + }, + { + "epoch": 0.3012, + "grad_norm": 0.6078471200141904, + "learning_rate": 0.003, + "loss": 4.064, + "step": 30120 + }, + { + "epoch": 0.30121, + "grad_norm": 0.6048428354305295, + "learning_rate": 0.003, + "loss": 4.038, + "step": 30121 + }, + { + "epoch": 0.30122, + "grad_norm": 0.5751053806824694, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 30122 + }, + { + "epoch": 0.30123, + "grad_norm": 0.5825817751715745, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 30123 + }, + { + "epoch": 0.30124, + "grad_norm": 0.6554904891482146, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 30124 + }, + { + "epoch": 0.30125, + "grad_norm": 0.850841628446038, + "learning_rate": 0.003, + "loss": 4.029, + "step": 30125 + }, + { + "epoch": 0.30126, + "grad_norm": 1.1698886941979998, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 30126 + }, + { + "epoch": 0.30127, + "grad_norm": 0.806427010350725, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 30127 + }, + { + "epoch": 0.30128, + "grad_norm": 0.6618090482210965, + "learning_rate": 0.003, + "loss": 4.051, + "step": 30128 + }, + { + "epoch": 0.30129, + "grad_norm": 0.6811450009370348, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 30129 + }, + { + "epoch": 0.3013, + "grad_norm": 0.7127479194248231, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 30130 + }, + { + "epoch": 0.30131, + "grad_norm": 0.7401118890625202, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 30131 + }, + { + "epoch": 0.30132, + "grad_norm": 0.7366464220984607, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 30132 + }, + { + "epoch": 0.30133, + "grad_norm": 0.8846841795565, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30133 + }, + { + "epoch": 0.30134, + "grad_norm": 1.207185552503816, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 30134 + }, + { + "epoch": 0.30135, + "grad_norm": 0.7622856521279255, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 30135 + }, + { + "epoch": 0.30136, + "grad_norm": 0.5716734051600992, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 30136 + }, + { + "epoch": 0.30137, + "grad_norm": 0.729130944485432, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 30137 + }, + { + "epoch": 0.30138, + "grad_norm": 0.8494605358873717, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 30138 + }, + { + "epoch": 0.30139, + "grad_norm": 1.0416513872681878, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 30139 + }, + { + "epoch": 0.3014, + "grad_norm": 1.0420772417366526, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 30140 + }, + { + "epoch": 0.30141, + "grad_norm": 1.0008164486738604, + "learning_rate": 0.003, + "loss": 4.025, + "step": 30141 + }, + { + "epoch": 0.30142, + "grad_norm": 0.8860170151281823, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 30142 + }, + { + "epoch": 0.30143, + "grad_norm": 0.8499583971903932, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 30143 + }, + { + "epoch": 0.30144, + "grad_norm": 0.8726028574911808, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 30144 + }, + { + "epoch": 0.30145, + "grad_norm": 0.9528791970936563, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 30145 + }, + { + "epoch": 0.30146, + "grad_norm": 0.9923317792665013, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 30146 + }, + { + "epoch": 0.30147, + "grad_norm": 1.1399615488088233, + "learning_rate": 0.003, + "loss": 4.0821, + "step": 30147 + }, + { + "epoch": 0.30148, + "grad_norm": 1.015608599080371, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 30148 + }, + { + "epoch": 0.30149, + "grad_norm": 1.1761185165508, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 30149 + }, + { + "epoch": 0.3015, + "grad_norm": 1.095759960703571, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 30150 + }, + { + "epoch": 0.30151, + "grad_norm": 0.7492310058232573, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 30151 + }, + { + "epoch": 0.30152, + "grad_norm": 0.7508516214228315, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 30152 + }, + { + "epoch": 0.30153, + "grad_norm": 0.73165040518065, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 30153 + }, + { + "epoch": 0.30154, + "grad_norm": 0.8101928026200462, + "learning_rate": 0.003, + "loss": 4.041, + "step": 30154 + }, + { + "epoch": 0.30155, + "grad_norm": 0.9078439647495635, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 30155 + }, + { + "epoch": 0.30156, + "grad_norm": 1.0113830363444647, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 30156 + }, + { + "epoch": 0.30157, + "grad_norm": 1.1460778397011255, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30157 + }, + { + "epoch": 0.30158, + "grad_norm": 0.9390622507499444, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 30158 + }, + { + "epoch": 0.30159, + "grad_norm": 0.8884700711819042, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 30159 + }, + { + "epoch": 0.3016, + "grad_norm": 0.8129163416528855, + "learning_rate": 0.003, + "loss": 4.071, + "step": 30160 + }, + { + "epoch": 0.30161, + "grad_norm": 0.7179065489472536, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 30161 + }, + { + "epoch": 0.30162, + "grad_norm": 0.6229724760344895, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 30162 + }, + { + "epoch": 0.30163, + "grad_norm": 0.5872493987647046, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 30163 + }, + { + "epoch": 0.30164, + "grad_norm": 0.5262716817864452, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 30164 + }, + { + "epoch": 0.30165, + "grad_norm": 0.48924516695168063, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 30165 + }, + { + "epoch": 0.30166, + "grad_norm": 0.5219311979934258, + "learning_rate": 0.003, + "loss": 4.0, + "step": 30166 + }, + { + "epoch": 0.30167, + "grad_norm": 0.617220437680268, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30167 + }, + { + "epoch": 0.30168, + "grad_norm": 0.6731686921783068, + "learning_rate": 0.003, + "loss": 4.043, + "step": 30168 + }, + { + "epoch": 0.30169, + "grad_norm": 0.8028200768431429, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 30169 + }, + { + "epoch": 0.3017, + "grad_norm": 0.9927710316218432, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 30170 + }, + { + "epoch": 0.30171, + "grad_norm": 1.230435455188699, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 30171 + }, + { + "epoch": 0.30172, + "grad_norm": 0.6468184113836593, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 30172 + }, + { + "epoch": 0.30173, + "grad_norm": 0.6914570176515393, + "learning_rate": 0.003, + "loss": 4.033, + "step": 30173 + }, + { + "epoch": 0.30174, + "grad_norm": 0.7440930304004294, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 30174 + }, + { + "epoch": 0.30175, + "grad_norm": 0.7684478765390758, + "learning_rate": 0.003, + "loss": 4.0845, + "step": 30175 + }, + { + "epoch": 0.30176, + "grad_norm": 0.8473078011691901, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 30176 + }, + { + "epoch": 0.30177, + "grad_norm": 0.9998850605661346, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 30177 + }, + { + "epoch": 0.30178, + "grad_norm": 1.191118895961893, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 30178 + }, + { + "epoch": 0.30179, + "grad_norm": 0.9319372224088357, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 30179 + }, + { + "epoch": 0.3018, + "grad_norm": 0.9241402466680498, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 30180 + }, + { + "epoch": 0.30181, + "grad_norm": 1.0082617174558068, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 30181 + }, + { + "epoch": 0.30182, + "grad_norm": 0.9268322466053224, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 30182 + }, + { + "epoch": 0.30183, + "grad_norm": 0.8054305846010326, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 30183 + }, + { + "epoch": 0.30184, + "grad_norm": 0.8776615775735316, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 30184 + }, + { + "epoch": 0.30185, + "grad_norm": 0.8567159995745438, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 30185 + }, + { + "epoch": 0.30186, + "grad_norm": 0.7864827193211007, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 30186 + }, + { + "epoch": 0.30187, + "grad_norm": 0.7074574022526259, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 30187 + }, + { + "epoch": 0.30188, + "grad_norm": 0.8796303585088163, + "learning_rate": 0.003, + "loss": 4.0843, + "step": 30188 + }, + { + "epoch": 0.30189, + "grad_norm": 1.084594708203222, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 30189 + }, + { + "epoch": 0.3019, + "grad_norm": 1.0803267897508766, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 30190 + }, + { + "epoch": 0.30191, + "grad_norm": 1.0133599223194938, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 30191 + }, + { + "epoch": 0.30192, + "grad_norm": 0.9459721852381003, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 30192 + }, + { + "epoch": 0.30193, + "grad_norm": 1.0030039629693577, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 30193 + }, + { + "epoch": 0.30194, + "grad_norm": 0.9647096275252048, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30194 + }, + { + "epoch": 0.30195, + "grad_norm": 0.9134975042400836, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 30195 + }, + { + "epoch": 0.30196, + "grad_norm": 0.8296765586270167, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 30196 + }, + { + "epoch": 0.30197, + "grad_norm": 0.7268598319664543, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 30197 + }, + { + "epoch": 0.30198, + "grad_norm": 0.7576457102845, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 30198 + }, + { + "epoch": 0.30199, + "grad_norm": 0.7236578858880327, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30199 + }, + { + "epoch": 0.302, + "grad_norm": 0.7807662772913275, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 30200 + }, + { + "epoch": 0.30201, + "grad_norm": 0.8157959707903321, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 30201 + }, + { + "epoch": 0.30202, + "grad_norm": 0.8565347842947642, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 30202 + }, + { + "epoch": 0.30203, + "grad_norm": 0.8254922441594126, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 30203 + }, + { + "epoch": 0.30204, + "grad_norm": 1.0395414295896102, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 30204 + }, + { + "epoch": 0.30205, + "grad_norm": 1.1346354252092652, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 30205 + }, + { + "epoch": 0.30206, + "grad_norm": 1.0048613163711353, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 30206 + }, + { + "epoch": 0.30207, + "grad_norm": 0.8947779257328616, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 30207 + }, + { + "epoch": 0.30208, + "grad_norm": 0.7789926533717361, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 30208 + }, + { + "epoch": 0.30209, + "grad_norm": 0.6294075332404504, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 30209 + }, + { + "epoch": 0.3021, + "grad_norm": 0.5794887657185035, + "learning_rate": 0.003, + "loss": 4.039, + "step": 30210 + }, + { + "epoch": 0.30211, + "grad_norm": 0.6878575425938104, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 30211 + }, + { + "epoch": 0.30212, + "grad_norm": 0.7904747368103345, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 30212 + }, + { + "epoch": 0.30213, + "grad_norm": 0.8423104315246172, + "learning_rate": 0.003, + "loss": 4.018, + "step": 30213 + }, + { + "epoch": 0.30214, + "grad_norm": 0.9181942290641042, + "learning_rate": 0.003, + "loss": 4.045, + "step": 30214 + }, + { + "epoch": 0.30215, + "grad_norm": 0.9280790786166967, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 30215 + }, + { + "epoch": 0.30216, + "grad_norm": 0.8861612461425382, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 30216 + }, + { + "epoch": 0.30217, + "grad_norm": 0.9560385597711841, + "learning_rate": 0.003, + "loss": 4.026, + "step": 30217 + }, + { + "epoch": 0.30218, + "grad_norm": 0.9675741604916318, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 30218 + }, + { + "epoch": 0.30219, + "grad_norm": 0.829072723069058, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 30219 + }, + { + "epoch": 0.3022, + "grad_norm": 0.6917333307009174, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 30220 + }, + { + "epoch": 0.30221, + "grad_norm": 0.801743159567868, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30221 + }, + { + "epoch": 0.30222, + "grad_norm": 0.9904544803647845, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 30222 + }, + { + "epoch": 0.30223, + "grad_norm": 1.2567400902459158, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 30223 + }, + { + "epoch": 0.30224, + "grad_norm": 0.6580466809910713, + "learning_rate": 0.003, + "loss": 4.023, + "step": 30224 + }, + { + "epoch": 0.30225, + "grad_norm": 0.6552494141731057, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 30225 + }, + { + "epoch": 0.30226, + "grad_norm": 0.7268933074144787, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 30226 + }, + { + "epoch": 0.30227, + "grad_norm": 0.7533267192358415, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 30227 + }, + { + "epoch": 0.30228, + "grad_norm": 0.8663240625366962, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 30228 + }, + { + "epoch": 0.30229, + "grad_norm": 0.9616932705896533, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 30229 + }, + { + "epoch": 0.3023, + "grad_norm": 0.9851997579955184, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 30230 + }, + { + "epoch": 0.30231, + "grad_norm": 0.8770988413238499, + "learning_rate": 0.003, + "loss": 3.9938, + "step": 30231 + }, + { + "epoch": 0.30232, + "grad_norm": 0.777840995872639, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 30232 + }, + { + "epoch": 0.30233, + "grad_norm": 0.7853122651488179, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 30233 + }, + { + "epoch": 0.30234, + "grad_norm": 0.8677222722718249, + "learning_rate": 0.003, + "loss": 4.061, + "step": 30234 + }, + { + "epoch": 0.30235, + "grad_norm": 0.9088437875486106, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 30235 + }, + { + "epoch": 0.30236, + "grad_norm": 0.9795015108727622, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 30236 + }, + { + "epoch": 0.30237, + "grad_norm": 1.0431021719652733, + "learning_rate": 0.003, + "loss": 4.07, + "step": 30237 + }, + { + "epoch": 0.30238, + "grad_norm": 0.9567088999423756, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 30238 + }, + { + "epoch": 0.30239, + "grad_norm": 0.8719577168680183, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 30239 + }, + { + "epoch": 0.3024, + "grad_norm": 0.8847054409272194, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30240 + }, + { + "epoch": 0.30241, + "grad_norm": 1.0324779281681393, + "learning_rate": 0.003, + "loss": 4.057, + "step": 30241 + }, + { + "epoch": 0.30242, + "grad_norm": 1.1174717366240166, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 30242 + }, + { + "epoch": 0.30243, + "grad_norm": 0.8548772551325193, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 30243 + }, + { + "epoch": 0.30244, + "grad_norm": 0.8716835927874, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 30244 + }, + { + "epoch": 0.30245, + "grad_norm": 0.8870638937515534, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 30245 + }, + { + "epoch": 0.30246, + "grad_norm": 0.9281777253711491, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 30246 + }, + { + "epoch": 0.30247, + "grad_norm": 1.1480474558640383, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 30247 + }, + { + "epoch": 0.30248, + "grad_norm": 0.8556284493919245, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 30248 + }, + { + "epoch": 0.30249, + "grad_norm": 0.7762358545881688, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 30249 + }, + { + "epoch": 0.3025, + "grad_norm": 0.7889072140931954, + "learning_rate": 0.003, + "loss": 4.031, + "step": 30250 + }, + { + "epoch": 0.30251, + "grad_norm": 0.6634415407956741, + "learning_rate": 0.003, + "loss": 4.022, + "step": 30251 + }, + { + "epoch": 0.30252, + "grad_norm": 0.6716827740077547, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 30252 + }, + { + "epoch": 0.30253, + "grad_norm": 0.6564279248289177, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 30253 + }, + { + "epoch": 0.30254, + "grad_norm": 0.6843073766519795, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 30254 + }, + { + "epoch": 0.30255, + "grad_norm": 0.9708243470474383, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 30255 + }, + { + "epoch": 0.30256, + "grad_norm": 1.1856892877141294, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 30256 + }, + { + "epoch": 0.30257, + "grad_norm": 0.7047268858673291, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 30257 + }, + { + "epoch": 0.30258, + "grad_norm": 0.6217357136422378, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 30258 + }, + { + "epoch": 0.30259, + "grad_norm": 0.6661141812941753, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 30259 + }, + { + "epoch": 0.3026, + "grad_norm": 0.8288771426003788, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 30260 + }, + { + "epoch": 0.30261, + "grad_norm": 0.9389334705134942, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 30261 + }, + { + "epoch": 0.30262, + "grad_norm": 0.8923514026387495, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 30262 + }, + { + "epoch": 0.30263, + "grad_norm": 0.8378552060658956, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 30263 + }, + { + "epoch": 0.30264, + "grad_norm": 0.8286896357732173, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 30264 + }, + { + "epoch": 0.30265, + "grad_norm": 0.7736714578059609, + "learning_rate": 0.003, + "loss": 4.0865, + "step": 30265 + }, + { + "epoch": 0.30266, + "grad_norm": 0.7027885308545315, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 30266 + }, + { + "epoch": 0.30267, + "grad_norm": 0.7572522819183456, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 30267 + }, + { + "epoch": 0.30268, + "grad_norm": 0.8583655452138667, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 30268 + }, + { + "epoch": 0.30269, + "grad_norm": 0.9257866844793068, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 30269 + }, + { + "epoch": 0.3027, + "grad_norm": 0.9587327236254054, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 30270 + }, + { + "epoch": 0.30271, + "grad_norm": 0.9973885176791025, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 30271 + }, + { + "epoch": 0.30272, + "grad_norm": 1.1154447798842144, + "learning_rate": 0.003, + "loss": 4.095, + "step": 30272 + }, + { + "epoch": 0.30273, + "grad_norm": 0.8938248111648088, + "learning_rate": 0.003, + "loss": 4.058, + "step": 30273 + }, + { + "epoch": 0.30274, + "grad_norm": 0.7974154927094851, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 30274 + }, + { + "epoch": 0.30275, + "grad_norm": 0.7646063148274, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 30275 + }, + { + "epoch": 0.30276, + "grad_norm": 0.8578301505446296, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 30276 + }, + { + "epoch": 0.30277, + "grad_norm": 0.965487826416326, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 30277 + }, + { + "epoch": 0.30278, + "grad_norm": 0.9761680390046865, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 30278 + }, + { + "epoch": 0.30279, + "grad_norm": 1.015388266586735, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 30279 + }, + { + "epoch": 0.3028, + "grad_norm": 1.0182864075842986, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30280 + }, + { + "epoch": 0.30281, + "grad_norm": 0.9425172693851305, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 30281 + }, + { + "epoch": 0.30282, + "grad_norm": 0.8810106789198481, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 30282 + }, + { + "epoch": 0.30283, + "grad_norm": 0.8125669325110937, + "learning_rate": 0.003, + "loss": 4.0763, + "step": 30283 + }, + { + "epoch": 0.30284, + "grad_norm": 0.7198831490635238, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30284 + }, + { + "epoch": 0.30285, + "grad_norm": 0.7296678374144023, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 30285 + }, + { + "epoch": 0.30286, + "grad_norm": 0.751971281174092, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 30286 + }, + { + "epoch": 0.30287, + "grad_norm": 0.7541534009811488, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 30287 + }, + { + "epoch": 0.30288, + "grad_norm": 0.7441026158887752, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 30288 + }, + { + "epoch": 0.30289, + "grad_norm": 0.7320371537104422, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 30289 + }, + { + "epoch": 0.3029, + "grad_norm": 0.8192699362569991, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 30290 + }, + { + "epoch": 0.30291, + "grad_norm": 0.7396393554226357, + "learning_rate": 0.003, + "loss": 4.05, + "step": 30291 + }, + { + "epoch": 0.30292, + "grad_norm": 0.7322871753341583, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 30292 + }, + { + "epoch": 0.30293, + "grad_norm": 0.7552806767362475, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 30293 + }, + { + "epoch": 0.30294, + "grad_norm": 0.8307313136559434, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 30294 + }, + { + "epoch": 0.30295, + "grad_norm": 0.8115439524067782, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30295 + }, + { + "epoch": 0.30296, + "grad_norm": 0.8244297448028151, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 30296 + }, + { + "epoch": 0.30297, + "grad_norm": 0.7328666570988653, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 30297 + }, + { + "epoch": 0.30298, + "grad_norm": 0.6955333035293306, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 30298 + }, + { + "epoch": 0.30299, + "grad_norm": 0.6730751202375586, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 30299 + }, + { + "epoch": 0.303, + "grad_norm": 0.7484661433762339, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 30300 + }, + { + "epoch": 0.30301, + "grad_norm": 0.8594429525780563, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 30301 + }, + { + "epoch": 0.30302, + "grad_norm": 0.9226836181777879, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 30302 + }, + { + "epoch": 0.30303, + "grad_norm": 1.0218726866250025, + "learning_rate": 0.003, + "loss": 4.096, + "step": 30303 + }, + { + "epoch": 0.30304, + "grad_norm": 1.0513759282274067, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30304 + }, + { + "epoch": 0.30305, + "grad_norm": 0.8383673264780788, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 30305 + }, + { + "epoch": 0.30306, + "grad_norm": 0.8479729082374823, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 30306 + }, + { + "epoch": 0.30307, + "grad_norm": 0.8705096203591386, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 30307 + }, + { + "epoch": 0.30308, + "grad_norm": 1.0320537142735033, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30308 + }, + { + "epoch": 0.30309, + "grad_norm": 1.1949871638826641, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 30309 + }, + { + "epoch": 0.3031, + "grad_norm": 0.9663187034316818, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 30310 + }, + { + "epoch": 0.30311, + "grad_norm": 0.9761822857109628, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 30311 + }, + { + "epoch": 0.30312, + "grad_norm": 0.9837425009489806, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 30312 + }, + { + "epoch": 0.30313, + "grad_norm": 0.9024120654314449, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 30313 + }, + { + "epoch": 0.30314, + "grad_norm": 0.8674094187069319, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30314 + }, + { + "epoch": 0.30315, + "grad_norm": 0.9041556031086786, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30315 + }, + { + "epoch": 0.30316, + "grad_norm": 0.9394311657478577, + "learning_rate": 0.003, + "loss": 4.039, + "step": 30316 + }, + { + "epoch": 0.30317, + "grad_norm": 0.9662805251903689, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 30317 + }, + { + "epoch": 0.30318, + "grad_norm": 1.0070809227896294, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 30318 + }, + { + "epoch": 0.30319, + "grad_norm": 0.9520665896256096, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 30319 + }, + { + "epoch": 0.3032, + "grad_norm": 0.8791330806702093, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 30320 + }, + { + "epoch": 0.30321, + "grad_norm": 0.8275199618899908, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 30321 + }, + { + "epoch": 0.30322, + "grad_norm": 0.9259135899100793, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 30322 + }, + { + "epoch": 0.30323, + "grad_norm": 0.8930378482409589, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30323 + }, + { + "epoch": 0.30324, + "grad_norm": 0.9640065182701385, + "learning_rate": 0.003, + "loss": 4.053, + "step": 30324 + }, + { + "epoch": 0.30325, + "grad_norm": 0.9355570709520026, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 30325 + }, + { + "epoch": 0.30326, + "grad_norm": 0.9451752486195746, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 30326 + }, + { + "epoch": 0.30327, + "grad_norm": 0.8444279845649119, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 30327 + }, + { + "epoch": 0.30328, + "grad_norm": 0.9638581131720523, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 30328 + }, + { + "epoch": 0.30329, + "grad_norm": 1.2164357065826361, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30329 + }, + { + "epoch": 0.3033, + "grad_norm": 0.8500487588012542, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30330 + }, + { + "epoch": 0.30331, + "grad_norm": 0.6809214158237236, + "learning_rate": 0.003, + "loss": 4.0901, + "step": 30331 + }, + { + "epoch": 0.30332, + "grad_norm": 0.6622067608266444, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 30332 + }, + { + "epoch": 0.30333, + "grad_norm": 0.5439625163689614, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 30333 + }, + { + "epoch": 0.30334, + "grad_norm": 0.5104067080174235, + "learning_rate": 0.003, + "loss": 4.042, + "step": 30334 + }, + { + "epoch": 0.30335, + "grad_norm": 0.536076715743425, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30335 + }, + { + "epoch": 0.30336, + "grad_norm": 0.5633306296787329, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30336 + }, + { + "epoch": 0.30337, + "grad_norm": 0.603095346957842, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 30337 + }, + { + "epoch": 0.30338, + "grad_norm": 0.6593516123939079, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 30338 + }, + { + "epoch": 0.30339, + "grad_norm": 0.6999218461770925, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 30339 + }, + { + "epoch": 0.3034, + "grad_norm": 0.7055835145734309, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 30340 + }, + { + "epoch": 0.30341, + "grad_norm": 0.7743380441808827, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 30341 + }, + { + "epoch": 0.30342, + "grad_norm": 0.8022722102790272, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 30342 + }, + { + "epoch": 0.30343, + "grad_norm": 0.8427322854885158, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 30343 + }, + { + "epoch": 0.30344, + "grad_norm": 1.017967577262336, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30344 + }, + { + "epoch": 0.30345, + "grad_norm": 1.2565658641051938, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 30345 + }, + { + "epoch": 0.30346, + "grad_norm": 0.8434448705219795, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 30346 + }, + { + "epoch": 0.30347, + "grad_norm": 0.7778339357991646, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 30347 + }, + { + "epoch": 0.30348, + "grad_norm": 0.7547255526072627, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 30348 + }, + { + "epoch": 0.30349, + "grad_norm": 0.6303580063583124, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 30349 + }, + { + "epoch": 0.3035, + "grad_norm": 0.6978509716572284, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 30350 + }, + { + "epoch": 0.30351, + "grad_norm": 0.7036367152057607, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 30351 + }, + { + "epoch": 0.30352, + "grad_norm": 0.8213754893843945, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 30352 + }, + { + "epoch": 0.30353, + "grad_norm": 0.8639831488103732, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 30353 + }, + { + "epoch": 0.30354, + "grad_norm": 0.9723515904276983, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 30354 + }, + { + "epoch": 0.30355, + "grad_norm": 1.1624190832534276, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 30355 + }, + { + "epoch": 0.30356, + "grad_norm": 0.9977567528210546, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 30356 + }, + { + "epoch": 0.30357, + "grad_norm": 0.9478570711543748, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 30357 + }, + { + "epoch": 0.30358, + "grad_norm": 0.9258046695519578, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30358 + }, + { + "epoch": 0.30359, + "grad_norm": 0.8180790303742542, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 30359 + }, + { + "epoch": 0.3036, + "grad_norm": 0.8786888998024635, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 30360 + }, + { + "epoch": 0.30361, + "grad_norm": 0.8983794293591455, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 30361 + }, + { + "epoch": 0.30362, + "grad_norm": 0.812332159574046, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 30362 + }, + { + "epoch": 0.30363, + "grad_norm": 0.6745126366424232, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 30363 + }, + { + "epoch": 0.30364, + "grad_norm": 0.6277426253225633, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30364 + }, + { + "epoch": 0.30365, + "grad_norm": 0.7692258225181239, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 30365 + }, + { + "epoch": 0.30366, + "grad_norm": 0.9150954907539325, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30366 + }, + { + "epoch": 0.30367, + "grad_norm": 1.0885310718077903, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 30367 + }, + { + "epoch": 0.30368, + "grad_norm": 1.154268162036633, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 30368 + }, + { + "epoch": 0.30369, + "grad_norm": 0.7038154395605817, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30369 + }, + { + "epoch": 0.3037, + "grad_norm": 0.6402108328496792, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 30370 + }, + { + "epoch": 0.30371, + "grad_norm": 0.852178766271217, + "learning_rate": 0.003, + "loss": 3.9927, + "step": 30371 + }, + { + "epoch": 0.30372, + "grad_norm": 1.0335590463761, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 30372 + }, + { + "epoch": 0.30373, + "grad_norm": 1.0331789257182817, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 30373 + }, + { + "epoch": 0.30374, + "grad_norm": 0.9043022035134948, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 30374 + }, + { + "epoch": 0.30375, + "grad_norm": 0.8487205741784615, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 30375 + }, + { + "epoch": 0.30376, + "grad_norm": 0.808127922579622, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 30376 + }, + { + "epoch": 0.30377, + "grad_norm": 0.8043081446236571, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 30377 + }, + { + "epoch": 0.30378, + "grad_norm": 0.7623884755492006, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 30378 + }, + { + "epoch": 0.30379, + "grad_norm": 0.7547818540906263, + "learning_rate": 0.003, + "loss": 4.041, + "step": 30379 + }, + { + "epoch": 0.3038, + "grad_norm": 0.7133047006229598, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 30380 + }, + { + "epoch": 0.30381, + "grad_norm": 0.7456587987280763, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 30381 + }, + { + "epoch": 0.30382, + "grad_norm": 0.8225692378688939, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 30382 + }, + { + "epoch": 0.30383, + "grad_norm": 0.9214230816784313, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 30383 + }, + { + "epoch": 0.30384, + "grad_norm": 1.2489936152749403, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 30384 + }, + { + "epoch": 0.30385, + "grad_norm": 1.0303308440839072, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 30385 + }, + { + "epoch": 0.30386, + "grad_norm": 0.8221298431272507, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30386 + }, + { + "epoch": 0.30387, + "grad_norm": 0.8047891127843105, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 30387 + }, + { + "epoch": 0.30388, + "grad_norm": 1.0532873735691581, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 30388 + }, + { + "epoch": 0.30389, + "grad_norm": 1.0579235718529443, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 30389 + }, + { + "epoch": 0.3039, + "grad_norm": 0.8540267265579224, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 30390 + }, + { + "epoch": 0.30391, + "grad_norm": 0.709782253769263, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30391 + }, + { + "epoch": 0.30392, + "grad_norm": 0.7060590662653896, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 30392 + }, + { + "epoch": 0.30393, + "grad_norm": 0.6470973568133837, + "learning_rate": 0.003, + "loss": 4.047, + "step": 30393 + }, + { + "epoch": 0.30394, + "grad_norm": 0.6997078132953025, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 30394 + }, + { + "epoch": 0.30395, + "grad_norm": 0.7534929531287325, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 30395 + }, + { + "epoch": 0.30396, + "grad_norm": 0.8247698008104903, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 30396 + }, + { + "epoch": 0.30397, + "grad_norm": 0.934412112346228, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 30397 + }, + { + "epoch": 0.30398, + "grad_norm": 0.9171367338808356, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 30398 + }, + { + "epoch": 0.30399, + "grad_norm": 0.9178614635710145, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 30399 + }, + { + "epoch": 0.304, + "grad_norm": 0.8361821417284744, + "learning_rate": 0.003, + "loss": 4.031, + "step": 30400 + }, + { + "epoch": 0.30401, + "grad_norm": 0.7622378866193045, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 30401 + }, + { + "epoch": 0.30402, + "grad_norm": 0.8011949664231661, + "learning_rate": 0.003, + "loss": 4.047, + "step": 30402 + }, + { + "epoch": 0.30403, + "grad_norm": 0.6668999293878359, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 30403 + }, + { + "epoch": 0.30404, + "grad_norm": 0.5943269098529899, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 30404 + }, + { + "epoch": 0.30405, + "grad_norm": 0.6021651981630596, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 30405 + }, + { + "epoch": 0.30406, + "grad_norm": 0.5703754902250703, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 30406 + }, + { + "epoch": 0.30407, + "grad_norm": 0.6541869662720878, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 30407 + }, + { + "epoch": 0.30408, + "grad_norm": 0.9105963388081413, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 30408 + }, + { + "epoch": 0.30409, + "grad_norm": 1.343175281846438, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 30409 + }, + { + "epoch": 0.3041, + "grad_norm": 0.827795638475697, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 30410 + }, + { + "epoch": 0.30411, + "grad_norm": 0.6956455471378123, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 30411 + }, + { + "epoch": 0.30412, + "grad_norm": 0.8203770484420269, + "learning_rate": 0.003, + "loss": 4.042, + "step": 30412 + }, + { + "epoch": 0.30413, + "grad_norm": 0.7906248366577798, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30413 + }, + { + "epoch": 0.30414, + "grad_norm": 0.8718052118562648, + "learning_rate": 0.003, + "loss": 4.043, + "step": 30414 + }, + { + "epoch": 0.30415, + "grad_norm": 0.8933365564966166, + "learning_rate": 0.003, + "loss": 4.045, + "step": 30415 + }, + { + "epoch": 0.30416, + "grad_norm": 0.8904494742186979, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 30416 + }, + { + "epoch": 0.30417, + "grad_norm": 1.06988936338908, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 30417 + }, + { + "epoch": 0.30418, + "grad_norm": 1.0233534495781111, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 30418 + }, + { + "epoch": 0.30419, + "grad_norm": 1.0859681486277621, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 30419 + }, + { + "epoch": 0.3042, + "grad_norm": 1.0611062573736434, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 30420 + }, + { + "epoch": 0.30421, + "grad_norm": 0.9130582343278664, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 30421 + }, + { + "epoch": 0.30422, + "grad_norm": 0.949190403199203, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 30422 + }, + { + "epoch": 0.30423, + "grad_norm": 1.0452269589112806, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 30423 + }, + { + "epoch": 0.30424, + "grad_norm": 1.1601117408552906, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 30424 + }, + { + "epoch": 0.30425, + "grad_norm": 1.2062509466885893, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 30425 + }, + { + "epoch": 0.30426, + "grad_norm": 0.7633774157556034, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 30426 + }, + { + "epoch": 0.30427, + "grad_norm": 0.6648448145524332, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30427 + }, + { + "epoch": 0.30428, + "grad_norm": 0.6529134540603916, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30428 + }, + { + "epoch": 0.30429, + "grad_norm": 0.7048279810122813, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30429 + }, + { + "epoch": 0.3043, + "grad_norm": 0.7955976938689573, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 30430 + }, + { + "epoch": 0.30431, + "grad_norm": 0.9720577874014509, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 30431 + }, + { + "epoch": 0.30432, + "grad_norm": 1.1268461390707243, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 30432 + }, + { + "epoch": 0.30433, + "grad_norm": 0.7636657829703977, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 30433 + }, + { + "epoch": 0.30434, + "grad_norm": 0.7030947605766785, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 30434 + }, + { + "epoch": 0.30435, + "grad_norm": 0.5670352688566129, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 30435 + }, + { + "epoch": 0.30436, + "grad_norm": 0.6837463365073297, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 30436 + }, + { + "epoch": 0.30437, + "grad_norm": 0.7911284538229383, + "learning_rate": 0.003, + "loss": 4.027, + "step": 30437 + }, + { + "epoch": 0.30438, + "grad_norm": 0.8024377689290391, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 30438 + }, + { + "epoch": 0.30439, + "grad_norm": 0.7336471641850336, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 30439 + }, + { + "epoch": 0.3044, + "grad_norm": 0.8031943414482025, + "learning_rate": 0.003, + "loss": 3.9853, + "step": 30440 + }, + { + "epoch": 0.30441, + "grad_norm": 0.9310863609266561, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 30441 + }, + { + "epoch": 0.30442, + "grad_norm": 1.084328173625542, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 30442 + }, + { + "epoch": 0.30443, + "grad_norm": 0.81091650004587, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 30443 + }, + { + "epoch": 0.30444, + "grad_norm": 0.7283605198772602, + "learning_rate": 0.003, + "loss": 4.049, + "step": 30444 + }, + { + "epoch": 0.30445, + "grad_norm": 0.726949573576708, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 30445 + }, + { + "epoch": 0.30446, + "grad_norm": 0.7050301561999607, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 30446 + }, + { + "epoch": 0.30447, + "grad_norm": 0.6356960710954501, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 30447 + }, + { + "epoch": 0.30448, + "grad_norm": 0.7117935539646044, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 30448 + }, + { + "epoch": 0.30449, + "grad_norm": 0.6701131598886387, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 30449 + }, + { + "epoch": 0.3045, + "grad_norm": 0.6896217900115332, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30450 + }, + { + "epoch": 0.30451, + "grad_norm": 0.851917201949667, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 30451 + }, + { + "epoch": 0.30452, + "grad_norm": 1.0226558531052654, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 30452 + }, + { + "epoch": 0.30453, + "grad_norm": 1.030817193177102, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 30453 + }, + { + "epoch": 0.30454, + "grad_norm": 1.0778862701257226, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 30454 + }, + { + "epoch": 0.30455, + "grad_norm": 0.9012286792449349, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 30455 + }, + { + "epoch": 0.30456, + "grad_norm": 0.8644815132532928, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 30456 + }, + { + "epoch": 0.30457, + "grad_norm": 0.8638120009318355, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 30457 + }, + { + "epoch": 0.30458, + "grad_norm": 1.02810107246511, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 30458 + }, + { + "epoch": 0.30459, + "grad_norm": 1.121234329621712, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30459 + }, + { + "epoch": 0.3046, + "grad_norm": 0.8549847885210355, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 30460 + }, + { + "epoch": 0.30461, + "grad_norm": 0.8971993868883004, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 30461 + }, + { + "epoch": 0.30462, + "grad_norm": 0.8394039121803021, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 30462 + }, + { + "epoch": 0.30463, + "grad_norm": 0.8460804544011828, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 30463 + }, + { + "epoch": 0.30464, + "grad_norm": 0.848653454075666, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 30464 + }, + { + "epoch": 0.30465, + "grad_norm": 0.9782271380574802, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 30465 + }, + { + "epoch": 0.30466, + "grad_norm": 1.0659785694179789, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 30466 + }, + { + "epoch": 0.30467, + "grad_norm": 1.0248519914732717, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 30467 + }, + { + "epoch": 0.30468, + "grad_norm": 1.1156964821586373, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 30468 + }, + { + "epoch": 0.30469, + "grad_norm": 0.8743157369442729, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 30469 + }, + { + "epoch": 0.3047, + "grad_norm": 0.7959386058210299, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 30470 + }, + { + "epoch": 0.30471, + "grad_norm": 0.862088085197215, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 30471 + }, + { + "epoch": 0.30472, + "grad_norm": 0.8846574972125563, + "learning_rate": 0.003, + "loss": 4.065, + "step": 30472 + }, + { + "epoch": 0.30473, + "grad_norm": 0.8483636677170218, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 30473 + }, + { + "epoch": 0.30474, + "grad_norm": 0.8817447383655752, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 30474 + }, + { + "epoch": 0.30475, + "grad_norm": 0.9966391894746914, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 30475 + }, + { + "epoch": 0.30476, + "grad_norm": 1.1962880796765865, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 30476 + }, + { + "epoch": 0.30477, + "grad_norm": 0.9625264371357545, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 30477 + }, + { + "epoch": 0.30478, + "grad_norm": 0.9750212509489287, + "learning_rate": 0.003, + "loss": 4.077, + "step": 30478 + }, + { + "epoch": 0.30479, + "grad_norm": 0.9481598119386552, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 30479 + }, + { + "epoch": 0.3048, + "grad_norm": 0.7988261716212065, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 30480 + }, + { + "epoch": 0.30481, + "grad_norm": 0.847550955821006, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30481 + }, + { + "epoch": 0.30482, + "grad_norm": 0.8917790063521829, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 30482 + }, + { + "epoch": 0.30483, + "grad_norm": 0.7224393477258957, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 30483 + }, + { + "epoch": 0.30484, + "grad_norm": 0.700643387335102, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 30484 + }, + { + "epoch": 0.30485, + "grad_norm": 0.7039354099915305, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30485 + }, + { + "epoch": 0.30486, + "grad_norm": 0.815230929702849, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30486 + }, + { + "epoch": 0.30487, + "grad_norm": 1.092441901058937, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 30487 + }, + { + "epoch": 0.30488, + "grad_norm": 1.0436284620564016, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 30488 + }, + { + "epoch": 0.30489, + "grad_norm": 1.0122071127388763, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 30489 + }, + { + "epoch": 0.3049, + "grad_norm": 1.071092342491752, + "learning_rate": 0.003, + "loss": 4.074, + "step": 30490 + }, + { + "epoch": 0.30491, + "grad_norm": 0.8436317067588053, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 30491 + }, + { + "epoch": 0.30492, + "grad_norm": 0.7614694409654934, + "learning_rate": 0.003, + "loss": 4.045, + "step": 30492 + }, + { + "epoch": 0.30493, + "grad_norm": 0.6153804046136563, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 30493 + }, + { + "epoch": 0.30494, + "grad_norm": 0.6106193435479481, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 30494 + }, + { + "epoch": 0.30495, + "grad_norm": 0.5330262898240072, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 30495 + }, + { + "epoch": 0.30496, + "grad_norm": 0.5738942532228786, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 30496 + }, + { + "epoch": 0.30497, + "grad_norm": 0.575719497939543, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 30497 + }, + { + "epoch": 0.30498, + "grad_norm": 0.6417501536600894, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 30498 + }, + { + "epoch": 0.30499, + "grad_norm": 0.65048652159219, + "learning_rate": 0.003, + "loss": 3.9942, + "step": 30499 + }, + { + "epoch": 0.305, + "grad_norm": 0.6868841833889263, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 30500 + }, + { + "epoch": 0.30501, + "grad_norm": 0.8744731226918196, + "learning_rate": 0.003, + "loss": 4.013, + "step": 30501 + }, + { + "epoch": 0.30502, + "grad_norm": 1.1334367837373611, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 30502 + }, + { + "epoch": 0.30503, + "grad_norm": 0.8577381633053462, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 30503 + }, + { + "epoch": 0.30504, + "grad_norm": 0.6758927448067685, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 30504 + }, + { + "epoch": 0.30505, + "grad_norm": 0.8543288397328942, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30505 + }, + { + "epoch": 0.30506, + "grad_norm": 0.9280161416946323, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 30506 + }, + { + "epoch": 0.30507, + "grad_norm": 0.8625977229270854, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 30507 + }, + { + "epoch": 0.30508, + "grad_norm": 0.7481427245766996, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 30508 + }, + { + "epoch": 0.30509, + "grad_norm": 0.7460777333936117, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 30509 + }, + { + "epoch": 0.3051, + "grad_norm": 0.8450310358772559, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 30510 + }, + { + "epoch": 0.30511, + "grad_norm": 0.8982895605793537, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 30511 + }, + { + "epoch": 0.30512, + "grad_norm": 1.0241248435221675, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 30512 + }, + { + "epoch": 0.30513, + "grad_norm": 0.9244849904760327, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30513 + }, + { + "epoch": 0.30514, + "grad_norm": 0.8647594021727987, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 30514 + }, + { + "epoch": 0.30515, + "grad_norm": 0.8321297948978755, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 30515 + }, + { + "epoch": 0.30516, + "grad_norm": 0.7647133873584363, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 30516 + }, + { + "epoch": 0.30517, + "grad_norm": 0.6969550977042609, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 30517 + }, + { + "epoch": 0.30518, + "grad_norm": 0.7121579713015437, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 30518 + }, + { + "epoch": 0.30519, + "grad_norm": 0.7030654037798773, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 30519 + }, + { + "epoch": 0.3052, + "grad_norm": 0.7175191410393085, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 30520 + }, + { + "epoch": 0.30521, + "grad_norm": 0.9671655814895952, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30521 + }, + { + "epoch": 0.30522, + "grad_norm": 1.094423253419572, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 30522 + }, + { + "epoch": 0.30523, + "grad_norm": 0.9172453916709014, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 30523 + }, + { + "epoch": 0.30524, + "grad_norm": 0.9449523071791175, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 30524 + }, + { + "epoch": 0.30525, + "grad_norm": 1.1251277533478277, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 30525 + }, + { + "epoch": 0.30526, + "grad_norm": 1.11090565433013, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 30526 + }, + { + "epoch": 0.30527, + "grad_norm": 1.2751994365432053, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 30527 + }, + { + "epoch": 0.30528, + "grad_norm": 1.0351062424991189, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 30528 + }, + { + "epoch": 0.30529, + "grad_norm": 0.9795659440229405, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 30529 + }, + { + "epoch": 0.3053, + "grad_norm": 1.0004309652448256, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 30530 + }, + { + "epoch": 0.30531, + "grad_norm": 0.9577289598588042, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 30531 + }, + { + "epoch": 0.30532, + "grad_norm": 1.0858908739094182, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 30532 + }, + { + "epoch": 0.30533, + "grad_norm": 1.0028607330201866, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 30533 + }, + { + "epoch": 0.30534, + "grad_norm": 1.010395079641831, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 30534 + }, + { + "epoch": 0.30535, + "grad_norm": 1.0323193791886558, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 30535 + }, + { + "epoch": 0.30536, + "grad_norm": 1.11833446147928, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 30536 + }, + { + "epoch": 0.30537, + "grad_norm": 1.0003198632891857, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 30537 + }, + { + "epoch": 0.30538, + "grad_norm": 1.0114548478464949, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 30538 + }, + { + "epoch": 0.30539, + "grad_norm": 0.935820896496027, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 30539 + }, + { + "epoch": 0.3054, + "grad_norm": 0.8795592934592581, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 30540 + }, + { + "epoch": 0.30541, + "grad_norm": 0.7636974905220982, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 30541 + }, + { + "epoch": 0.30542, + "grad_norm": 0.7754541921729162, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 30542 + }, + { + "epoch": 0.30543, + "grad_norm": 0.689826645480617, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 30543 + }, + { + "epoch": 0.30544, + "grad_norm": 0.6819288447751873, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 30544 + }, + { + "epoch": 0.30545, + "grad_norm": 0.7125149965574896, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 30545 + }, + { + "epoch": 0.30546, + "grad_norm": 0.9091752497513007, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30546 + }, + { + "epoch": 0.30547, + "grad_norm": 0.9367698536764194, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 30547 + }, + { + "epoch": 0.30548, + "grad_norm": 0.7897532811621403, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 30548 + }, + { + "epoch": 0.30549, + "grad_norm": 0.8530441336889497, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 30549 + }, + { + "epoch": 0.3055, + "grad_norm": 1.0069920211125059, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 30550 + }, + { + "epoch": 0.30551, + "grad_norm": 1.0644644544064013, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 30551 + }, + { + "epoch": 0.30552, + "grad_norm": 0.8860204378475169, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30552 + }, + { + "epoch": 0.30553, + "grad_norm": 0.8583705941414066, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 30553 + }, + { + "epoch": 0.30554, + "grad_norm": 0.7766281771727838, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 30554 + }, + { + "epoch": 0.30555, + "grad_norm": 0.7059658486029725, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 30555 + }, + { + "epoch": 0.30556, + "grad_norm": 0.8801782763900726, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 30556 + }, + { + "epoch": 0.30557, + "grad_norm": 1.170160288310564, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 30557 + }, + { + "epoch": 0.30558, + "grad_norm": 0.9929777763212249, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 30558 + }, + { + "epoch": 0.30559, + "grad_norm": 0.9514168585274976, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 30559 + }, + { + "epoch": 0.3056, + "grad_norm": 1.0573484859536102, + "learning_rate": 0.003, + "loss": 4.1069, + "step": 30560 + }, + { + "epoch": 0.30561, + "grad_norm": 1.1638415987963802, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 30561 + }, + { + "epoch": 0.30562, + "grad_norm": 0.7856256036015478, + "learning_rate": 0.003, + "loss": 4.077, + "step": 30562 + }, + { + "epoch": 0.30563, + "grad_norm": 0.7992188013287786, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 30563 + }, + { + "epoch": 0.30564, + "grad_norm": 0.9011935903417795, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 30564 + }, + { + "epoch": 0.30565, + "grad_norm": 0.9613462140651601, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30565 + }, + { + "epoch": 0.30566, + "grad_norm": 1.1770448830485234, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30566 + }, + { + "epoch": 0.30567, + "grad_norm": 0.9866294548294452, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 30567 + }, + { + "epoch": 0.30568, + "grad_norm": 0.8587004059062714, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 30568 + }, + { + "epoch": 0.30569, + "grad_norm": 0.7460376820100256, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 30569 + }, + { + "epoch": 0.3057, + "grad_norm": 0.8152461645244726, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 30570 + }, + { + "epoch": 0.30571, + "grad_norm": 0.6711375809235586, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 30571 + }, + { + "epoch": 0.30572, + "grad_norm": 0.5792487971960065, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 30572 + }, + { + "epoch": 0.30573, + "grad_norm": 0.567052993699175, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 30573 + }, + { + "epoch": 0.30574, + "grad_norm": 0.527951144700206, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 30574 + }, + { + "epoch": 0.30575, + "grad_norm": 0.5718809410268307, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 30575 + }, + { + "epoch": 0.30576, + "grad_norm": 0.7378430186273189, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 30576 + }, + { + "epoch": 0.30577, + "grad_norm": 0.912170837473285, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 30577 + }, + { + "epoch": 0.30578, + "grad_norm": 1.053663314316171, + "learning_rate": 0.003, + "loss": 4.041, + "step": 30578 + }, + { + "epoch": 0.30579, + "grad_norm": 1.0188598772298014, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 30579 + }, + { + "epoch": 0.3058, + "grad_norm": 0.7933571510957059, + "learning_rate": 0.003, + "loss": 4.045, + "step": 30580 + }, + { + "epoch": 0.30581, + "grad_norm": 0.7423629803025442, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 30581 + }, + { + "epoch": 0.30582, + "grad_norm": 0.9288754452642052, + "learning_rate": 0.003, + "loss": 4.048, + "step": 30582 + }, + { + "epoch": 0.30583, + "grad_norm": 0.9870822357771121, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30583 + }, + { + "epoch": 0.30584, + "grad_norm": 0.9460554914760135, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 30584 + }, + { + "epoch": 0.30585, + "grad_norm": 0.8507144445210616, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 30585 + }, + { + "epoch": 0.30586, + "grad_norm": 0.808819355522907, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 30586 + }, + { + "epoch": 0.30587, + "grad_norm": 0.7880603756458274, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 30587 + }, + { + "epoch": 0.30588, + "grad_norm": 0.7425789147273255, + "learning_rate": 0.003, + "loss": 4.05, + "step": 30588 + }, + { + "epoch": 0.30589, + "grad_norm": 0.7452021928875582, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 30589 + }, + { + "epoch": 0.3059, + "grad_norm": 0.736722001570775, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30590 + }, + { + "epoch": 0.30591, + "grad_norm": 0.6768935947292244, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 30591 + }, + { + "epoch": 0.30592, + "grad_norm": 0.7349154221715212, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 30592 + }, + { + "epoch": 0.30593, + "grad_norm": 0.812926759726785, + "learning_rate": 0.003, + "loss": 4.027, + "step": 30593 + }, + { + "epoch": 0.30594, + "grad_norm": 0.9142196346174669, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 30594 + }, + { + "epoch": 0.30595, + "grad_norm": 0.9459242455123975, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 30595 + }, + { + "epoch": 0.30596, + "grad_norm": 0.9603696829288325, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 30596 + }, + { + "epoch": 0.30597, + "grad_norm": 0.9464740078390893, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30597 + }, + { + "epoch": 0.30598, + "grad_norm": 0.7345739115336234, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 30598 + }, + { + "epoch": 0.30599, + "grad_norm": 0.6468237801965143, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 30599 + }, + { + "epoch": 0.306, + "grad_norm": 0.5937269127504389, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 30600 + }, + { + "epoch": 0.30601, + "grad_norm": 0.5489578885187612, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 30601 + }, + { + "epoch": 0.30602, + "grad_norm": 0.5807362357943842, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 30602 + }, + { + "epoch": 0.30603, + "grad_norm": 0.682308873246767, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 30603 + }, + { + "epoch": 0.30604, + "grad_norm": 0.8225353030731932, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 30604 + }, + { + "epoch": 0.30605, + "grad_norm": 0.9959542909348267, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 30605 + }, + { + "epoch": 0.30606, + "grad_norm": 1.0820119134820312, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 30606 + }, + { + "epoch": 0.30607, + "grad_norm": 0.7269649660820245, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 30607 + }, + { + "epoch": 0.30608, + "grad_norm": 0.5665326210790931, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 30608 + }, + { + "epoch": 0.30609, + "grad_norm": 0.6946651238918355, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 30609 + }, + { + "epoch": 0.3061, + "grad_norm": 0.7798391941355483, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 30610 + }, + { + "epoch": 0.30611, + "grad_norm": 0.7308736040999012, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 30611 + }, + { + "epoch": 0.30612, + "grad_norm": 0.7139268096499337, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 30612 + }, + { + "epoch": 0.30613, + "grad_norm": 0.9613536881897801, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 30613 + }, + { + "epoch": 0.30614, + "grad_norm": 1.2135614617239538, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 30614 + }, + { + "epoch": 0.30615, + "grad_norm": 0.8008531705751949, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 30615 + }, + { + "epoch": 0.30616, + "grad_norm": 0.6638249458354882, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30616 + }, + { + "epoch": 0.30617, + "grad_norm": 0.6825650409506161, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 30617 + }, + { + "epoch": 0.30618, + "grad_norm": 0.6608174467119787, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 30618 + }, + { + "epoch": 0.30619, + "grad_norm": 0.7136153781779269, + "learning_rate": 0.003, + "loss": 4.047, + "step": 30619 + }, + { + "epoch": 0.3062, + "grad_norm": 0.7558530286089896, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 30620 + }, + { + "epoch": 0.30621, + "grad_norm": 1.0218703744776076, + "learning_rate": 0.003, + "loss": 4.037, + "step": 30621 + }, + { + "epoch": 0.30622, + "grad_norm": 1.282246599701646, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 30622 + }, + { + "epoch": 0.30623, + "grad_norm": 0.8571704376223703, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 30623 + }, + { + "epoch": 0.30624, + "grad_norm": 0.7082722612587846, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 30624 + }, + { + "epoch": 0.30625, + "grad_norm": 0.56463135643707, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 30625 + }, + { + "epoch": 0.30626, + "grad_norm": 0.58473763401941, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 30626 + }, + { + "epoch": 0.30627, + "grad_norm": 0.6657359129845389, + "learning_rate": 0.003, + "loss": 4.037, + "step": 30627 + }, + { + "epoch": 0.30628, + "grad_norm": 0.7879959047768911, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 30628 + }, + { + "epoch": 0.30629, + "grad_norm": 0.752512042438393, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 30629 + }, + { + "epoch": 0.3063, + "grad_norm": 0.6755480612240439, + "learning_rate": 0.003, + "loss": 4.053, + "step": 30630 + }, + { + "epoch": 0.30631, + "grad_norm": 0.7047429811229652, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 30631 + }, + { + "epoch": 0.30632, + "grad_norm": 0.6944437339468187, + "learning_rate": 0.003, + "loss": 4.048, + "step": 30632 + }, + { + "epoch": 0.30633, + "grad_norm": 0.7891599804467359, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 30633 + }, + { + "epoch": 0.30634, + "grad_norm": 0.9680067253513436, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 30634 + }, + { + "epoch": 0.30635, + "grad_norm": 1.2627476927545154, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 30635 + }, + { + "epoch": 0.30636, + "grad_norm": 0.8720670851829797, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 30636 + }, + { + "epoch": 0.30637, + "grad_norm": 0.750197936028176, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30637 + }, + { + "epoch": 0.30638, + "grad_norm": 0.7150018146792149, + "learning_rate": 0.003, + "loss": 3.9884, + "step": 30638 + }, + { + "epoch": 0.30639, + "grad_norm": 0.6863645011478116, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 30639 + }, + { + "epoch": 0.3064, + "grad_norm": 0.8151166348577135, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 30640 + }, + { + "epoch": 0.30641, + "grad_norm": 0.8980925804134118, + "learning_rate": 0.003, + "loss": 4.046, + "step": 30641 + }, + { + "epoch": 0.30642, + "grad_norm": 1.09202452975863, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 30642 + }, + { + "epoch": 0.30643, + "grad_norm": 1.1743605930409449, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 30643 + }, + { + "epoch": 0.30644, + "grad_norm": 0.8558510527686386, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 30644 + }, + { + "epoch": 0.30645, + "grad_norm": 0.7707390344297989, + "learning_rate": 0.003, + "loss": 4.027, + "step": 30645 + }, + { + "epoch": 0.30646, + "grad_norm": 0.7947706431714266, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 30646 + }, + { + "epoch": 0.30647, + "grad_norm": 0.8104892367891933, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 30647 + }, + { + "epoch": 0.30648, + "grad_norm": 0.9092938592949533, + "learning_rate": 0.003, + "loss": 4.016, + "step": 30648 + }, + { + "epoch": 0.30649, + "grad_norm": 0.9414620171263496, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 30649 + }, + { + "epoch": 0.3065, + "grad_norm": 0.8766207163948251, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 30650 + }, + { + "epoch": 0.30651, + "grad_norm": 0.6792322753026214, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 30651 + }, + { + "epoch": 0.30652, + "grad_norm": 0.6911356129148062, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30652 + }, + { + "epoch": 0.30653, + "grad_norm": 0.7638155195380214, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 30653 + }, + { + "epoch": 0.30654, + "grad_norm": 0.8542605001923141, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 30654 + }, + { + "epoch": 0.30655, + "grad_norm": 0.9602365126244305, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 30655 + }, + { + "epoch": 0.30656, + "grad_norm": 1.2354362880311769, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 30656 + }, + { + "epoch": 0.30657, + "grad_norm": 0.7183424225104932, + "learning_rate": 0.003, + "loss": 4.012, + "step": 30657 + }, + { + "epoch": 0.30658, + "grad_norm": 0.6680684768754789, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 30658 + }, + { + "epoch": 0.30659, + "grad_norm": 0.8483015811732115, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 30659 + }, + { + "epoch": 0.3066, + "grad_norm": 0.8684039498961769, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 30660 + }, + { + "epoch": 0.30661, + "grad_norm": 0.923783096800622, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 30661 + }, + { + "epoch": 0.30662, + "grad_norm": 1.0160246191968734, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 30662 + }, + { + "epoch": 0.30663, + "grad_norm": 1.109590049741251, + "learning_rate": 0.003, + "loss": 4.025, + "step": 30663 + }, + { + "epoch": 0.30664, + "grad_norm": 0.8715465770190681, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 30664 + }, + { + "epoch": 0.30665, + "grad_norm": 0.7476768010657511, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 30665 + }, + { + "epoch": 0.30666, + "grad_norm": 0.7515838215983088, + "learning_rate": 0.003, + "loss": 4.032, + "step": 30666 + }, + { + "epoch": 0.30667, + "grad_norm": 0.7117954565992811, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 30667 + }, + { + "epoch": 0.30668, + "grad_norm": 0.6725148966287215, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 30668 + }, + { + "epoch": 0.30669, + "grad_norm": 0.6428016803998156, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 30669 + }, + { + "epoch": 0.3067, + "grad_norm": 0.6881369132104858, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 30670 + }, + { + "epoch": 0.30671, + "grad_norm": 0.8124269866500381, + "learning_rate": 0.003, + "loss": 4.028, + "step": 30671 + }, + { + "epoch": 0.30672, + "grad_norm": 0.8635767436865468, + "learning_rate": 0.003, + "loss": 4.002, + "step": 30672 + }, + { + "epoch": 0.30673, + "grad_norm": 1.0622452012900765, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 30673 + }, + { + "epoch": 0.30674, + "grad_norm": 1.223491916872636, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 30674 + }, + { + "epoch": 0.30675, + "grad_norm": 0.9484656323084134, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 30675 + }, + { + "epoch": 0.30676, + "grad_norm": 0.9658077797568466, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30676 + }, + { + "epoch": 0.30677, + "grad_norm": 1.0729349158756862, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 30677 + }, + { + "epoch": 0.30678, + "grad_norm": 0.9254564039432714, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 30678 + }, + { + "epoch": 0.30679, + "grad_norm": 0.8677982876488929, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 30679 + }, + { + "epoch": 0.3068, + "grad_norm": 0.872402191526423, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 30680 + }, + { + "epoch": 0.30681, + "grad_norm": 0.9351671989504667, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 30681 + }, + { + "epoch": 0.30682, + "grad_norm": 0.8673203961207866, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 30682 + }, + { + "epoch": 0.30683, + "grad_norm": 0.7703040328833971, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 30683 + }, + { + "epoch": 0.30684, + "grad_norm": 0.8357480239427207, + "learning_rate": 0.003, + "loss": 4.054, + "step": 30684 + }, + { + "epoch": 0.30685, + "grad_norm": 0.8884315351312037, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 30685 + }, + { + "epoch": 0.30686, + "grad_norm": 0.924637440083138, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 30686 + }, + { + "epoch": 0.30687, + "grad_norm": 1.019542626364121, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 30687 + }, + { + "epoch": 0.30688, + "grad_norm": 0.9272995870744947, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 30688 + }, + { + "epoch": 0.30689, + "grad_norm": 0.797097838928584, + "learning_rate": 0.003, + "loss": 4.028, + "step": 30689 + }, + { + "epoch": 0.3069, + "grad_norm": 0.6586977756784121, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 30690 + }, + { + "epoch": 0.30691, + "grad_norm": 0.6648619160547126, + "learning_rate": 0.003, + "loss": 4.007, + "step": 30691 + }, + { + "epoch": 0.30692, + "grad_norm": 0.6907664307163258, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 30692 + }, + { + "epoch": 0.30693, + "grad_norm": 0.7235317717380572, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 30693 + }, + { + "epoch": 0.30694, + "grad_norm": 0.7525606313776578, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 30694 + }, + { + "epoch": 0.30695, + "grad_norm": 0.9072978984072535, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 30695 + }, + { + "epoch": 0.30696, + "grad_norm": 1.0555384750352588, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 30696 + }, + { + "epoch": 0.30697, + "grad_norm": 1.0615189935440474, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 30697 + }, + { + "epoch": 0.30698, + "grad_norm": 0.9906547515860503, + "learning_rate": 0.003, + "loss": 4.042, + "step": 30698 + }, + { + "epoch": 0.30699, + "grad_norm": 1.0762846268094246, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 30699 + }, + { + "epoch": 0.307, + "grad_norm": 0.926707911512607, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 30700 + }, + { + "epoch": 0.30701, + "grad_norm": 0.9633372217034292, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 30701 + }, + { + "epoch": 0.30702, + "grad_norm": 0.8086401603134522, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 30702 + }, + { + "epoch": 0.30703, + "grad_norm": 0.7232553759407089, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 30703 + }, + { + "epoch": 0.30704, + "grad_norm": 0.7200771870020278, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 30704 + }, + { + "epoch": 0.30705, + "grad_norm": 0.7334945392059585, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 30705 + }, + { + "epoch": 0.30706, + "grad_norm": 0.8101202961658248, + "learning_rate": 0.003, + "loss": 4.046, + "step": 30706 + }, + { + "epoch": 0.30707, + "grad_norm": 0.68569279375923, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 30707 + }, + { + "epoch": 0.30708, + "grad_norm": 0.6723718648987741, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 30708 + }, + { + "epoch": 0.30709, + "grad_norm": 0.7331277554464692, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 30709 + }, + { + "epoch": 0.3071, + "grad_norm": 0.9774950553158962, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30710 + }, + { + "epoch": 0.30711, + "grad_norm": 1.1288102519230838, + "learning_rate": 0.003, + "loss": 4.037, + "step": 30711 + }, + { + "epoch": 0.30712, + "grad_norm": 0.9404214799378747, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 30712 + }, + { + "epoch": 0.30713, + "grad_norm": 0.7677574751436614, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 30713 + }, + { + "epoch": 0.30714, + "grad_norm": 0.6811551460108075, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 30714 + }, + { + "epoch": 0.30715, + "grad_norm": 0.7481709530439791, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 30715 + }, + { + "epoch": 0.30716, + "grad_norm": 0.7657774624992527, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 30716 + }, + { + "epoch": 0.30717, + "grad_norm": 0.79596476756303, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 30717 + }, + { + "epoch": 0.30718, + "grad_norm": 0.8558118615860202, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 30718 + }, + { + "epoch": 0.30719, + "grad_norm": 0.9065773726730117, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 30719 + }, + { + "epoch": 0.3072, + "grad_norm": 0.9957608095537034, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 30720 + }, + { + "epoch": 0.30721, + "grad_norm": 1.0625921967061749, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 30721 + }, + { + "epoch": 0.30722, + "grad_norm": 0.9823392548062775, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30722 + }, + { + "epoch": 0.30723, + "grad_norm": 1.0091725179097797, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 30723 + }, + { + "epoch": 0.30724, + "grad_norm": 0.9543424714192384, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 30724 + }, + { + "epoch": 0.30725, + "grad_norm": 0.8544466302349751, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 30725 + }, + { + "epoch": 0.30726, + "grad_norm": 0.722953222684606, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 30726 + }, + { + "epoch": 0.30727, + "grad_norm": 0.6733410345085835, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 30727 + }, + { + "epoch": 0.30728, + "grad_norm": 0.6932912273937791, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 30728 + }, + { + "epoch": 0.30729, + "grad_norm": 0.6469664067554134, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 30729 + }, + { + "epoch": 0.3073, + "grad_norm": 0.6276250865015277, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 30730 + }, + { + "epoch": 0.30731, + "grad_norm": 0.7085524018596607, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 30731 + }, + { + "epoch": 0.30732, + "grad_norm": 0.8788837999463663, + "learning_rate": 0.003, + "loss": 3.986, + "step": 30732 + }, + { + "epoch": 0.30733, + "grad_norm": 1.1045865189338693, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 30733 + }, + { + "epoch": 0.30734, + "grad_norm": 1.2239367269734398, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 30734 + }, + { + "epoch": 0.30735, + "grad_norm": 0.9000751469387722, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 30735 + }, + { + "epoch": 0.30736, + "grad_norm": 0.7531563503305398, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 30736 + }, + { + "epoch": 0.30737, + "grad_norm": 0.7578205678049688, + "learning_rate": 0.003, + "loss": 4.036, + "step": 30737 + }, + { + "epoch": 0.30738, + "grad_norm": 0.750360089831253, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 30738 + }, + { + "epoch": 0.30739, + "grad_norm": 0.7990357997325144, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 30739 + }, + { + "epoch": 0.3074, + "grad_norm": 0.6683234160878258, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 30740 + }, + { + "epoch": 0.30741, + "grad_norm": 0.6196475046195603, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 30741 + }, + { + "epoch": 0.30742, + "grad_norm": 0.6338111350587821, + "learning_rate": 0.003, + "loss": 4.039, + "step": 30742 + }, + { + "epoch": 0.30743, + "grad_norm": 0.5894121607969608, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 30743 + }, + { + "epoch": 0.30744, + "grad_norm": 0.5696970641148376, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 30744 + }, + { + "epoch": 0.30745, + "grad_norm": 0.6689444183158609, + "learning_rate": 0.003, + "loss": 4.033, + "step": 30745 + }, + { + "epoch": 0.30746, + "grad_norm": 0.889369189773622, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 30746 + }, + { + "epoch": 0.30747, + "grad_norm": 1.2530287115991772, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 30747 + }, + { + "epoch": 0.30748, + "grad_norm": 0.9262956253487464, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 30748 + }, + { + "epoch": 0.30749, + "grad_norm": 0.9417936702951035, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 30749 + }, + { + "epoch": 0.3075, + "grad_norm": 0.9895502121683721, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 30750 + }, + { + "epoch": 0.30751, + "grad_norm": 1.1743798058987138, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 30751 + }, + { + "epoch": 0.30752, + "grad_norm": 0.8308033324837698, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 30752 + }, + { + "epoch": 0.30753, + "grad_norm": 0.736248693918308, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 30753 + }, + { + "epoch": 0.30754, + "grad_norm": 0.7478044554207499, + "learning_rate": 0.003, + "loss": 4.017, + "step": 30754 + }, + { + "epoch": 0.30755, + "grad_norm": 0.8205637750415998, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 30755 + }, + { + "epoch": 0.30756, + "grad_norm": 0.8808907269838356, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30756 + }, + { + "epoch": 0.30757, + "grad_norm": 0.9630424453638969, + "learning_rate": 0.003, + "loss": 4.043, + "step": 30757 + }, + { + "epoch": 0.30758, + "grad_norm": 1.0198432688178847, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 30758 + }, + { + "epoch": 0.30759, + "grad_norm": 1.0615540073933272, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 30759 + }, + { + "epoch": 0.3076, + "grad_norm": 0.8878911904851673, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 30760 + }, + { + "epoch": 0.30761, + "grad_norm": 0.8398407204491872, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 30761 + }, + { + "epoch": 0.30762, + "grad_norm": 0.8354126947854734, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30762 + }, + { + "epoch": 0.30763, + "grad_norm": 0.7706710221221755, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 30763 + }, + { + "epoch": 0.30764, + "grad_norm": 0.7139683242905777, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 30764 + }, + { + "epoch": 0.30765, + "grad_norm": 0.6266968520992895, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 30765 + }, + { + "epoch": 0.30766, + "grad_norm": 0.6850966865394219, + "learning_rate": 0.003, + "loss": 4.03, + "step": 30766 + }, + { + "epoch": 0.30767, + "grad_norm": 0.7739496069807652, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 30767 + }, + { + "epoch": 0.30768, + "grad_norm": 0.9300343281916225, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 30768 + }, + { + "epoch": 0.30769, + "grad_norm": 1.0123155637226164, + "learning_rate": 0.003, + "loss": 4.056, + "step": 30769 + }, + { + "epoch": 0.3077, + "grad_norm": 0.9438090466462085, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 30770 + }, + { + "epoch": 0.30771, + "grad_norm": 0.8929899468730477, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 30771 + }, + { + "epoch": 0.30772, + "grad_norm": 0.9930606707602103, + "learning_rate": 0.003, + "loss": 4.0957, + "step": 30772 + }, + { + "epoch": 0.30773, + "grad_norm": 1.078348769144722, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 30773 + }, + { + "epoch": 0.30774, + "grad_norm": 0.9683700832091975, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 30774 + }, + { + "epoch": 0.30775, + "grad_norm": 0.9851883506879314, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 30775 + }, + { + "epoch": 0.30776, + "grad_norm": 0.9117119240334299, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 30776 + }, + { + "epoch": 0.30777, + "grad_norm": 0.8501132354062436, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 30777 + }, + { + "epoch": 0.30778, + "grad_norm": 0.8039859642464788, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 30778 + }, + { + "epoch": 0.30779, + "grad_norm": 0.7974274445564175, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 30779 + }, + { + "epoch": 0.3078, + "grad_norm": 0.7328806124359419, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 30780 + }, + { + "epoch": 0.30781, + "grad_norm": 0.6843948783087196, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 30781 + }, + { + "epoch": 0.30782, + "grad_norm": 0.7941531535343496, + "learning_rate": 0.003, + "loss": 4.073, + "step": 30782 + }, + { + "epoch": 0.30783, + "grad_norm": 0.821662142286765, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 30783 + }, + { + "epoch": 0.30784, + "grad_norm": 0.8409291472726417, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 30784 + }, + { + "epoch": 0.30785, + "grad_norm": 0.8588215281473421, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 30785 + }, + { + "epoch": 0.30786, + "grad_norm": 0.9470323416410826, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 30786 + }, + { + "epoch": 0.30787, + "grad_norm": 0.9683240303288001, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 30787 + }, + { + "epoch": 0.30788, + "grad_norm": 0.8352878720201966, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 30788 + }, + { + "epoch": 0.30789, + "grad_norm": 0.8143190605352566, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 30789 + }, + { + "epoch": 0.3079, + "grad_norm": 0.8790867337610303, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 30790 + }, + { + "epoch": 0.30791, + "grad_norm": 0.9975166326202086, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 30791 + }, + { + "epoch": 0.30792, + "grad_norm": 1.2475447282973846, + "learning_rate": 0.003, + "loss": 4.0915, + "step": 30792 + }, + { + "epoch": 0.30793, + "grad_norm": 0.7264304267212495, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 30793 + }, + { + "epoch": 0.30794, + "grad_norm": 0.7947401426509082, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 30794 + }, + { + "epoch": 0.30795, + "grad_norm": 0.8208642665880733, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 30795 + }, + { + "epoch": 0.30796, + "grad_norm": 0.7854857138475605, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 30796 + }, + { + "epoch": 0.30797, + "grad_norm": 0.7976923160247547, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 30797 + }, + { + "epoch": 0.30798, + "grad_norm": 0.9465109887178134, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30798 + }, + { + "epoch": 0.30799, + "grad_norm": 1.0277431708472036, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 30799 + }, + { + "epoch": 0.308, + "grad_norm": 1.109523459814763, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 30800 + }, + { + "epoch": 0.30801, + "grad_norm": 0.9430055645442009, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 30801 + }, + { + "epoch": 0.30802, + "grad_norm": 0.9904230065172027, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 30802 + }, + { + "epoch": 0.30803, + "grad_norm": 0.9908632799235534, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 30803 + }, + { + "epoch": 0.30804, + "grad_norm": 0.9580612853285466, + "learning_rate": 0.003, + "loss": 4.032, + "step": 30804 + }, + { + "epoch": 0.30805, + "grad_norm": 0.9466218623375239, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 30805 + }, + { + "epoch": 0.30806, + "grad_norm": 0.9044946005622532, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 30806 + }, + { + "epoch": 0.30807, + "grad_norm": 0.759377359296469, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 30807 + }, + { + "epoch": 0.30808, + "grad_norm": 0.7360337601373378, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 30808 + }, + { + "epoch": 0.30809, + "grad_norm": 0.6797214623427076, + "learning_rate": 0.003, + "loss": 4.06, + "step": 30809 + }, + { + "epoch": 0.3081, + "grad_norm": 0.6745378555274573, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 30810 + }, + { + "epoch": 0.30811, + "grad_norm": 0.6566073207211455, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 30811 + }, + { + "epoch": 0.30812, + "grad_norm": 0.7250193120881936, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 30812 + }, + { + "epoch": 0.30813, + "grad_norm": 0.7072851276136148, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 30813 + }, + { + "epoch": 0.30814, + "grad_norm": 0.705444759086317, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 30814 + }, + { + "epoch": 0.30815, + "grad_norm": 0.785556256205228, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 30815 + }, + { + "epoch": 0.30816, + "grad_norm": 0.7948409605113511, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30816 + }, + { + "epoch": 0.30817, + "grad_norm": 0.8212499143360835, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 30817 + }, + { + "epoch": 0.30818, + "grad_norm": 0.8466019317405559, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 30818 + }, + { + "epoch": 0.30819, + "grad_norm": 0.7824228247767014, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 30819 + }, + { + "epoch": 0.3082, + "grad_norm": 0.8231279167401168, + "learning_rate": 0.003, + "loss": 4.044, + "step": 30820 + }, + { + "epoch": 0.30821, + "grad_norm": 0.932067987771609, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 30821 + }, + { + "epoch": 0.30822, + "grad_norm": 0.9272212911225559, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 30822 + }, + { + "epoch": 0.30823, + "grad_norm": 0.7866391777583547, + "learning_rate": 0.003, + "loss": 4.032, + "step": 30823 + }, + { + "epoch": 0.30824, + "grad_norm": 0.7411285656144493, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 30824 + }, + { + "epoch": 0.30825, + "grad_norm": 0.7174124341762038, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 30825 + }, + { + "epoch": 0.30826, + "grad_norm": 0.7195212426240878, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 30826 + }, + { + "epoch": 0.30827, + "grad_norm": 0.6710068338457987, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 30827 + }, + { + "epoch": 0.30828, + "grad_norm": 0.6049136288262137, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 30828 + }, + { + "epoch": 0.30829, + "grad_norm": 0.5915762371102405, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 30829 + }, + { + "epoch": 0.3083, + "grad_norm": 0.6229245979850135, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 30830 + }, + { + "epoch": 0.30831, + "grad_norm": 0.6048280048742432, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 30831 + }, + { + "epoch": 0.30832, + "grad_norm": 0.6311498907802161, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 30832 + }, + { + "epoch": 0.30833, + "grad_norm": 0.6379437212409611, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 30833 + }, + { + "epoch": 0.30834, + "grad_norm": 0.6894403038237408, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 30834 + }, + { + "epoch": 0.30835, + "grad_norm": 0.7762573748993623, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 30835 + }, + { + "epoch": 0.30836, + "grad_norm": 0.9956761437422733, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 30836 + }, + { + "epoch": 0.30837, + "grad_norm": 1.3858113047805518, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 30837 + }, + { + "epoch": 0.30838, + "grad_norm": 0.8368330440204933, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 30838 + }, + { + "epoch": 0.30839, + "grad_norm": 0.703661452550211, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 30839 + }, + { + "epoch": 0.3084, + "grad_norm": 0.7062758242542257, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 30840 + }, + { + "epoch": 0.30841, + "grad_norm": 0.6470620033172044, + "learning_rate": 0.003, + "loss": 4.022, + "step": 30841 + }, + { + "epoch": 0.30842, + "grad_norm": 0.7397643359200448, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 30842 + }, + { + "epoch": 0.30843, + "grad_norm": 0.8105070688175162, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 30843 + }, + { + "epoch": 0.30844, + "grad_norm": 1.0636992493769797, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 30844 + }, + { + "epoch": 0.30845, + "grad_norm": 1.4280990489498842, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 30845 + }, + { + "epoch": 0.30846, + "grad_norm": 0.6928591669261425, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 30846 + }, + { + "epoch": 0.30847, + "grad_norm": 0.7076770874058445, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 30847 + }, + { + "epoch": 0.30848, + "grad_norm": 0.7743231000964611, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 30848 + }, + { + "epoch": 0.30849, + "grad_norm": 0.8675294369801922, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30849 + }, + { + "epoch": 0.3085, + "grad_norm": 0.7675108237300807, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 30850 + }, + { + "epoch": 0.30851, + "grad_norm": 0.8459778347215771, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 30851 + }, + { + "epoch": 0.30852, + "grad_norm": 0.9763670540508455, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 30852 + }, + { + "epoch": 0.30853, + "grad_norm": 1.144642861624166, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 30853 + }, + { + "epoch": 0.30854, + "grad_norm": 0.9054883532072058, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 30854 + }, + { + "epoch": 0.30855, + "grad_norm": 0.8311714558514375, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 30855 + }, + { + "epoch": 0.30856, + "grad_norm": 0.8384225521014615, + "learning_rate": 0.003, + "loss": 4.021, + "step": 30856 + }, + { + "epoch": 0.30857, + "grad_norm": 0.8999811598207876, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 30857 + }, + { + "epoch": 0.30858, + "grad_norm": 1.0353681727919666, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 30858 + }, + { + "epoch": 0.30859, + "grad_norm": 0.9084499237406256, + "learning_rate": 0.003, + "loss": 4.043, + "step": 30859 + }, + { + "epoch": 0.3086, + "grad_norm": 0.7600951994651148, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 30860 + }, + { + "epoch": 0.30861, + "grad_norm": 0.6602037715887461, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 30861 + }, + { + "epoch": 0.30862, + "grad_norm": 0.7078148812033994, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 30862 + }, + { + "epoch": 0.30863, + "grad_norm": 0.914458865067099, + "learning_rate": 0.003, + "loss": 4.023, + "step": 30863 + }, + { + "epoch": 0.30864, + "grad_norm": 0.93483134601536, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 30864 + }, + { + "epoch": 0.30865, + "grad_norm": 0.7822020584614777, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 30865 + }, + { + "epoch": 0.30866, + "grad_norm": 0.7971823943460915, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 30866 + }, + { + "epoch": 0.30867, + "grad_norm": 0.9202323508184236, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 30867 + }, + { + "epoch": 0.30868, + "grad_norm": 1.1638717464557773, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 30868 + }, + { + "epoch": 0.30869, + "grad_norm": 1.1658812798223301, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 30869 + }, + { + "epoch": 0.3087, + "grad_norm": 0.9124421812408606, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 30870 + }, + { + "epoch": 0.30871, + "grad_norm": 0.9245718639081646, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 30871 + }, + { + "epoch": 0.30872, + "grad_norm": 1.0407781148622681, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 30872 + }, + { + "epoch": 0.30873, + "grad_norm": 1.0210650990759353, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 30873 + }, + { + "epoch": 0.30874, + "grad_norm": 0.9241671278028997, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 30874 + }, + { + "epoch": 0.30875, + "grad_norm": 0.7949661151547252, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 30875 + }, + { + "epoch": 0.30876, + "grad_norm": 0.7715586240858586, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 30876 + }, + { + "epoch": 0.30877, + "grad_norm": 0.976657661635659, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 30877 + }, + { + "epoch": 0.30878, + "grad_norm": 1.1563548355269564, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 30878 + }, + { + "epoch": 0.30879, + "grad_norm": 0.8792173900225312, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 30879 + }, + { + "epoch": 0.3088, + "grad_norm": 0.9716157978466045, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30880 + }, + { + "epoch": 0.30881, + "grad_norm": 1.0947633268207957, + "learning_rate": 0.003, + "loss": 4.053, + "step": 30881 + }, + { + "epoch": 0.30882, + "grad_norm": 0.9817303479996596, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30882 + }, + { + "epoch": 0.30883, + "grad_norm": 1.081444475160503, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30883 + }, + { + "epoch": 0.30884, + "grad_norm": 0.867408073539162, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 30884 + }, + { + "epoch": 0.30885, + "grad_norm": 0.8747604860284365, + "learning_rate": 0.003, + "loss": 4.06, + "step": 30885 + }, + { + "epoch": 0.30886, + "grad_norm": 0.845118724240814, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 30886 + }, + { + "epoch": 0.30887, + "grad_norm": 0.7991888115608858, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 30887 + }, + { + "epoch": 0.30888, + "grad_norm": 0.7493543678681479, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 30888 + }, + { + "epoch": 0.30889, + "grad_norm": 0.7578823190588796, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 30889 + }, + { + "epoch": 0.3089, + "grad_norm": 0.8294631525847005, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 30890 + }, + { + "epoch": 0.30891, + "grad_norm": 0.7116140868217002, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 30891 + }, + { + "epoch": 0.30892, + "grad_norm": 0.654311368383866, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 30892 + }, + { + "epoch": 0.30893, + "grad_norm": 0.6774471319477997, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 30893 + }, + { + "epoch": 0.30894, + "grad_norm": 0.7047855297940542, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30894 + }, + { + "epoch": 0.30895, + "grad_norm": 0.7092946895890737, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 30895 + }, + { + "epoch": 0.30896, + "grad_norm": 0.684974846546833, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 30896 + }, + { + "epoch": 0.30897, + "grad_norm": 0.7568743249712528, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 30897 + }, + { + "epoch": 0.30898, + "grad_norm": 1.078842950755968, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 30898 + }, + { + "epoch": 0.30899, + "grad_norm": 1.2599499182714489, + "learning_rate": 0.003, + "loss": 3.9964, + "step": 30899 + }, + { + "epoch": 0.309, + "grad_norm": 0.778162779113319, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 30900 + }, + { + "epoch": 0.30901, + "grad_norm": 0.7736141724046166, + "learning_rate": 0.003, + "loss": 4.048, + "step": 30901 + }, + { + "epoch": 0.30902, + "grad_norm": 0.864063956862464, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 30902 + }, + { + "epoch": 0.30903, + "grad_norm": 0.8314649489435436, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 30903 + }, + { + "epoch": 0.30904, + "grad_norm": 0.8560067482178059, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 30904 + }, + { + "epoch": 0.30905, + "grad_norm": 0.9500970616273259, + "learning_rate": 0.003, + "loss": 4.058, + "step": 30905 + }, + { + "epoch": 0.30906, + "grad_norm": 1.0445573042332406, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 30906 + }, + { + "epoch": 0.30907, + "grad_norm": 1.0786049859555682, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 30907 + }, + { + "epoch": 0.30908, + "grad_norm": 0.9407802372914399, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 30908 + }, + { + "epoch": 0.30909, + "grad_norm": 0.861749779455875, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 30909 + }, + { + "epoch": 0.3091, + "grad_norm": 0.9265995937642065, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 30910 + }, + { + "epoch": 0.30911, + "grad_norm": 0.8328510738699823, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 30911 + }, + { + "epoch": 0.30912, + "grad_norm": 0.6866715758244357, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 30912 + }, + { + "epoch": 0.30913, + "grad_norm": 0.6281154588420825, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 30913 + }, + { + "epoch": 0.30914, + "grad_norm": 0.5716326698369153, + "learning_rate": 0.003, + "loss": 3.9884, + "step": 30914 + }, + { + "epoch": 0.30915, + "grad_norm": 0.6910570005670672, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 30915 + }, + { + "epoch": 0.30916, + "grad_norm": 0.8043744295696714, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 30916 + }, + { + "epoch": 0.30917, + "grad_norm": 0.8849616687699069, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 30917 + }, + { + "epoch": 0.30918, + "grad_norm": 0.9119514383093178, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 30918 + }, + { + "epoch": 0.30919, + "grad_norm": 0.8393485831475815, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 30919 + }, + { + "epoch": 0.3092, + "grad_norm": 0.8601155065363455, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 30920 + }, + { + "epoch": 0.30921, + "grad_norm": 0.8677062198822795, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 30921 + }, + { + "epoch": 0.30922, + "grad_norm": 0.9365731305587933, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 30922 + }, + { + "epoch": 0.30923, + "grad_norm": 1.0871017199482813, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 30923 + }, + { + "epoch": 0.30924, + "grad_norm": 0.8500562001363362, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 30924 + }, + { + "epoch": 0.30925, + "grad_norm": 0.9464031272829073, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 30925 + }, + { + "epoch": 0.30926, + "grad_norm": 1.037790182972451, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 30926 + }, + { + "epoch": 0.30927, + "grad_norm": 0.9953459733870422, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 30927 + }, + { + "epoch": 0.30928, + "grad_norm": 0.9877682036439628, + "learning_rate": 0.003, + "loss": 4.048, + "step": 30928 + }, + { + "epoch": 0.30929, + "grad_norm": 0.8721875420586115, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 30929 + }, + { + "epoch": 0.3093, + "grad_norm": 0.7987285598352868, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 30930 + }, + { + "epoch": 0.30931, + "grad_norm": 0.8278858765422459, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 30931 + }, + { + "epoch": 0.30932, + "grad_norm": 0.8786533316448552, + "learning_rate": 0.003, + "loss": 4.038, + "step": 30932 + }, + { + "epoch": 0.30933, + "grad_norm": 0.977061469030929, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 30933 + }, + { + "epoch": 0.30934, + "grad_norm": 1.0469392450688666, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 30934 + }, + { + "epoch": 0.30935, + "grad_norm": 1.0680501192410035, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 30935 + }, + { + "epoch": 0.30936, + "grad_norm": 0.7741441716279966, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 30936 + }, + { + "epoch": 0.30937, + "grad_norm": 0.6731085510291004, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 30937 + }, + { + "epoch": 0.30938, + "grad_norm": 0.7686498915267687, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 30938 + }, + { + "epoch": 0.30939, + "grad_norm": 0.8081173299818549, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 30939 + }, + { + "epoch": 0.3094, + "grad_norm": 0.7104581117488192, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 30940 + }, + { + "epoch": 0.30941, + "grad_norm": 0.6948617356650509, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 30941 + }, + { + "epoch": 0.30942, + "grad_norm": 0.6962096529647628, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 30942 + }, + { + "epoch": 0.30943, + "grad_norm": 0.6740869619255537, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 30943 + }, + { + "epoch": 0.30944, + "grad_norm": 0.6707998718170635, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 30944 + }, + { + "epoch": 0.30945, + "grad_norm": 0.7060922173281504, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 30945 + }, + { + "epoch": 0.30946, + "grad_norm": 0.7144997641833017, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 30946 + }, + { + "epoch": 0.30947, + "grad_norm": 0.7202735774920945, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 30947 + }, + { + "epoch": 0.30948, + "grad_norm": 0.8363543652847316, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 30948 + }, + { + "epoch": 0.30949, + "grad_norm": 1.0279347679092345, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 30949 + }, + { + "epoch": 0.3095, + "grad_norm": 1.153782803661401, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 30950 + }, + { + "epoch": 0.30951, + "grad_norm": 0.6971226568892515, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 30951 + }, + { + "epoch": 0.30952, + "grad_norm": 0.717740490118408, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 30952 + }, + { + "epoch": 0.30953, + "grad_norm": 0.8717125098624835, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 30953 + }, + { + "epoch": 0.30954, + "grad_norm": 0.9389074051286843, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 30954 + }, + { + "epoch": 0.30955, + "grad_norm": 0.8891870187625002, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 30955 + }, + { + "epoch": 0.30956, + "grad_norm": 0.8001640179460102, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 30956 + }, + { + "epoch": 0.30957, + "grad_norm": 0.8525564035168054, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 30957 + }, + { + "epoch": 0.30958, + "grad_norm": 0.8210509522842179, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 30958 + }, + { + "epoch": 0.30959, + "grad_norm": 0.6982756881635637, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 30959 + }, + { + "epoch": 0.3096, + "grad_norm": 0.6678187264713864, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 30960 + }, + { + "epoch": 0.30961, + "grad_norm": 0.7175310383546256, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 30961 + }, + { + "epoch": 0.30962, + "grad_norm": 0.8758024362543068, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 30962 + }, + { + "epoch": 0.30963, + "grad_norm": 1.055418853626481, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30963 + }, + { + "epoch": 0.30964, + "grad_norm": 1.0413565293167013, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 30964 + }, + { + "epoch": 0.30965, + "grad_norm": 1.161427818455989, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 30965 + }, + { + "epoch": 0.30966, + "grad_norm": 0.820736439227036, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 30966 + }, + { + "epoch": 0.30967, + "grad_norm": 0.6556348644330107, + "learning_rate": 0.003, + "loss": 4.022, + "step": 30967 + }, + { + "epoch": 0.30968, + "grad_norm": 0.682304018705786, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 30968 + }, + { + "epoch": 0.30969, + "grad_norm": 0.7048564642048618, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 30969 + }, + { + "epoch": 0.3097, + "grad_norm": 0.745457284050935, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 30970 + }, + { + "epoch": 0.30971, + "grad_norm": 0.7828806572321504, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 30971 + }, + { + "epoch": 0.30972, + "grad_norm": 0.9286712905943237, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 30972 + }, + { + "epoch": 0.30973, + "grad_norm": 1.0864493218562645, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 30973 + }, + { + "epoch": 0.30974, + "grad_norm": 0.9710078484457113, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 30974 + }, + { + "epoch": 0.30975, + "grad_norm": 1.1108268964741557, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 30975 + }, + { + "epoch": 0.30976, + "grad_norm": 1.091960849735545, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 30976 + }, + { + "epoch": 0.30977, + "grad_norm": 1.006376369751325, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 30977 + }, + { + "epoch": 0.30978, + "grad_norm": 0.8294378277833414, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 30978 + }, + { + "epoch": 0.30979, + "grad_norm": 0.6829509783813068, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 30979 + }, + { + "epoch": 0.3098, + "grad_norm": 0.7442709759447237, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 30980 + }, + { + "epoch": 0.30981, + "grad_norm": 0.885562801073043, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 30981 + }, + { + "epoch": 0.30982, + "grad_norm": 0.9573031245840209, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 30982 + }, + { + "epoch": 0.30983, + "grad_norm": 1.026908261272219, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 30983 + }, + { + "epoch": 0.30984, + "grad_norm": 0.9462754429563887, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 30984 + }, + { + "epoch": 0.30985, + "grad_norm": 0.9680137468194956, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 30985 + }, + { + "epoch": 0.30986, + "grad_norm": 1.0837831790425496, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 30986 + }, + { + "epoch": 0.30987, + "grad_norm": 0.926782225671113, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 30987 + }, + { + "epoch": 0.30988, + "grad_norm": 0.9847682386833083, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 30988 + }, + { + "epoch": 0.30989, + "grad_norm": 1.0284223130794317, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 30989 + }, + { + "epoch": 0.3099, + "grad_norm": 0.8803483620370607, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 30990 + }, + { + "epoch": 0.30991, + "grad_norm": 0.8339326251242171, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 30991 + }, + { + "epoch": 0.30992, + "grad_norm": 0.7892997591454144, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 30992 + }, + { + "epoch": 0.30993, + "grad_norm": 0.811248594174698, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 30993 + }, + { + "epoch": 0.30994, + "grad_norm": 0.8062876234992918, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 30994 + }, + { + "epoch": 0.30995, + "grad_norm": 0.7518451160126378, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 30995 + }, + { + "epoch": 0.30996, + "grad_norm": 0.6559777697465964, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 30996 + }, + { + "epoch": 0.30997, + "grad_norm": 0.605975918738231, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 30997 + }, + { + "epoch": 0.30998, + "grad_norm": 0.671370272773477, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 30998 + }, + { + "epoch": 0.30999, + "grad_norm": 0.6709486752236141, + "learning_rate": 0.003, + "loss": 4.033, + "step": 30999 + }, + { + "epoch": 0.31, + "grad_norm": 0.7338335374064964, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31000 + }, + { + "epoch": 0.31001, + "grad_norm": 0.835707778809499, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 31001 + }, + { + "epoch": 0.31002, + "grad_norm": 1.0131585189066932, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 31002 + }, + { + "epoch": 0.31003, + "grad_norm": 1.2302812820593076, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31003 + }, + { + "epoch": 0.31004, + "grad_norm": 0.7293395447434757, + "learning_rate": 0.003, + "loss": 4.033, + "step": 31004 + }, + { + "epoch": 0.31005, + "grad_norm": 0.6551589996734186, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 31005 + }, + { + "epoch": 0.31006, + "grad_norm": 0.6339743227078345, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 31006 + }, + { + "epoch": 0.31007, + "grad_norm": 0.596116908478433, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 31007 + }, + { + "epoch": 0.31008, + "grad_norm": 0.5515872747727149, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 31008 + }, + { + "epoch": 0.31009, + "grad_norm": 0.5287433989556892, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 31009 + }, + { + "epoch": 0.3101, + "grad_norm": 0.5746812971715765, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 31010 + }, + { + "epoch": 0.31011, + "grad_norm": 0.7118937154303365, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 31011 + }, + { + "epoch": 0.31012, + "grad_norm": 0.8162206969463668, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 31012 + }, + { + "epoch": 0.31013, + "grad_norm": 0.7548004345470204, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31013 + }, + { + "epoch": 0.31014, + "grad_norm": 0.6830373572298705, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 31014 + }, + { + "epoch": 0.31015, + "grad_norm": 0.7012547541580102, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 31015 + }, + { + "epoch": 0.31016, + "grad_norm": 0.7553299293266569, + "learning_rate": 0.003, + "loss": 4.028, + "step": 31016 + }, + { + "epoch": 0.31017, + "grad_norm": 0.8339908407659491, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 31017 + }, + { + "epoch": 0.31018, + "grad_norm": 1.0224020663266091, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 31018 + }, + { + "epoch": 0.31019, + "grad_norm": 1.0961216129609759, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 31019 + }, + { + "epoch": 0.3102, + "grad_norm": 0.932002824376987, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31020 + }, + { + "epoch": 0.31021, + "grad_norm": 0.9687302484483488, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31021 + }, + { + "epoch": 0.31022, + "grad_norm": 0.9552239547163932, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 31022 + }, + { + "epoch": 0.31023, + "grad_norm": 1.257045589159289, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 31023 + }, + { + "epoch": 0.31024, + "grad_norm": 0.9853366323513992, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 31024 + }, + { + "epoch": 0.31025, + "grad_norm": 0.9317811307956058, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31025 + }, + { + "epoch": 0.31026, + "grad_norm": 0.9484528903511715, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 31026 + }, + { + "epoch": 0.31027, + "grad_norm": 1.0703352376046482, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 31027 + }, + { + "epoch": 0.31028, + "grad_norm": 1.0817165846514012, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 31028 + }, + { + "epoch": 0.31029, + "grad_norm": 0.8139267355162888, + "learning_rate": 0.003, + "loss": 4.04, + "step": 31029 + }, + { + "epoch": 0.3103, + "grad_norm": 0.7798121267458615, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 31030 + }, + { + "epoch": 0.31031, + "grad_norm": 0.8224564797754106, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 31031 + }, + { + "epoch": 0.31032, + "grad_norm": 0.7570758717756095, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 31032 + }, + { + "epoch": 0.31033, + "grad_norm": 0.7576721999266691, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 31033 + }, + { + "epoch": 0.31034, + "grad_norm": 0.8064131656169007, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31034 + }, + { + "epoch": 0.31035, + "grad_norm": 0.9081472479934298, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 31035 + }, + { + "epoch": 0.31036, + "grad_norm": 1.0124863061071248, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 31036 + }, + { + "epoch": 0.31037, + "grad_norm": 1.2004888205712148, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 31037 + }, + { + "epoch": 0.31038, + "grad_norm": 0.885908380875526, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 31038 + }, + { + "epoch": 0.31039, + "grad_norm": 0.7946190508781668, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31039 + }, + { + "epoch": 0.3104, + "grad_norm": 0.8398811895755552, + "learning_rate": 0.003, + "loss": 4.046, + "step": 31040 + }, + { + "epoch": 0.31041, + "grad_norm": 0.8079893266326794, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 31041 + }, + { + "epoch": 0.31042, + "grad_norm": 0.7912827310592263, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 31042 + }, + { + "epoch": 0.31043, + "grad_norm": 0.9084398130608581, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 31043 + }, + { + "epoch": 0.31044, + "grad_norm": 0.947306554734898, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 31044 + }, + { + "epoch": 0.31045, + "grad_norm": 0.8358163981934753, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31045 + }, + { + "epoch": 0.31046, + "grad_norm": 0.7817003343741666, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 31046 + }, + { + "epoch": 0.31047, + "grad_norm": 0.6613145028427724, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 31047 + }, + { + "epoch": 0.31048, + "grad_norm": 0.6936189603118382, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 31048 + }, + { + "epoch": 0.31049, + "grad_norm": 0.7590241324708386, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31049 + }, + { + "epoch": 0.3105, + "grad_norm": 0.7132149902605485, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 31050 + }, + { + "epoch": 0.31051, + "grad_norm": 0.7856538510872045, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 31051 + }, + { + "epoch": 0.31052, + "grad_norm": 1.0454499292239772, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 31052 + }, + { + "epoch": 0.31053, + "grad_norm": 1.216803873518095, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 31053 + }, + { + "epoch": 0.31054, + "grad_norm": 0.686016305284535, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 31054 + }, + { + "epoch": 0.31055, + "grad_norm": 0.649826183521942, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 31055 + }, + { + "epoch": 0.31056, + "grad_norm": 0.6965922841821345, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 31056 + }, + { + "epoch": 0.31057, + "grad_norm": 0.6893290408468875, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 31057 + }, + { + "epoch": 0.31058, + "grad_norm": 0.689710327966547, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 31058 + }, + { + "epoch": 0.31059, + "grad_norm": 0.7078059697593403, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 31059 + }, + { + "epoch": 0.3106, + "grad_norm": 0.7266861777177281, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 31060 + }, + { + "epoch": 0.31061, + "grad_norm": 1.014360325459925, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 31061 + }, + { + "epoch": 0.31062, + "grad_norm": 1.2100780791637273, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 31062 + }, + { + "epoch": 0.31063, + "grad_norm": 0.7814388638722781, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 31063 + }, + { + "epoch": 0.31064, + "grad_norm": 0.7987441523062871, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 31064 + }, + { + "epoch": 0.31065, + "grad_norm": 0.8063535819177563, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 31065 + }, + { + "epoch": 0.31066, + "grad_norm": 0.8790037433793394, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 31066 + }, + { + "epoch": 0.31067, + "grad_norm": 1.0004837095558254, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 31067 + }, + { + "epoch": 0.31068, + "grad_norm": 0.9883808148319644, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 31068 + }, + { + "epoch": 0.31069, + "grad_norm": 0.8481105701572573, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 31069 + }, + { + "epoch": 0.3107, + "grad_norm": 0.8537340899326175, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 31070 + }, + { + "epoch": 0.31071, + "grad_norm": 0.8679776259807989, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 31071 + }, + { + "epoch": 0.31072, + "grad_norm": 0.9282566013463376, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 31072 + }, + { + "epoch": 0.31073, + "grad_norm": 0.9032554650560465, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 31073 + }, + { + "epoch": 0.31074, + "grad_norm": 0.8712692842079965, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31074 + }, + { + "epoch": 0.31075, + "grad_norm": 0.9278415717566548, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 31075 + }, + { + "epoch": 0.31076, + "grad_norm": 1.0005566571876519, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 31076 + }, + { + "epoch": 0.31077, + "grad_norm": 1.157025356897531, + "learning_rate": 0.003, + "loss": 4.059, + "step": 31077 + }, + { + "epoch": 0.31078, + "grad_norm": 0.9825780881863968, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 31078 + }, + { + "epoch": 0.31079, + "grad_norm": 0.9130960374629797, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 31079 + }, + { + "epoch": 0.3108, + "grad_norm": 0.7799005755232936, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 31080 + }, + { + "epoch": 0.31081, + "grad_norm": 0.8526306648918017, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 31081 + }, + { + "epoch": 0.31082, + "grad_norm": 1.0947067777263588, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 31082 + }, + { + "epoch": 0.31083, + "grad_norm": 1.1673920614488493, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 31083 + }, + { + "epoch": 0.31084, + "grad_norm": 0.7377706791397789, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 31084 + }, + { + "epoch": 0.31085, + "grad_norm": 0.6411479960245204, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 31085 + }, + { + "epoch": 0.31086, + "grad_norm": 0.6203767068765476, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 31086 + }, + { + "epoch": 0.31087, + "grad_norm": 0.6627128439228822, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 31087 + }, + { + "epoch": 0.31088, + "grad_norm": 0.6324818538763578, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 31088 + }, + { + "epoch": 0.31089, + "grad_norm": 0.5448869350551743, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 31089 + }, + { + "epoch": 0.3109, + "grad_norm": 0.4964369232111863, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31090 + }, + { + "epoch": 0.31091, + "grad_norm": 0.4623353225491741, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 31091 + }, + { + "epoch": 0.31092, + "grad_norm": 0.5268667654976494, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31092 + }, + { + "epoch": 0.31093, + "grad_norm": 0.5374996747579416, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 31093 + }, + { + "epoch": 0.31094, + "grad_norm": 0.5861042164616919, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31094 + }, + { + "epoch": 0.31095, + "grad_norm": 0.5795076824572069, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 31095 + }, + { + "epoch": 0.31096, + "grad_norm": 0.5924979126186535, + "learning_rate": 0.003, + "loss": 4.018, + "step": 31096 + }, + { + "epoch": 0.31097, + "grad_norm": 0.6394316032476024, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 31097 + }, + { + "epoch": 0.31098, + "grad_norm": 0.7502514250773625, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 31098 + }, + { + "epoch": 0.31099, + "grad_norm": 1.203459635573585, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 31099 + }, + { + "epoch": 0.311, + "grad_norm": 1.3422316334643631, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 31100 + }, + { + "epoch": 0.31101, + "grad_norm": 0.6338723194433382, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 31101 + }, + { + "epoch": 0.31102, + "grad_norm": 0.7626809188218358, + "learning_rate": 0.003, + "loss": 4.003, + "step": 31102 + }, + { + "epoch": 0.31103, + "grad_norm": 0.8721442591783304, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 31103 + }, + { + "epoch": 0.31104, + "grad_norm": 0.9059819078573021, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 31104 + }, + { + "epoch": 0.31105, + "grad_norm": 0.8776106723378734, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 31105 + }, + { + "epoch": 0.31106, + "grad_norm": 0.8368024936953757, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 31106 + }, + { + "epoch": 0.31107, + "grad_norm": 0.7350101081747782, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 31107 + }, + { + "epoch": 0.31108, + "grad_norm": 0.7820924972807999, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 31108 + }, + { + "epoch": 0.31109, + "grad_norm": 0.9627609301477671, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 31109 + }, + { + "epoch": 0.3111, + "grad_norm": 1.1433743004983135, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 31110 + }, + { + "epoch": 0.31111, + "grad_norm": 1.102115815977687, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 31111 + }, + { + "epoch": 0.31112, + "grad_norm": 1.163092415598571, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 31112 + }, + { + "epoch": 0.31113, + "grad_norm": 0.8018394827470376, + "learning_rate": 0.003, + "loss": 4.048, + "step": 31113 + }, + { + "epoch": 0.31114, + "grad_norm": 0.8417857953568431, + "learning_rate": 0.003, + "loss": 4.019, + "step": 31114 + }, + { + "epoch": 0.31115, + "grad_norm": 1.006621112755898, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 31115 + }, + { + "epoch": 0.31116, + "grad_norm": 1.0530222688174324, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31116 + }, + { + "epoch": 0.31117, + "grad_norm": 0.8205619452524133, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 31117 + }, + { + "epoch": 0.31118, + "grad_norm": 0.9097267082533321, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 31118 + }, + { + "epoch": 0.31119, + "grad_norm": 0.835064870487331, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 31119 + }, + { + "epoch": 0.3112, + "grad_norm": 0.8352121026036825, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 31120 + }, + { + "epoch": 0.31121, + "grad_norm": 0.9050982692869832, + "learning_rate": 0.003, + "loss": 4.055, + "step": 31121 + }, + { + "epoch": 0.31122, + "grad_norm": 1.0397825182148517, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 31122 + }, + { + "epoch": 0.31123, + "grad_norm": 1.0114958501051587, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 31123 + }, + { + "epoch": 0.31124, + "grad_norm": 1.0248608227633966, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 31124 + }, + { + "epoch": 0.31125, + "grad_norm": 0.9472824474811777, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 31125 + }, + { + "epoch": 0.31126, + "grad_norm": 0.7461692781053041, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 31126 + }, + { + "epoch": 0.31127, + "grad_norm": 0.6113110203431844, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 31127 + }, + { + "epoch": 0.31128, + "grad_norm": 0.607752402401117, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 31128 + }, + { + "epoch": 0.31129, + "grad_norm": 0.6592647692413477, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 31129 + }, + { + "epoch": 0.3113, + "grad_norm": 0.7907267354504305, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 31130 + }, + { + "epoch": 0.31131, + "grad_norm": 0.8905261796330373, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 31131 + }, + { + "epoch": 0.31132, + "grad_norm": 0.996129675215651, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 31132 + }, + { + "epoch": 0.31133, + "grad_norm": 1.1016336688174444, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 31133 + }, + { + "epoch": 0.31134, + "grad_norm": 0.9922264625070881, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 31134 + }, + { + "epoch": 0.31135, + "grad_norm": 1.009947512332734, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 31135 + }, + { + "epoch": 0.31136, + "grad_norm": 0.8769436963227613, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 31136 + }, + { + "epoch": 0.31137, + "grad_norm": 0.7014712809017916, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 31137 + }, + { + "epoch": 0.31138, + "grad_norm": 0.702218812811567, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 31138 + }, + { + "epoch": 0.31139, + "grad_norm": 0.7439306043055479, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 31139 + }, + { + "epoch": 0.3114, + "grad_norm": 0.9224585574994658, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 31140 + }, + { + "epoch": 0.31141, + "grad_norm": 1.0446365990854478, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 31141 + }, + { + "epoch": 0.31142, + "grad_norm": 1.105236990680016, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 31142 + }, + { + "epoch": 0.31143, + "grad_norm": 1.0396398340126907, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 31143 + }, + { + "epoch": 0.31144, + "grad_norm": 0.8302062274489633, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 31144 + }, + { + "epoch": 0.31145, + "grad_norm": 0.7613756825808332, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 31145 + }, + { + "epoch": 0.31146, + "grad_norm": 0.8546883548982939, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 31146 + }, + { + "epoch": 0.31147, + "grad_norm": 1.0161584344289527, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 31147 + }, + { + "epoch": 0.31148, + "grad_norm": 1.083246619995139, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 31148 + }, + { + "epoch": 0.31149, + "grad_norm": 0.9901365152875319, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 31149 + }, + { + "epoch": 0.3115, + "grad_norm": 1.1206056175869872, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 31150 + }, + { + "epoch": 0.31151, + "grad_norm": 0.8388386088619617, + "learning_rate": 0.003, + "loss": 4.042, + "step": 31151 + }, + { + "epoch": 0.31152, + "grad_norm": 0.8404325325388726, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31152 + }, + { + "epoch": 0.31153, + "grad_norm": 0.7588165159526111, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 31153 + }, + { + "epoch": 0.31154, + "grad_norm": 0.7769497270635726, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 31154 + }, + { + "epoch": 0.31155, + "grad_norm": 0.8877061047103989, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 31155 + }, + { + "epoch": 0.31156, + "grad_norm": 0.908931265495274, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31156 + }, + { + "epoch": 0.31157, + "grad_norm": 1.121312814408468, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 31157 + }, + { + "epoch": 0.31158, + "grad_norm": 1.0555540397999081, + "learning_rate": 0.003, + "loss": 4.042, + "step": 31158 + }, + { + "epoch": 0.31159, + "grad_norm": 0.8203079032313007, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 31159 + }, + { + "epoch": 0.3116, + "grad_norm": 0.8161678281528759, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 31160 + }, + { + "epoch": 0.31161, + "grad_norm": 0.8143157120474863, + "learning_rate": 0.003, + "loss": 4.023, + "step": 31161 + }, + { + "epoch": 0.31162, + "grad_norm": 0.7004886376149924, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 31162 + }, + { + "epoch": 0.31163, + "grad_norm": 0.7083221034778194, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31163 + }, + { + "epoch": 0.31164, + "grad_norm": 0.6261289049636733, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 31164 + }, + { + "epoch": 0.31165, + "grad_norm": 0.6291110112905594, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 31165 + }, + { + "epoch": 0.31166, + "grad_norm": 0.7116744880511073, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 31166 + }, + { + "epoch": 0.31167, + "grad_norm": 0.8192851469984462, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31167 + }, + { + "epoch": 0.31168, + "grad_norm": 0.947055552564821, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 31168 + }, + { + "epoch": 0.31169, + "grad_norm": 0.9857294685078094, + "learning_rate": 0.003, + "loss": 4.07, + "step": 31169 + }, + { + "epoch": 0.3117, + "grad_norm": 0.8594826753007103, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31170 + }, + { + "epoch": 0.31171, + "grad_norm": 0.6354570199502176, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 31171 + }, + { + "epoch": 0.31172, + "grad_norm": 0.6753245339627981, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 31172 + }, + { + "epoch": 0.31173, + "grad_norm": 0.6433501948393092, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 31173 + }, + { + "epoch": 0.31174, + "grad_norm": 0.5653324812738522, + "learning_rate": 0.003, + "loss": 4.03, + "step": 31174 + }, + { + "epoch": 0.31175, + "grad_norm": 0.6040435639652715, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31175 + }, + { + "epoch": 0.31176, + "grad_norm": 0.6337658695834149, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 31176 + }, + { + "epoch": 0.31177, + "grad_norm": 0.667875234211774, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31177 + }, + { + "epoch": 0.31178, + "grad_norm": 0.7578344412188126, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 31178 + }, + { + "epoch": 0.31179, + "grad_norm": 1.0074489125803223, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 31179 + }, + { + "epoch": 0.3118, + "grad_norm": 1.2710010078722682, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 31180 + }, + { + "epoch": 0.31181, + "grad_norm": 0.8532903662863325, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 31181 + }, + { + "epoch": 0.31182, + "grad_norm": 0.6950093586071396, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 31182 + }, + { + "epoch": 0.31183, + "grad_norm": 0.7659615971190149, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 31183 + }, + { + "epoch": 0.31184, + "grad_norm": 0.7709934089389846, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 31184 + }, + { + "epoch": 0.31185, + "grad_norm": 0.8931813774604814, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 31185 + }, + { + "epoch": 0.31186, + "grad_norm": 0.9024451847565017, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 31186 + }, + { + "epoch": 0.31187, + "grad_norm": 1.0764724245940203, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 31187 + }, + { + "epoch": 0.31188, + "grad_norm": 1.0566140651124067, + "learning_rate": 0.003, + "loss": 3.9906, + "step": 31188 + }, + { + "epoch": 0.31189, + "grad_norm": 0.94519639485893, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 31189 + }, + { + "epoch": 0.3119, + "grad_norm": 0.9445190421239597, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 31190 + }, + { + "epoch": 0.31191, + "grad_norm": 0.8960995302646945, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 31191 + }, + { + "epoch": 0.31192, + "grad_norm": 0.9604325377420415, + "learning_rate": 0.003, + "loss": 4.044, + "step": 31192 + }, + { + "epoch": 0.31193, + "grad_norm": 0.853307273334864, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 31193 + }, + { + "epoch": 0.31194, + "grad_norm": 0.8203948862762644, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 31194 + }, + { + "epoch": 0.31195, + "grad_norm": 0.8259406214037516, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 31195 + }, + { + "epoch": 0.31196, + "grad_norm": 0.8213421034820623, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 31196 + }, + { + "epoch": 0.31197, + "grad_norm": 0.8222416138652324, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 31197 + }, + { + "epoch": 0.31198, + "grad_norm": 0.7855947038427131, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 31198 + }, + { + "epoch": 0.31199, + "grad_norm": 0.7799791970512702, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 31199 + }, + { + "epoch": 0.312, + "grad_norm": 0.7222829207529543, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 31200 + }, + { + "epoch": 0.31201, + "grad_norm": 0.7044800552835228, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 31201 + }, + { + "epoch": 0.31202, + "grad_norm": 0.8104473490382206, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 31202 + }, + { + "epoch": 0.31203, + "grad_norm": 0.9538999567272132, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 31203 + }, + { + "epoch": 0.31204, + "grad_norm": 1.1508904934712019, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 31204 + }, + { + "epoch": 0.31205, + "grad_norm": 0.7259607346237986, + "learning_rate": 0.003, + "loss": 4.038, + "step": 31205 + }, + { + "epoch": 0.31206, + "grad_norm": 0.5886160340174087, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 31206 + }, + { + "epoch": 0.31207, + "grad_norm": 0.6046105316961712, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 31207 + }, + { + "epoch": 0.31208, + "grad_norm": 0.7286513798323639, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 31208 + }, + { + "epoch": 0.31209, + "grad_norm": 0.8767145849286827, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 31209 + }, + { + "epoch": 0.3121, + "grad_norm": 0.9133316611541883, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31210 + }, + { + "epoch": 0.31211, + "grad_norm": 0.9176336056733225, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 31211 + }, + { + "epoch": 0.31212, + "grad_norm": 1.143458434556926, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 31212 + }, + { + "epoch": 0.31213, + "grad_norm": 1.0104344144223345, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 31213 + }, + { + "epoch": 0.31214, + "grad_norm": 0.8941844385027341, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 31214 + }, + { + "epoch": 0.31215, + "grad_norm": 0.9712433776199452, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 31215 + }, + { + "epoch": 0.31216, + "grad_norm": 0.9931854447349524, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 31216 + }, + { + "epoch": 0.31217, + "grad_norm": 1.0769058390328516, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31217 + }, + { + "epoch": 0.31218, + "grad_norm": 0.967013220301073, + "learning_rate": 0.003, + "loss": 4.076, + "step": 31218 + }, + { + "epoch": 0.31219, + "grad_norm": 1.0516868253101601, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 31219 + }, + { + "epoch": 0.3122, + "grad_norm": 0.9577932148076674, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 31220 + }, + { + "epoch": 0.31221, + "grad_norm": 0.8959379395188523, + "learning_rate": 0.003, + "loss": 4.055, + "step": 31221 + }, + { + "epoch": 0.31222, + "grad_norm": 0.8763575877702239, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 31222 + }, + { + "epoch": 0.31223, + "grad_norm": 0.8000503880564412, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 31223 + }, + { + "epoch": 0.31224, + "grad_norm": 0.9268896245523388, + "learning_rate": 0.003, + "loss": 4.059, + "step": 31224 + }, + { + "epoch": 0.31225, + "grad_norm": 0.9368815704520915, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 31225 + }, + { + "epoch": 0.31226, + "grad_norm": 0.946862214047098, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 31226 + }, + { + "epoch": 0.31227, + "grad_norm": 0.8608209631936689, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 31227 + }, + { + "epoch": 0.31228, + "grad_norm": 0.7189427748260587, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 31228 + }, + { + "epoch": 0.31229, + "grad_norm": 0.7235214968231489, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 31229 + }, + { + "epoch": 0.3123, + "grad_norm": 0.6878232450952642, + "learning_rate": 0.003, + "loss": 4.0903, + "step": 31230 + }, + { + "epoch": 0.31231, + "grad_norm": 0.8090304665472239, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 31231 + }, + { + "epoch": 0.31232, + "grad_norm": 0.9297765536479975, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 31232 + }, + { + "epoch": 0.31233, + "grad_norm": 0.9699722151862058, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 31233 + }, + { + "epoch": 0.31234, + "grad_norm": 1.0025669497327474, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 31234 + }, + { + "epoch": 0.31235, + "grad_norm": 1.1092291259320062, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 31235 + }, + { + "epoch": 0.31236, + "grad_norm": 0.8202534226358841, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31236 + }, + { + "epoch": 0.31237, + "grad_norm": 0.7975553984533527, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 31237 + }, + { + "epoch": 0.31238, + "grad_norm": 0.8256148091172824, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 31238 + }, + { + "epoch": 0.31239, + "grad_norm": 0.7484664067909651, + "learning_rate": 0.003, + "loss": 3.9989, + "step": 31239 + }, + { + "epoch": 0.3124, + "grad_norm": 0.581875811854712, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 31240 + }, + { + "epoch": 0.31241, + "grad_norm": 0.4955379386464406, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 31241 + }, + { + "epoch": 0.31242, + "grad_norm": 0.5439994103430057, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 31242 + }, + { + "epoch": 0.31243, + "grad_norm": 0.5613986779769449, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 31243 + }, + { + "epoch": 0.31244, + "grad_norm": 0.5636587556106651, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 31244 + }, + { + "epoch": 0.31245, + "grad_norm": 0.6373450148437212, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 31245 + }, + { + "epoch": 0.31246, + "grad_norm": 0.6811196392484467, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 31246 + }, + { + "epoch": 0.31247, + "grad_norm": 0.8775198527097631, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 31247 + }, + { + "epoch": 0.31248, + "grad_norm": 1.1292727150103001, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 31248 + }, + { + "epoch": 0.31249, + "grad_norm": 1.06569438341617, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 31249 + }, + { + "epoch": 0.3125, + "grad_norm": 0.9780876224448649, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 31250 + }, + { + "epoch": 0.31251, + "grad_norm": 0.8691640580063306, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31251 + }, + { + "epoch": 0.31252, + "grad_norm": 0.9031264123239988, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31252 + }, + { + "epoch": 0.31253, + "grad_norm": 0.8440934592328118, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 31253 + }, + { + "epoch": 0.31254, + "grad_norm": 0.9136346820374134, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 31254 + }, + { + "epoch": 0.31255, + "grad_norm": 0.9776081752405139, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 31255 + }, + { + "epoch": 0.31256, + "grad_norm": 1.013687280085767, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 31256 + }, + { + "epoch": 0.31257, + "grad_norm": 0.8211032359131123, + "learning_rate": 0.003, + "loss": 4.061, + "step": 31257 + }, + { + "epoch": 0.31258, + "grad_norm": 0.8214192351733371, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 31258 + }, + { + "epoch": 0.31259, + "grad_norm": 0.7452837584323339, + "learning_rate": 0.003, + "loss": 4.0738, + "step": 31259 + }, + { + "epoch": 0.3126, + "grad_norm": 0.7750382116581828, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 31260 + }, + { + "epoch": 0.31261, + "grad_norm": 0.9493108584886132, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 31261 + }, + { + "epoch": 0.31262, + "grad_norm": 1.320170341017054, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 31262 + }, + { + "epoch": 0.31263, + "grad_norm": 0.8257790432990456, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 31263 + }, + { + "epoch": 0.31264, + "grad_norm": 0.9215070394140378, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 31264 + }, + { + "epoch": 0.31265, + "grad_norm": 0.9617440958599551, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31265 + }, + { + "epoch": 0.31266, + "grad_norm": 0.9612800516884812, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 31266 + }, + { + "epoch": 0.31267, + "grad_norm": 0.8592863979003663, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 31267 + }, + { + "epoch": 0.31268, + "grad_norm": 0.7474252747736044, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 31268 + }, + { + "epoch": 0.31269, + "grad_norm": 0.6926686736824768, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 31269 + }, + { + "epoch": 0.3127, + "grad_norm": 0.6648553849500414, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 31270 + }, + { + "epoch": 0.31271, + "grad_norm": 0.6503654348039692, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 31271 + }, + { + "epoch": 0.31272, + "grad_norm": 0.6955850998442819, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 31272 + }, + { + "epoch": 0.31273, + "grad_norm": 0.9234904276314626, + "learning_rate": 0.003, + "loss": 4.052, + "step": 31273 + }, + { + "epoch": 0.31274, + "grad_norm": 1.423790160547063, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 31274 + }, + { + "epoch": 0.31275, + "grad_norm": 0.5805242330291865, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 31275 + }, + { + "epoch": 0.31276, + "grad_norm": 0.7735977638447247, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 31276 + }, + { + "epoch": 0.31277, + "grad_norm": 0.9734044611289877, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 31277 + }, + { + "epoch": 0.31278, + "grad_norm": 0.9854708059823793, + "learning_rate": 0.003, + "loss": 4.043, + "step": 31278 + }, + { + "epoch": 0.31279, + "grad_norm": 0.951508274795993, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 31279 + }, + { + "epoch": 0.3128, + "grad_norm": 0.7998936112835342, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 31280 + }, + { + "epoch": 0.31281, + "grad_norm": 0.6284474638940004, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 31281 + }, + { + "epoch": 0.31282, + "grad_norm": 0.576370437160648, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31282 + }, + { + "epoch": 0.31283, + "grad_norm": 0.4850312343434524, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 31283 + }, + { + "epoch": 0.31284, + "grad_norm": 0.5172173956555511, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 31284 + }, + { + "epoch": 0.31285, + "grad_norm": 0.5685522769723065, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 31285 + }, + { + "epoch": 0.31286, + "grad_norm": 0.5742093413699484, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 31286 + }, + { + "epoch": 0.31287, + "grad_norm": 0.6084592366537805, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 31287 + }, + { + "epoch": 0.31288, + "grad_norm": 0.7479579936070707, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 31288 + }, + { + "epoch": 0.31289, + "grad_norm": 1.1496538538413394, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 31289 + }, + { + "epoch": 0.3129, + "grad_norm": 1.0361776798490294, + "learning_rate": 0.003, + "loss": 4.049, + "step": 31290 + }, + { + "epoch": 0.31291, + "grad_norm": 0.8864992519658409, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 31291 + }, + { + "epoch": 0.31292, + "grad_norm": 0.9252412217776611, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 31292 + }, + { + "epoch": 0.31293, + "grad_norm": 0.9590487822230165, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 31293 + }, + { + "epoch": 0.31294, + "grad_norm": 1.057522904484121, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 31294 + }, + { + "epoch": 0.31295, + "grad_norm": 0.9740089651848963, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 31295 + }, + { + "epoch": 0.31296, + "grad_norm": 1.0988298714726008, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 31296 + }, + { + "epoch": 0.31297, + "grad_norm": 0.968915533076145, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 31297 + }, + { + "epoch": 0.31298, + "grad_norm": 1.0196908600993277, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 31298 + }, + { + "epoch": 0.31299, + "grad_norm": 1.0330180172036938, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 31299 + }, + { + "epoch": 0.313, + "grad_norm": 1.189443203185933, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31300 + }, + { + "epoch": 0.31301, + "grad_norm": 0.7945486445307969, + "learning_rate": 0.003, + "loss": 4.049, + "step": 31301 + }, + { + "epoch": 0.31302, + "grad_norm": 0.775141095416031, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 31302 + }, + { + "epoch": 0.31303, + "grad_norm": 0.7344316537050568, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 31303 + }, + { + "epoch": 0.31304, + "grad_norm": 0.6895561453083252, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 31304 + }, + { + "epoch": 0.31305, + "grad_norm": 0.6137799858184082, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 31305 + }, + { + "epoch": 0.31306, + "grad_norm": 0.5876729994086001, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 31306 + }, + { + "epoch": 0.31307, + "grad_norm": 0.6486700873521241, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 31307 + }, + { + "epoch": 0.31308, + "grad_norm": 0.8280697092738462, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 31308 + }, + { + "epoch": 0.31309, + "grad_norm": 1.0774758805619744, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 31309 + }, + { + "epoch": 0.3131, + "grad_norm": 0.9639864570939809, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31310 + }, + { + "epoch": 0.31311, + "grad_norm": 0.9502332172816403, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 31311 + }, + { + "epoch": 0.31312, + "grad_norm": 0.8051434649452919, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 31312 + }, + { + "epoch": 0.31313, + "grad_norm": 0.7693556210282144, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 31313 + }, + { + "epoch": 0.31314, + "grad_norm": 0.7739304624622881, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 31314 + }, + { + "epoch": 0.31315, + "grad_norm": 0.7606170307872977, + "learning_rate": 0.003, + "loss": 4.0923, + "step": 31315 + }, + { + "epoch": 0.31316, + "grad_norm": 0.6844505762800396, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 31316 + }, + { + "epoch": 0.31317, + "grad_norm": 0.6130399972916988, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 31317 + }, + { + "epoch": 0.31318, + "grad_norm": 0.7236328680917914, + "learning_rate": 0.003, + "loss": 4.04, + "step": 31318 + }, + { + "epoch": 0.31319, + "grad_norm": 0.7993100556949229, + "learning_rate": 0.003, + "loss": 4.035, + "step": 31319 + }, + { + "epoch": 0.3132, + "grad_norm": 0.8802625181505642, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 31320 + }, + { + "epoch": 0.31321, + "grad_norm": 1.0427957942283264, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 31321 + }, + { + "epoch": 0.31322, + "grad_norm": 1.0347633189559249, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31322 + }, + { + "epoch": 0.31323, + "grad_norm": 0.9435485380390768, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 31323 + }, + { + "epoch": 0.31324, + "grad_norm": 0.8526402921642833, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 31324 + }, + { + "epoch": 0.31325, + "grad_norm": 0.754478319854867, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 31325 + }, + { + "epoch": 0.31326, + "grad_norm": 0.7605091076587739, + "learning_rate": 0.003, + "loss": 4.026, + "step": 31326 + }, + { + "epoch": 0.31327, + "grad_norm": 0.7794250897314938, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 31327 + }, + { + "epoch": 0.31328, + "grad_norm": 0.7747243589301236, + "learning_rate": 0.003, + "loss": 4.014, + "step": 31328 + }, + { + "epoch": 0.31329, + "grad_norm": 0.7159520310027352, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 31329 + }, + { + "epoch": 0.3133, + "grad_norm": 0.7498574333971377, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 31330 + }, + { + "epoch": 0.31331, + "grad_norm": 1.052734786898398, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31331 + }, + { + "epoch": 0.31332, + "grad_norm": 1.2403455237449617, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 31332 + }, + { + "epoch": 0.31333, + "grad_norm": 0.7784936556918506, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 31333 + }, + { + "epoch": 0.31334, + "grad_norm": 0.7403412533370851, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 31334 + }, + { + "epoch": 0.31335, + "grad_norm": 0.6015598539407172, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 31335 + }, + { + "epoch": 0.31336, + "grad_norm": 0.6002501411964667, + "learning_rate": 0.003, + "loss": 3.9818, + "step": 31336 + }, + { + "epoch": 0.31337, + "grad_norm": 0.5394909110768534, + "learning_rate": 0.003, + "loss": 4.023, + "step": 31337 + }, + { + "epoch": 0.31338, + "grad_norm": 0.6333523676523857, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 31338 + }, + { + "epoch": 0.31339, + "grad_norm": 0.6723572323852043, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 31339 + }, + { + "epoch": 0.3134, + "grad_norm": 0.7907891248600707, + "learning_rate": 0.003, + "loss": 4.025, + "step": 31340 + }, + { + "epoch": 0.31341, + "grad_norm": 0.9621131493802776, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 31341 + }, + { + "epoch": 0.31342, + "grad_norm": 1.2218285291185964, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 31342 + }, + { + "epoch": 0.31343, + "grad_norm": 0.6238641310899566, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 31343 + }, + { + "epoch": 0.31344, + "grad_norm": 0.7013203843762227, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31344 + }, + { + "epoch": 0.31345, + "grad_norm": 0.9347271466660745, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 31345 + }, + { + "epoch": 0.31346, + "grad_norm": 0.962401973270663, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 31346 + }, + { + "epoch": 0.31347, + "grad_norm": 1.0224042412482492, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 31347 + }, + { + "epoch": 0.31348, + "grad_norm": 0.8483883267665707, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 31348 + }, + { + "epoch": 0.31349, + "grad_norm": 0.9609667244956539, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 31349 + }, + { + "epoch": 0.3135, + "grad_norm": 1.0923026456566096, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 31350 + }, + { + "epoch": 0.31351, + "grad_norm": 0.9497211163935165, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 31351 + }, + { + "epoch": 0.31352, + "grad_norm": 0.8982547828393189, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 31352 + }, + { + "epoch": 0.31353, + "grad_norm": 0.9820001779664211, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 31353 + }, + { + "epoch": 0.31354, + "grad_norm": 1.1332154795043357, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 31354 + }, + { + "epoch": 0.31355, + "grad_norm": 0.8913804617812923, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 31355 + }, + { + "epoch": 0.31356, + "grad_norm": 0.8221166609024061, + "learning_rate": 0.003, + "loss": 4.074, + "step": 31356 + }, + { + "epoch": 0.31357, + "grad_norm": 0.753767145691851, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 31357 + }, + { + "epoch": 0.31358, + "grad_norm": 0.79083961407806, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 31358 + }, + { + "epoch": 0.31359, + "grad_norm": 0.8455862674734812, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31359 + }, + { + "epoch": 0.3136, + "grad_norm": 0.9338287553085705, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 31360 + }, + { + "epoch": 0.31361, + "grad_norm": 0.9725815363711021, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 31361 + }, + { + "epoch": 0.31362, + "grad_norm": 0.914073242595534, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31362 + }, + { + "epoch": 0.31363, + "grad_norm": 0.9431780368673159, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 31363 + }, + { + "epoch": 0.31364, + "grad_norm": 1.0955915714574764, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 31364 + }, + { + "epoch": 0.31365, + "grad_norm": 1.1354548314149375, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 31365 + }, + { + "epoch": 0.31366, + "grad_norm": 1.134973051382539, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 31366 + }, + { + "epoch": 0.31367, + "grad_norm": 0.913351339990013, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 31367 + }, + { + "epoch": 0.31368, + "grad_norm": 1.0151679853784419, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 31368 + }, + { + "epoch": 0.31369, + "grad_norm": 1.0273397406669742, + "learning_rate": 0.003, + "loss": 4.034, + "step": 31369 + }, + { + "epoch": 0.3137, + "grad_norm": 0.9063926345709336, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 31370 + }, + { + "epoch": 0.31371, + "grad_norm": 0.8502527095425136, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31371 + }, + { + "epoch": 0.31372, + "grad_norm": 0.7350002874353218, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 31372 + }, + { + "epoch": 0.31373, + "grad_norm": 0.7174568885419096, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31373 + }, + { + "epoch": 0.31374, + "grad_norm": 0.6706716131851381, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31374 + }, + { + "epoch": 0.31375, + "grad_norm": 0.6521920639968554, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 31375 + }, + { + "epoch": 0.31376, + "grad_norm": 0.7508854223153066, + "learning_rate": 0.003, + "loss": 4.048, + "step": 31376 + }, + { + "epoch": 0.31377, + "grad_norm": 0.7877383284749523, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31377 + }, + { + "epoch": 0.31378, + "grad_norm": 0.7892395256029134, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 31378 + }, + { + "epoch": 0.31379, + "grad_norm": 0.9024115584641496, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 31379 + }, + { + "epoch": 0.3138, + "grad_norm": 1.171589472978926, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31380 + }, + { + "epoch": 0.31381, + "grad_norm": 0.8466237237895282, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 31381 + }, + { + "epoch": 0.31382, + "grad_norm": 0.853061371886525, + "learning_rate": 0.003, + "loss": 4.023, + "step": 31382 + }, + { + "epoch": 0.31383, + "grad_norm": 0.8244918145519741, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31383 + }, + { + "epoch": 0.31384, + "grad_norm": 0.7181811248919989, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 31384 + }, + { + "epoch": 0.31385, + "grad_norm": 0.7575242694894838, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 31385 + }, + { + "epoch": 0.31386, + "grad_norm": 0.7421622516906904, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 31386 + }, + { + "epoch": 0.31387, + "grad_norm": 0.7192331649009231, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 31387 + }, + { + "epoch": 0.31388, + "grad_norm": 0.7294987276373842, + "learning_rate": 0.003, + "loss": 4.05, + "step": 31388 + }, + { + "epoch": 0.31389, + "grad_norm": 0.813634071889361, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31389 + }, + { + "epoch": 0.3139, + "grad_norm": 0.9210888950879994, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 31390 + }, + { + "epoch": 0.31391, + "grad_norm": 1.009942818579309, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 31391 + }, + { + "epoch": 0.31392, + "grad_norm": 0.9483903261278875, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31392 + }, + { + "epoch": 0.31393, + "grad_norm": 0.8298731427920731, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 31393 + }, + { + "epoch": 0.31394, + "grad_norm": 0.6811203514394315, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31394 + }, + { + "epoch": 0.31395, + "grad_norm": 0.5603691076603762, + "learning_rate": 0.003, + "loss": 4.017, + "step": 31395 + }, + { + "epoch": 0.31396, + "grad_norm": 0.521281192037141, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 31396 + }, + { + "epoch": 0.31397, + "grad_norm": 0.5571842269041183, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 31397 + }, + { + "epoch": 0.31398, + "grad_norm": 0.6374310179287845, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 31398 + }, + { + "epoch": 0.31399, + "grad_norm": 0.8125331790677132, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 31399 + }, + { + "epoch": 0.314, + "grad_norm": 0.9703823346408125, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 31400 + }, + { + "epoch": 0.31401, + "grad_norm": 1.0501379222586775, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 31401 + }, + { + "epoch": 0.31402, + "grad_norm": 0.9714619607347084, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 31402 + }, + { + "epoch": 0.31403, + "grad_norm": 0.9256726775508305, + "learning_rate": 0.003, + "loss": 4.025, + "step": 31403 + }, + { + "epoch": 0.31404, + "grad_norm": 0.795905295187092, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 31404 + }, + { + "epoch": 0.31405, + "grad_norm": 0.89964271728939, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 31405 + }, + { + "epoch": 0.31406, + "grad_norm": 0.9999650074767067, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 31406 + }, + { + "epoch": 0.31407, + "grad_norm": 0.8732106738754914, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 31407 + }, + { + "epoch": 0.31408, + "grad_norm": 0.847237204840054, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 31408 + }, + { + "epoch": 0.31409, + "grad_norm": 0.8503949654555671, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 31409 + }, + { + "epoch": 0.3141, + "grad_norm": 0.8297222005689434, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 31410 + }, + { + "epoch": 0.31411, + "grad_norm": 0.8473503787406031, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 31411 + }, + { + "epoch": 0.31412, + "grad_norm": 0.8836119325416425, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 31412 + }, + { + "epoch": 0.31413, + "grad_norm": 0.9517630284095333, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 31413 + }, + { + "epoch": 0.31414, + "grad_norm": 1.0168015190097535, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 31414 + }, + { + "epoch": 0.31415, + "grad_norm": 0.9835037996817881, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 31415 + }, + { + "epoch": 0.31416, + "grad_norm": 1.0423382382293924, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 31416 + }, + { + "epoch": 0.31417, + "grad_norm": 0.897022464956856, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 31417 + }, + { + "epoch": 0.31418, + "grad_norm": 0.8391750410098842, + "learning_rate": 0.003, + "loss": 4.042, + "step": 31418 + }, + { + "epoch": 0.31419, + "grad_norm": 0.8343248748529174, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 31419 + }, + { + "epoch": 0.3142, + "grad_norm": 0.7319467478558885, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 31420 + }, + { + "epoch": 0.31421, + "grad_norm": 0.7183003951678939, + "learning_rate": 0.003, + "loss": 4.041, + "step": 31421 + }, + { + "epoch": 0.31422, + "grad_norm": 0.797679549076921, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 31422 + }, + { + "epoch": 0.31423, + "grad_norm": 0.8881840391071272, + "learning_rate": 0.003, + "loss": 4.064, + "step": 31423 + }, + { + "epoch": 0.31424, + "grad_norm": 1.0936482438892645, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 31424 + }, + { + "epoch": 0.31425, + "grad_norm": 1.0100483402725782, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 31425 + }, + { + "epoch": 0.31426, + "grad_norm": 1.0107734770963108, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 31426 + }, + { + "epoch": 0.31427, + "grad_norm": 0.9211092698513981, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 31427 + }, + { + "epoch": 0.31428, + "grad_norm": 0.9601652131337872, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31428 + }, + { + "epoch": 0.31429, + "grad_norm": 1.0346837898253092, + "learning_rate": 0.003, + "loss": 4.053, + "step": 31429 + }, + { + "epoch": 0.3143, + "grad_norm": 0.9513651162331734, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 31430 + }, + { + "epoch": 0.31431, + "grad_norm": 0.9944376760722243, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 31431 + }, + { + "epoch": 0.31432, + "grad_norm": 0.894472493022726, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 31432 + }, + { + "epoch": 0.31433, + "grad_norm": 0.7707950061818188, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 31433 + }, + { + "epoch": 0.31434, + "grad_norm": 0.683570533087187, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31434 + }, + { + "epoch": 0.31435, + "grad_norm": 0.643016121055002, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 31435 + }, + { + "epoch": 0.31436, + "grad_norm": 0.6764699271120648, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31436 + }, + { + "epoch": 0.31437, + "grad_norm": 0.7016730352478527, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31437 + }, + { + "epoch": 0.31438, + "grad_norm": 0.7445921125786251, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 31438 + }, + { + "epoch": 0.31439, + "grad_norm": 0.7367089619603503, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 31439 + }, + { + "epoch": 0.3144, + "grad_norm": 0.7968192598453699, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 31440 + }, + { + "epoch": 0.31441, + "grad_norm": 0.9196416964761496, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 31441 + }, + { + "epoch": 0.31442, + "grad_norm": 0.8869221902427373, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 31442 + }, + { + "epoch": 0.31443, + "grad_norm": 0.95563549757164, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 31443 + }, + { + "epoch": 0.31444, + "grad_norm": 1.1535323369859023, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 31444 + }, + { + "epoch": 0.31445, + "grad_norm": 0.7788533204274352, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 31445 + }, + { + "epoch": 0.31446, + "grad_norm": 0.7646667621677142, + "learning_rate": 0.003, + "loss": 4.047, + "step": 31446 + }, + { + "epoch": 0.31447, + "grad_norm": 0.7268424360155621, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 31447 + }, + { + "epoch": 0.31448, + "grad_norm": 0.7051990406057544, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 31448 + }, + { + "epoch": 0.31449, + "grad_norm": 0.7667178976639589, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 31449 + }, + { + "epoch": 0.3145, + "grad_norm": 0.8811877091248324, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 31450 + }, + { + "epoch": 0.31451, + "grad_norm": 1.0606932796869641, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 31451 + }, + { + "epoch": 0.31452, + "grad_norm": 1.138061525280638, + "learning_rate": 0.003, + "loss": 4.068, + "step": 31452 + }, + { + "epoch": 0.31453, + "grad_norm": 0.8392617322165471, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 31453 + }, + { + "epoch": 0.31454, + "grad_norm": 0.7315933226870922, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 31454 + }, + { + "epoch": 0.31455, + "grad_norm": 0.7784931612914113, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 31455 + }, + { + "epoch": 0.31456, + "grad_norm": 0.8374383070017554, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 31456 + }, + { + "epoch": 0.31457, + "grad_norm": 0.7732751663438211, + "learning_rate": 0.003, + "loss": 4.035, + "step": 31457 + }, + { + "epoch": 0.31458, + "grad_norm": 0.8336727542963562, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 31458 + }, + { + "epoch": 0.31459, + "grad_norm": 0.6695288860098132, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 31459 + }, + { + "epoch": 0.3146, + "grad_norm": 0.586430971045819, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 31460 + }, + { + "epoch": 0.31461, + "grad_norm": 0.5760345328379618, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 31461 + }, + { + "epoch": 0.31462, + "grad_norm": 0.5948458999678835, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 31462 + }, + { + "epoch": 0.31463, + "grad_norm": 0.6832035380620919, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 31463 + }, + { + "epoch": 0.31464, + "grad_norm": 0.7349941852892047, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 31464 + }, + { + "epoch": 0.31465, + "grad_norm": 0.7016872208467747, + "learning_rate": 0.003, + "loss": 4.009, + "step": 31465 + }, + { + "epoch": 0.31466, + "grad_norm": 0.8373583991049042, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 31466 + }, + { + "epoch": 0.31467, + "grad_norm": 0.9971947763460353, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 31467 + }, + { + "epoch": 0.31468, + "grad_norm": 1.154593159726765, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 31468 + }, + { + "epoch": 0.31469, + "grad_norm": 1.1229316779199499, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 31469 + }, + { + "epoch": 0.3147, + "grad_norm": 1.004531307478109, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 31470 + }, + { + "epoch": 0.31471, + "grad_norm": 0.9982594618187117, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 31471 + }, + { + "epoch": 0.31472, + "grad_norm": 1.007724904172985, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31472 + }, + { + "epoch": 0.31473, + "grad_norm": 0.9706938559902623, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 31473 + }, + { + "epoch": 0.31474, + "grad_norm": 0.9560595090167119, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 31474 + }, + { + "epoch": 0.31475, + "grad_norm": 0.9221668082250363, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 31475 + }, + { + "epoch": 0.31476, + "grad_norm": 0.9926479435376707, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 31476 + }, + { + "epoch": 0.31477, + "grad_norm": 0.872413118652345, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 31477 + }, + { + "epoch": 0.31478, + "grad_norm": 0.8372374862555823, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 31478 + }, + { + "epoch": 0.31479, + "grad_norm": 0.8425253151980219, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 31479 + }, + { + "epoch": 0.3148, + "grad_norm": 0.8929528330898638, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 31480 + }, + { + "epoch": 0.31481, + "grad_norm": 1.0401647240304162, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 31481 + }, + { + "epoch": 0.31482, + "grad_norm": 1.0840534674223608, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 31482 + }, + { + "epoch": 0.31483, + "grad_norm": 0.7192967506448255, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 31483 + }, + { + "epoch": 0.31484, + "grad_norm": 0.7798674481374122, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 31484 + }, + { + "epoch": 0.31485, + "grad_norm": 0.8925212821814577, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 31485 + }, + { + "epoch": 0.31486, + "grad_norm": 1.138140324830462, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 31486 + }, + { + "epoch": 0.31487, + "grad_norm": 1.0242470426103574, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 31487 + }, + { + "epoch": 0.31488, + "grad_norm": 0.8917224795240775, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 31488 + }, + { + "epoch": 0.31489, + "grad_norm": 0.7611276213842967, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 31489 + }, + { + "epoch": 0.3149, + "grad_norm": 0.684214619592837, + "learning_rate": 0.003, + "loss": 4.0854, + "step": 31490 + }, + { + "epoch": 0.31491, + "grad_norm": 0.6561065239291558, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 31491 + }, + { + "epoch": 0.31492, + "grad_norm": 0.6690341569632381, + "learning_rate": 0.003, + "loss": 4.045, + "step": 31492 + }, + { + "epoch": 0.31493, + "grad_norm": 0.6222014100675942, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 31493 + }, + { + "epoch": 0.31494, + "grad_norm": 0.5653210687020029, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 31494 + }, + { + "epoch": 0.31495, + "grad_norm": 0.5630389222470825, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 31495 + }, + { + "epoch": 0.31496, + "grad_norm": 0.5825271161347437, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31496 + }, + { + "epoch": 0.31497, + "grad_norm": 0.6419650109019417, + "learning_rate": 0.003, + "loss": 4.047, + "step": 31497 + }, + { + "epoch": 0.31498, + "grad_norm": 0.8426120560624978, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31498 + }, + { + "epoch": 0.31499, + "grad_norm": 1.0765552105008858, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 31499 + }, + { + "epoch": 0.315, + "grad_norm": 1.0981563508760492, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 31500 + }, + { + "epoch": 0.31501, + "grad_norm": 0.8168940954774574, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 31501 + }, + { + "epoch": 0.31502, + "grad_norm": 0.6916149314608744, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31502 + }, + { + "epoch": 0.31503, + "grad_norm": 0.6821972870452591, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 31503 + }, + { + "epoch": 0.31504, + "grad_norm": 0.6927482358310412, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 31504 + }, + { + "epoch": 0.31505, + "grad_norm": 0.6185613111391783, + "learning_rate": 0.003, + "loss": 4.051, + "step": 31505 + }, + { + "epoch": 0.31506, + "grad_norm": 0.5968993551486497, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 31506 + }, + { + "epoch": 0.31507, + "grad_norm": 0.6079414972176316, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 31507 + }, + { + "epoch": 0.31508, + "grad_norm": 0.6260245227724134, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 31508 + }, + { + "epoch": 0.31509, + "grad_norm": 0.8187426271740212, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 31509 + }, + { + "epoch": 0.3151, + "grad_norm": 1.1473446948935555, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 31510 + }, + { + "epoch": 0.31511, + "grad_norm": 0.9626633620945743, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 31511 + }, + { + "epoch": 0.31512, + "grad_norm": 0.6863010312824587, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 31512 + }, + { + "epoch": 0.31513, + "grad_norm": 0.6480230887291982, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 31513 + }, + { + "epoch": 0.31514, + "grad_norm": 0.7782447658659731, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 31514 + }, + { + "epoch": 0.31515, + "grad_norm": 0.9154335692282332, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 31515 + }, + { + "epoch": 0.31516, + "grad_norm": 0.9776740823729358, + "learning_rate": 0.003, + "loss": 4.054, + "step": 31516 + }, + { + "epoch": 0.31517, + "grad_norm": 0.8324537101275667, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 31517 + }, + { + "epoch": 0.31518, + "grad_norm": 0.8101574605261932, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 31518 + }, + { + "epoch": 0.31519, + "grad_norm": 0.868121041458849, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 31519 + }, + { + "epoch": 0.3152, + "grad_norm": 0.9715844219118202, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 31520 + }, + { + "epoch": 0.31521, + "grad_norm": 0.8990374450057724, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 31521 + }, + { + "epoch": 0.31522, + "grad_norm": 1.0474237364989, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 31522 + }, + { + "epoch": 0.31523, + "grad_norm": 1.1190412437367427, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31523 + }, + { + "epoch": 0.31524, + "grad_norm": 0.8823892293574602, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 31524 + }, + { + "epoch": 0.31525, + "grad_norm": 0.8257457363939992, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 31525 + }, + { + "epoch": 0.31526, + "grad_norm": 0.8759215257229253, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 31526 + }, + { + "epoch": 0.31527, + "grad_norm": 0.974700153371874, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 31527 + }, + { + "epoch": 0.31528, + "grad_norm": 1.0986262461602914, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31528 + }, + { + "epoch": 0.31529, + "grad_norm": 0.7514007730805605, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 31529 + }, + { + "epoch": 0.3153, + "grad_norm": 0.7451664883802264, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 31530 + }, + { + "epoch": 0.31531, + "grad_norm": 0.7750083124644541, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 31531 + }, + { + "epoch": 0.31532, + "grad_norm": 0.8015763184808444, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31532 + }, + { + "epoch": 0.31533, + "grad_norm": 0.8780765317590714, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 31533 + }, + { + "epoch": 0.31534, + "grad_norm": 1.0553633125108448, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 31534 + }, + { + "epoch": 0.31535, + "grad_norm": 1.1709473349607877, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 31535 + }, + { + "epoch": 0.31536, + "grad_norm": 0.9446207601406597, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 31536 + }, + { + "epoch": 0.31537, + "grad_norm": 0.8886430581539335, + "learning_rate": 0.003, + "loss": 4.021, + "step": 31537 + }, + { + "epoch": 0.31538, + "grad_norm": 0.8050184923219098, + "learning_rate": 0.003, + "loss": 4.022, + "step": 31538 + }, + { + "epoch": 0.31539, + "grad_norm": 0.8238695859243506, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 31539 + }, + { + "epoch": 0.3154, + "grad_norm": 0.8014267068429404, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 31540 + }, + { + "epoch": 0.31541, + "grad_norm": 0.8558140754638787, + "learning_rate": 0.003, + "loss": 4.056, + "step": 31541 + }, + { + "epoch": 0.31542, + "grad_norm": 0.8078201320536111, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 31542 + }, + { + "epoch": 0.31543, + "grad_norm": 0.7317805602344942, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 31543 + }, + { + "epoch": 0.31544, + "grad_norm": 0.711560996956267, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 31544 + }, + { + "epoch": 0.31545, + "grad_norm": 0.8254289502078649, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 31545 + }, + { + "epoch": 0.31546, + "grad_norm": 0.8947727967870772, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 31546 + }, + { + "epoch": 0.31547, + "grad_norm": 0.9048624298124993, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 31547 + }, + { + "epoch": 0.31548, + "grad_norm": 0.9418557247184818, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 31548 + }, + { + "epoch": 0.31549, + "grad_norm": 0.920059445591814, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31549 + }, + { + "epoch": 0.3155, + "grad_norm": 0.8567144024422263, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 31550 + }, + { + "epoch": 0.31551, + "grad_norm": 0.8506463099646814, + "learning_rate": 0.003, + "loss": 4.059, + "step": 31551 + }, + { + "epoch": 0.31552, + "grad_norm": 0.9609793592118786, + "learning_rate": 0.003, + "loss": 4.034, + "step": 31552 + }, + { + "epoch": 0.31553, + "grad_norm": 1.301669821468197, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 31553 + }, + { + "epoch": 0.31554, + "grad_norm": 0.8756560890625696, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 31554 + }, + { + "epoch": 0.31555, + "grad_norm": 0.7554566068095743, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 31555 + }, + { + "epoch": 0.31556, + "grad_norm": 0.7526543188687029, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 31556 + }, + { + "epoch": 0.31557, + "grad_norm": 0.8330966220631019, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 31557 + }, + { + "epoch": 0.31558, + "grad_norm": 0.9124007294069366, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 31558 + }, + { + "epoch": 0.31559, + "grad_norm": 0.9659274000695888, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31559 + }, + { + "epoch": 0.3156, + "grad_norm": 0.9999265484823098, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 31560 + }, + { + "epoch": 0.31561, + "grad_norm": 0.9582478299715528, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 31561 + }, + { + "epoch": 0.31562, + "grad_norm": 0.9060478913375641, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 31562 + }, + { + "epoch": 0.31563, + "grad_norm": 0.8325520195295539, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 31563 + }, + { + "epoch": 0.31564, + "grad_norm": 0.7389213534346812, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 31564 + }, + { + "epoch": 0.31565, + "grad_norm": 0.7146758572090178, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31565 + }, + { + "epoch": 0.31566, + "grad_norm": 0.747967947045861, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 31566 + }, + { + "epoch": 0.31567, + "grad_norm": 0.7005297479117018, + "learning_rate": 0.003, + "loss": 4.051, + "step": 31567 + }, + { + "epoch": 0.31568, + "grad_norm": 0.6254137617205997, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31568 + }, + { + "epoch": 0.31569, + "grad_norm": 0.6590887087229161, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 31569 + }, + { + "epoch": 0.3157, + "grad_norm": 0.6481777506187365, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 31570 + }, + { + "epoch": 0.31571, + "grad_norm": 0.6754959180795943, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 31571 + }, + { + "epoch": 0.31572, + "grad_norm": 0.8033778993739903, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 31572 + }, + { + "epoch": 0.31573, + "grad_norm": 0.9251453559014015, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 31573 + }, + { + "epoch": 0.31574, + "grad_norm": 1.0766641364846352, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 31574 + }, + { + "epoch": 0.31575, + "grad_norm": 1.1876597270526545, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 31575 + }, + { + "epoch": 0.31576, + "grad_norm": 0.9450521358017108, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 31576 + }, + { + "epoch": 0.31577, + "grad_norm": 0.877600090883553, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31577 + }, + { + "epoch": 0.31578, + "grad_norm": 0.8551764731496907, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 31578 + }, + { + "epoch": 0.31579, + "grad_norm": 0.8068423121888187, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 31579 + }, + { + "epoch": 0.3158, + "grad_norm": 0.8439504832750326, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 31580 + }, + { + "epoch": 0.31581, + "grad_norm": 0.9088579821771007, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 31581 + }, + { + "epoch": 0.31582, + "grad_norm": 1.0079748329243585, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 31582 + }, + { + "epoch": 0.31583, + "grad_norm": 1.0290257633094382, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 31583 + }, + { + "epoch": 0.31584, + "grad_norm": 0.8130658427301913, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 31584 + }, + { + "epoch": 0.31585, + "grad_norm": 0.6804106040802933, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 31585 + }, + { + "epoch": 0.31586, + "grad_norm": 0.6643786630886397, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 31586 + }, + { + "epoch": 0.31587, + "grad_norm": 0.65495435499212, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 31587 + }, + { + "epoch": 0.31588, + "grad_norm": 0.6342785492568463, + "learning_rate": 0.003, + "loss": 4.043, + "step": 31588 + }, + { + "epoch": 0.31589, + "grad_norm": 0.6313748263619188, + "learning_rate": 0.003, + "loss": 4.023, + "step": 31589 + }, + { + "epoch": 0.3159, + "grad_norm": 0.6787853360862045, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 31590 + }, + { + "epoch": 0.31591, + "grad_norm": 0.7657142999190774, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31591 + }, + { + "epoch": 0.31592, + "grad_norm": 1.0232697747908814, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31592 + }, + { + "epoch": 0.31593, + "grad_norm": 1.2400184879033547, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 31593 + }, + { + "epoch": 0.31594, + "grad_norm": 0.929521238600925, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31594 + }, + { + "epoch": 0.31595, + "grad_norm": 0.8651041277415614, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 31595 + }, + { + "epoch": 0.31596, + "grad_norm": 0.7952548300946874, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 31596 + }, + { + "epoch": 0.31597, + "grad_norm": 0.7351706087981932, + "learning_rate": 0.003, + "loss": 4.05, + "step": 31597 + }, + { + "epoch": 0.31598, + "grad_norm": 0.7099837993513373, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 31598 + }, + { + "epoch": 0.31599, + "grad_norm": 0.6841804804460871, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 31599 + }, + { + "epoch": 0.316, + "grad_norm": 0.7081385041857993, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 31600 + }, + { + "epoch": 0.31601, + "grad_norm": 0.9570912725399182, + "learning_rate": 0.003, + "loss": 4.031, + "step": 31601 + }, + { + "epoch": 0.31602, + "grad_norm": 1.240250894516626, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 31602 + }, + { + "epoch": 0.31603, + "grad_norm": 0.8134500309928108, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 31603 + }, + { + "epoch": 0.31604, + "grad_norm": 0.6839302874059167, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 31604 + }, + { + "epoch": 0.31605, + "grad_norm": 0.7260660295926408, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 31605 + }, + { + "epoch": 0.31606, + "grad_norm": 0.7245199102326637, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 31606 + }, + { + "epoch": 0.31607, + "grad_norm": 0.7668130856449261, + "learning_rate": 0.003, + "loss": 4.033, + "step": 31607 + }, + { + "epoch": 0.31608, + "grad_norm": 0.7958127175057887, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 31608 + }, + { + "epoch": 0.31609, + "grad_norm": 0.834865414299694, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 31609 + }, + { + "epoch": 0.3161, + "grad_norm": 0.7847391831492195, + "learning_rate": 0.003, + "loss": 4.036, + "step": 31610 + }, + { + "epoch": 0.31611, + "grad_norm": 0.704113949103065, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 31611 + }, + { + "epoch": 0.31612, + "grad_norm": 0.8075935574503997, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 31612 + }, + { + "epoch": 0.31613, + "grad_norm": 1.0637410204692654, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 31613 + }, + { + "epoch": 0.31614, + "grad_norm": 1.2567720941913594, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 31614 + }, + { + "epoch": 0.31615, + "grad_norm": 0.8725355820747278, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 31615 + }, + { + "epoch": 0.31616, + "grad_norm": 0.8549274490791835, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 31616 + }, + { + "epoch": 0.31617, + "grad_norm": 0.9214495021492208, + "learning_rate": 0.003, + "loss": 4.056, + "step": 31617 + }, + { + "epoch": 0.31618, + "grad_norm": 1.0149862847536824, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 31618 + }, + { + "epoch": 0.31619, + "grad_norm": 0.907353275277867, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 31619 + }, + { + "epoch": 0.3162, + "grad_norm": 0.904757399125384, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 31620 + }, + { + "epoch": 0.31621, + "grad_norm": 0.9865243249951052, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 31621 + }, + { + "epoch": 0.31622, + "grad_norm": 0.9569986916542709, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 31622 + }, + { + "epoch": 0.31623, + "grad_norm": 0.9371445823565684, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 31623 + }, + { + "epoch": 0.31624, + "grad_norm": 0.9607636681326829, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 31624 + }, + { + "epoch": 0.31625, + "grad_norm": 0.9508530903650089, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31625 + }, + { + "epoch": 0.31626, + "grad_norm": 0.8889351590108432, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 31626 + }, + { + "epoch": 0.31627, + "grad_norm": 0.7349610638343067, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31627 + }, + { + "epoch": 0.31628, + "grad_norm": 0.6856110587766746, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 31628 + }, + { + "epoch": 0.31629, + "grad_norm": 0.6822003746599821, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31629 + }, + { + "epoch": 0.3163, + "grad_norm": 0.691430913414624, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 31630 + }, + { + "epoch": 0.31631, + "grad_norm": 0.8317875291135071, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 31631 + }, + { + "epoch": 0.31632, + "grad_norm": 1.0335665785727393, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 31632 + }, + { + "epoch": 0.31633, + "grad_norm": 1.351428069233235, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 31633 + }, + { + "epoch": 0.31634, + "grad_norm": 0.6805819769020331, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31634 + }, + { + "epoch": 0.31635, + "grad_norm": 0.6294144043689914, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 31635 + }, + { + "epoch": 0.31636, + "grad_norm": 0.7170792948388789, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 31636 + }, + { + "epoch": 0.31637, + "grad_norm": 0.6110946813975363, + "learning_rate": 0.003, + "loss": 4.031, + "step": 31637 + }, + { + "epoch": 0.31638, + "grad_norm": 0.5552248375990422, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 31638 + }, + { + "epoch": 0.31639, + "grad_norm": 0.5417346708688879, + "learning_rate": 0.003, + "loss": 3.997, + "step": 31639 + }, + { + "epoch": 0.3164, + "grad_norm": 0.5981597260356412, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 31640 + }, + { + "epoch": 0.31641, + "grad_norm": 0.6544517110030807, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 31641 + }, + { + "epoch": 0.31642, + "grad_norm": 0.6904413697582259, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 31642 + }, + { + "epoch": 0.31643, + "grad_norm": 0.8899234276497832, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 31643 + }, + { + "epoch": 0.31644, + "grad_norm": 1.2369451691829019, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 31644 + }, + { + "epoch": 0.31645, + "grad_norm": 0.7487791474959529, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 31645 + }, + { + "epoch": 0.31646, + "grad_norm": 0.6083826103279368, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 31646 + }, + { + "epoch": 0.31647, + "grad_norm": 0.64825076375314, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 31647 + }, + { + "epoch": 0.31648, + "grad_norm": 0.6822954676765187, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 31648 + }, + { + "epoch": 0.31649, + "grad_norm": 0.784412459404125, + "learning_rate": 0.003, + "loss": 4.015, + "step": 31649 + }, + { + "epoch": 0.3165, + "grad_norm": 0.9886739316691603, + "learning_rate": 0.003, + "loss": 4.018, + "step": 31650 + }, + { + "epoch": 0.31651, + "grad_norm": 1.3003920899010992, + "learning_rate": 0.003, + "loss": 4.057, + "step": 31651 + }, + { + "epoch": 0.31652, + "grad_norm": 0.6703372930904052, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 31652 + }, + { + "epoch": 0.31653, + "grad_norm": 0.6941063937268447, + "learning_rate": 0.003, + "loss": 4.017, + "step": 31653 + }, + { + "epoch": 0.31654, + "grad_norm": 0.6938753940432911, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 31654 + }, + { + "epoch": 0.31655, + "grad_norm": 0.6649660841946136, + "learning_rate": 0.003, + "loss": 3.9934, + "step": 31655 + }, + { + "epoch": 0.31656, + "grad_norm": 0.6470272695669527, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 31656 + }, + { + "epoch": 0.31657, + "grad_norm": 0.6891802057526383, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31657 + }, + { + "epoch": 0.31658, + "grad_norm": 0.7514515597838847, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 31658 + }, + { + "epoch": 0.31659, + "grad_norm": 0.8235411986104749, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 31659 + }, + { + "epoch": 0.3166, + "grad_norm": 0.8367570328730097, + "learning_rate": 0.003, + "loss": 3.998, + "step": 31660 + }, + { + "epoch": 0.31661, + "grad_norm": 0.7828977411965163, + "learning_rate": 0.003, + "loss": 4.015, + "step": 31661 + }, + { + "epoch": 0.31662, + "grad_norm": 0.8224378610466981, + "learning_rate": 0.003, + "loss": 4.026, + "step": 31662 + }, + { + "epoch": 0.31663, + "grad_norm": 0.9459133092552918, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 31663 + }, + { + "epoch": 0.31664, + "grad_norm": 1.1968785427206308, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 31664 + }, + { + "epoch": 0.31665, + "grad_norm": 1.004923488105499, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 31665 + }, + { + "epoch": 0.31666, + "grad_norm": 1.034715300979389, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 31666 + }, + { + "epoch": 0.31667, + "grad_norm": 1.1687606828565789, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 31667 + }, + { + "epoch": 0.31668, + "grad_norm": 0.951580207007303, + "learning_rate": 0.003, + "loss": 4.044, + "step": 31668 + }, + { + "epoch": 0.31669, + "grad_norm": 0.8574852377520135, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 31669 + }, + { + "epoch": 0.3167, + "grad_norm": 0.8214861522461784, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 31670 + }, + { + "epoch": 0.31671, + "grad_norm": 0.8266482803256938, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 31671 + }, + { + "epoch": 0.31672, + "grad_norm": 0.8304211857262456, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 31672 + }, + { + "epoch": 0.31673, + "grad_norm": 0.848662824027803, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 31673 + }, + { + "epoch": 0.31674, + "grad_norm": 0.8509902943356481, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 31674 + }, + { + "epoch": 0.31675, + "grad_norm": 0.9661900118811518, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31675 + }, + { + "epoch": 0.31676, + "grad_norm": 1.1948854276986531, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 31676 + }, + { + "epoch": 0.31677, + "grad_norm": 0.9194235524734188, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 31677 + }, + { + "epoch": 0.31678, + "grad_norm": 0.9207504793325362, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 31678 + }, + { + "epoch": 0.31679, + "grad_norm": 0.9561254133682046, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31679 + }, + { + "epoch": 0.3168, + "grad_norm": 1.1280401166332852, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 31680 + }, + { + "epoch": 0.31681, + "grad_norm": 0.950069108278576, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 31681 + }, + { + "epoch": 0.31682, + "grad_norm": 0.8097316415915052, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 31682 + }, + { + "epoch": 0.31683, + "grad_norm": 0.8423696212864596, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 31683 + }, + { + "epoch": 0.31684, + "grad_norm": 0.8574379662117941, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 31684 + }, + { + "epoch": 0.31685, + "grad_norm": 0.9072313518922669, + "learning_rate": 0.003, + "loss": 4.054, + "step": 31685 + }, + { + "epoch": 0.31686, + "grad_norm": 0.8724464981229997, + "learning_rate": 0.003, + "loss": 4.061, + "step": 31686 + }, + { + "epoch": 0.31687, + "grad_norm": 0.7467563046873948, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 31687 + }, + { + "epoch": 0.31688, + "grad_norm": 0.7994163277917888, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 31688 + }, + { + "epoch": 0.31689, + "grad_norm": 0.857007422215465, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 31689 + }, + { + "epoch": 0.3169, + "grad_norm": 0.9664597742598159, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 31690 + }, + { + "epoch": 0.31691, + "grad_norm": 1.0617823139984794, + "learning_rate": 0.003, + "loss": 4.0878, + "step": 31691 + }, + { + "epoch": 0.31692, + "grad_norm": 1.0896529161154136, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 31692 + }, + { + "epoch": 0.31693, + "grad_norm": 0.8921868647076213, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 31693 + }, + { + "epoch": 0.31694, + "grad_norm": 0.8021453358951197, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 31694 + }, + { + "epoch": 0.31695, + "grad_norm": 0.6973597093958827, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 31695 + }, + { + "epoch": 0.31696, + "grad_norm": 0.6969098857510676, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 31696 + }, + { + "epoch": 0.31697, + "grad_norm": 0.6990828159371222, + "learning_rate": 0.003, + "loss": 4.013, + "step": 31697 + }, + { + "epoch": 0.31698, + "grad_norm": 0.7132234232793425, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 31698 + }, + { + "epoch": 0.31699, + "grad_norm": 0.7674043000928494, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 31699 + }, + { + "epoch": 0.317, + "grad_norm": 0.8127487960959816, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 31700 + }, + { + "epoch": 0.31701, + "grad_norm": 0.8636070818618966, + "learning_rate": 0.003, + "loss": 4.059, + "step": 31701 + }, + { + "epoch": 0.31702, + "grad_norm": 0.8575853197039052, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 31702 + }, + { + "epoch": 0.31703, + "grad_norm": 0.8145277062983434, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 31703 + }, + { + "epoch": 0.31704, + "grad_norm": 0.727773256028851, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 31704 + }, + { + "epoch": 0.31705, + "grad_norm": 0.8249551611424246, + "learning_rate": 0.003, + "loss": 4.051, + "step": 31705 + }, + { + "epoch": 0.31706, + "grad_norm": 1.1589169527104535, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 31706 + }, + { + "epoch": 0.31707, + "grad_norm": 0.9803638921902325, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 31707 + }, + { + "epoch": 0.31708, + "grad_norm": 0.8400111912969246, + "learning_rate": 0.003, + "loss": 4.033, + "step": 31708 + }, + { + "epoch": 0.31709, + "grad_norm": 0.7685439607886513, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 31709 + }, + { + "epoch": 0.3171, + "grad_norm": 0.6850874966656303, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 31710 + }, + { + "epoch": 0.31711, + "grad_norm": 0.6137210311218297, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 31711 + }, + { + "epoch": 0.31712, + "grad_norm": 0.6164414020722567, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 31712 + }, + { + "epoch": 0.31713, + "grad_norm": 0.6596844960028684, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 31713 + }, + { + "epoch": 0.31714, + "grad_norm": 0.7848467065290823, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 31714 + }, + { + "epoch": 0.31715, + "grad_norm": 0.8593193421150901, + "learning_rate": 0.003, + "loss": 4.018, + "step": 31715 + }, + { + "epoch": 0.31716, + "grad_norm": 0.955574776903467, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 31716 + }, + { + "epoch": 0.31717, + "grad_norm": 1.0859192160420044, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31717 + }, + { + "epoch": 0.31718, + "grad_norm": 1.0733369936776542, + "learning_rate": 0.003, + "loss": 4.046, + "step": 31718 + }, + { + "epoch": 0.31719, + "grad_norm": 0.9977448488604509, + "learning_rate": 0.003, + "loss": 4.007, + "step": 31719 + }, + { + "epoch": 0.3172, + "grad_norm": 0.9332232947103541, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31720 + }, + { + "epoch": 0.31721, + "grad_norm": 0.82974840898276, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 31721 + }, + { + "epoch": 0.31722, + "grad_norm": 0.7352922255500388, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 31722 + }, + { + "epoch": 0.31723, + "grad_norm": 0.7733070717397945, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 31723 + }, + { + "epoch": 0.31724, + "grad_norm": 0.8369714710907813, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 31724 + }, + { + "epoch": 0.31725, + "grad_norm": 0.9619466767659887, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 31725 + }, + { + "epoch": 0.31726, + "grad_norm": 1.0139836647152107, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 31726 + }, + { + "epoch": 0.31727, + "grad_norm": 0.9424026051218393, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 31727 + }, + { + "epoch": 0.31728, + "grad_norm": 1.0059392783450427, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 31728 + }, + { + "epoch": 0.31729, + "grad_norm": 1.0422474170309832, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 31729 + }, + { + "epoch": 0.3173, + "grad_norm": 0.8777005901796452, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 31730 + }, + { + "epoch": 0.31731, + "grad_norm": 0.7515284097710908, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 31731 + }, + { + "epoch": 0.31732, + "grad_norm": 0.6774220962361013, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 31732 + }, + { + "epoch": 0.31733, + "grad_norm": 0.604393727124694, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 31733 + }, + { + "epoch": 0.31734, + "grad_norm": 0.71149275639189, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 31734 + }, + { + "epoch": 0.31735, + "grad_norm": 0.7857232795347587, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 31735 + }, + { + "epoch": 0.31736, + "grad_norm": 0.8808998443633108, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 31736 + }, + { + "epoch": 0.31737, + "grad_norm": 1.0460501392505324, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 31737 + }, + { + "epoch": 0.31738, + "grad_norm": 1.0058761512516605, + "learning_rate": 0.003, + "loss": 4.04, + "step": 31738 + }, + { + "epoch": 0.31739, + "grad_norm": 0.9262393520806653, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 31739 + }, + { + "epoch": 0.3174, + "grad_norm": 0.8754055895703489, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 31740 + }, + { + "epoch": 0.31741, + "grad_norm": 0.8153754084106215, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 31741 + }, + { + "epoch": 0.31742, + "grad_norm": 0.8107919854134675, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 31742 + }, + { + "epoch": 0.31743, + "grad_norm": 0.824600599144597, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 31743 + }, + { + "epoch": 0.31744, + "grad_norm": 0.7925837642496514, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31744 + }, + { + "epoch": 0.31745, + "grad_norm": 0.9702372691210485, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 31745 + }, + { + "epoch": 0.31746, + "grad_norm": 1.0954310675584324, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 31746 + }, + { + "epoch": 0.31747, + "grad_norm": 0.9881123968325731, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 31747 + }, + { + "epoch": 0.31748, + "grad_norm": 1.0108546815993849, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 31748 + }, + { + "epoch": 0.31749, + "grad_norm": 1.0209991262149867, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 31749 + }, + { + "epoch": 0.3175, + "grad_norm": 0.9782786457242102, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 31750 + }, + { + "epoch": 0.31751, + "grad_norm": 0.8836964357966485, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 31751 + }, + { + "epoch": 0.31752, + "grad_norm": 0.8419562556047677, + "learning_rate": 0.003, + "loss": 4.043, + "step": 31752 + }, + { + "epoch": 0.31753, + "grad_norm": 0.9554254109110003, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 31753 + }, + { + "epoch": 0.31754, + "grad_norm": 1.0496575472030754, + "learning_rate": 0.003, + "loss": 3.9964, + "step": 31754 + }, + { + "epoch": 0.31755, + "grad_norm": 1.046292446016317, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 31755 + }, + { + "epoch": 0.31756, + "grad_norm": 0.7713437348006492, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 31756 + }, + { + "epoch": 0.31757, + "grad_norm": 0.7420067650435133, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 31757 + }, + { + "epoch": 0.31758, + "grad_norm": 0.7617284070458882, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 31758 + }, + { + "epoch": 0.31759, + "grad_norm": 0.7398636782303852, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 31759 + }, + { + "epoch": 0.3176, + "grad_norm": 0.7378628819663552, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31760 + }, + { + "epoch": 0.31761, + "grad_norm": 0.7508560010749952, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 31761 + }, + { + "epoch": 0.31762, + "grad_norm": 0.6717340930663201, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 31762 + }, + { + "epoch": 0.31763, + "grad_norm": 0.6503279374981161, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 31763 + }, + { + "epoch": 0.31764, + "grad_norm": 0.7061339129080761, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 31764 + }, + { + "epoch": 0.31765, + "grad_norm": 0.6467245609740618, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 31765 + }, + { + "epoch": 0.31766, + "grad_norm": 0.5553141776217694, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 31766 + }, + { + "epoch": 0.31767, + "grad_norm": 0.5794638145098655, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 31767 + }, + { + "epoch": 0.31768, + "grad_norm": 0.62262483503146, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 31768 + }, + { + "epoch": 0.31769, + "grad_norm": 0.5634303803936263, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 31769 + }, + { + "epoch": 0.3177, + "grad_norm": 0.6668720426893265, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 31770 + }, + { + "epoch": 0.31771, + "grad_norm": 0.963613265686622, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 31771 + }, + { + "epoch": 0.31772, + "grad_norm": 1.3017730927257694, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 31772 + }, + { + "epoch": 0.31773, + "grad_norm": 0.6333041936758826, + "learning_rate": 0.003, + "loss": 4.045, + "step": 31773 + }, + { + "epoch": 0.31774, + "grad_norm": 0.7171855708505732, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 31774 + }, + { + "epoch": 0.31775, + "grad_norm": 0.6895554748952585, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 31775 + }, + { + "epoch": 0.31776, + "grad_norm": 0.6684608967333775, + "learning_rate": 0.003, + "loss": 4.017, + "step": 31776 + }, + { + "epoch": 0.31777, + "grad_norm": 0.6404235195527236, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 31777 + }, + { + "epoch": 0.31778, + "grad_norm": 0.5941257335083212, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 31778 + }, + { + "epoch": 0.31779, + "grad_norm": 0.5392608318340594, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31779 + }, + { + "epoch": 0.3178, + "grad_norm": 0.540607353715306, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 31780 + }, + { + "epoch": 0.31781, + "grad_norm": 0.6447232723666853, + "learning_rate": 0.003, + "loss": 4.004, + "step": 31781 + }, + { + "epoch": 0.31782, + "grad_norm": 0.688999424955781, + "learning_rate": 0.003, + "loss": 4.026, + "step": 31782 + }, + { + "epoch": 0.31783, + "grad_norm": 0.6956078300118124, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31783 + }, + { + "epoch": 0.31784, + "grad_norm": 0.7476707106380601, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 31784 + }, + { + "epoch": 0.31785, + "grad_norm": 1.014231075078544, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 31785 + }, + { + "epoch": 0.31786, + "grad_norm": 1.2479250128178547, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31786 + }, + { + "epoch": 0.31787, + "grad_norm": 0.928681568221827, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 31787 + }, + { + "epoch": 0.31788, + "grad_norm": 1.0137099070039748, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 31788 + }, + { + "epoch": 0.31789, + "grad_norm": 0.9075292268362357, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 31789 + }, + { + "epoch": 0.3179, + "grad_norm": 0.9179471014236559, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31790 + }, + { + "epoch": 0.31791, + "grad_norm": 0.9303417786314597, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 31791 + }, + { + "epoch": 0.31792, + "grad_norm": 0.9137408217415345, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31792 + }, + { + "epoch": 0.31793, + "grad_norm": 0.9165941561414078, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31793 + }, + { + "epoch": 0.31794, + "grad_norm": 0.9661702796667364, + "learning_rate": 0.003, + "loss": 4.081, + "step": 31794 + }, + { + "epoch": 0.31795, + "grad_norm": 0.9775097566264104, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 31795 + }, + { + "epoch": 0.31796, + "grad_norm": 1.1157457502233663, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 31796 + }, + { + "epoch": 0.31797, + "grad_norm": 0.8856904998154366, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 31797 + }, + { + "epoch": 0.31798, + "grad_norm": 0.8519214561708793, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 31798 + }, + { + "epoch": 0.31799, + "grad_norm": 1.0307727318815474, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 31799 + }, + { + "epoch": 0.318, + "grad_norm": 1.1260676327763377, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 31800 + }, + { + "epoch": 0.31801, + "grad_norm": 0.9716022797439507, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 31801 + }, + { + "epoch": 0.31802, + "grad_norm": 0.9371381317882487, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31802 + }, + { + "epoch": 0.31803, + "grad_norm": 0.9031837590848479, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 31803 + }, + { + "epoch": 0.31804, + "grad_norm": 0.8394145265131439, + "learning_rate": 0.003, + "loss": 4.041, + "step": 31804 + }, + { + "epoch": 0.31805, + "grad_norm": 0.7248972748404832, + "learning_rate": 0.003, + "loss": 4.025, + "step": 31805 + }, + { + "epoch": 0.31806, + "grad_norm": 0.682553450623044, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 31806 + }, + { + "epoch": 0.31807, + "grad_norm": 0.7222777840702896, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 31807 + }, + { + "epoch": 0.31808, + "grad_norm": 0.8005529487674533, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31808 + }, + { + "epoch": 0.31809, + "grad_norm": 0.8115304921841447, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 31809 + }, + { + "epoch": 0.3181, + "grad_norm": 0.7669719454365862, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 31810 + }, + { + "epoch": 0.31811, + "grad_norm": 0.7895201875934068, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 31811 + }, + { + "epoch": 0.31812, + "grad_norm": 0.8927677112580112, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 31812 + }, + { + "epoch": 0.31813, + "grad_norm": 0.9568060675741136, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 31813 + }, + { + "epoch": 0.31814, + "grad_norm": 0.9118318764611992, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31814 + }, + { + "epoch": 0.31815, + "grad_norm": 0.8634885176463749, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31815 + }, + { + "epoch": 0.31816, + "grad_norm": 0.8958948540650317, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 31816 + }, + { + "epoch": 0.31817, + "grad_norm": 0.8108622053475567, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31817 + }, + { + "epoch": 0.31818, + "grad_norm": 0.8957666022651664, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 31818 + }, + { + "epoch": 0.31819, + "grad_norm": 1.0827120044309515, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 31819 + }, + { + "epoch": 0.3182, + "grad_norm": 1.0211789663115969, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 31820 + }, + { + "epoch": 0.31821, + "grad_norm": 1.1339129467277016, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31821 + }, + { + "epoch": 0.31822, + "grad_norm": 0.892129634986221, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 31822 + }, + { + "epoch": 0.31823, + "grad_norm": 0.7482746469833818, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31823 + }, + { + "epoch": 0.31824, + "grad_norm": 0.6882227874567665, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 31824 + }, + { + "epoch": 0.31825, + "grad_norm": 0.6982827864579789, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 31825 + }, + { + "epoch": 0.31826, + "grad_norm": 0.6531882392468825, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 31826 + }, + { + "epoch": 0.31827, + "grad_norm": 0.5800531556713253, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 31827 + }, + { + "epoch": 0.31828, + "grad_norm": 0.6042458106764691, + "learning_rate": 0.003, + "loss": 3.998, + "step": 31828 + }, + { + "epoch": 0.31829, + "grad_norm": 0.6422986811910036, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 31829 + }, + { + "epoch": 0.3183, + "grad_norm": 0.6823041011347539, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 31830 + }, + { + "epoch": 0.31831, + "grad_norm": 0.8486270033535357, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 31831 + }, + { + "epoch": 0.31832, + "grad_norm": 1.1410196262946537, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 31832 + }, + { + "epoch": 0.31833, + "grad_norm": 0.7835169421778341, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 31833 + }, + { + "epoch": 0.31834, + "grad_norm": 0.6837822303249259, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 31834 + }, + { + "epoch": 0.31835, + "grad_norm": 0.7745801809607402, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 31835 + }, + { + "epoch": 0.31836, + "grad_norm": 0.8617406093622209, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 31836 + }, + { + "epoch": 0.31837, + "grad_norm": 1.1386860868326663, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 31837 + }, + { + "epoch": 0.31838, + "grad_norm": 0.9835903219772802, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 31838 + }, + { + "epoch": 0.31839, + "grad_norm": 0.9166239009127802, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 31839 + }, + { + "epoch": 0.3184, + "grad_norm": 0.8592565015918057, + "learning_rate": 0.003, + "loss": 4.063, + "step": 31840 + }, + { + "epoch": 0.31841, + "grad_norm": 0.7758546505070536, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 31841 + }, + { + "epoch": 0.31842, + "grad_norm": 0.6805929767663431, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 31842 + }, + { + "epoch": 0.31843, + "grad_norm": 0.8033730908955115, + "learning_rate": 0.003, + "loss": 4.045, + "step": 31843 + }, + { + "epoch": 0.31844, + "grad_norm": 0.8200219170428044, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 31844 + }, + { + "epoch": 0.31845, + "grad_norm": 0.878433440330917, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 31845 + }, + { + "epoch": 0.31846, + "grad_norm": 1.0453824215515883, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 31846 + }, + { + "epoch": 0.31847, + "grad_norm": 1.0026640689331277, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 31847 + }, + { + "epoch": 0.31848, + "grad_norm": 0.8783832208326118, + "learning_rate": 0.003, + "loss": 4.072, + "step": 31848 + }, + { + "epoch": 0.31849, + "grad_norm": 0.9088781825564077, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 31849 + }, + { + "epoch": 0.3185, + "grad_norm": 0.8797457873021886, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 31850 + }, + { + "epoch": 0.31851, + "grad_norm": 0.7624455043807403, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 31851 + }, + { + "epoch": 0.31852, + "grad_norm": 0.7776255760667412, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 31852 + }, + { + "epoch": 0.31853, + "grad_norm": 1.1551997077015292, + "learning_rate": 0.003, + "loss": 4.072, + "step": 31853 + }, + { + "epoch": 0.31854, + "grad_norm": 1.250925158135599, + "learning_rate": 0.003, + "loss": 4.0826, + "step": 31854 + }, + { + "epoch": 0.31855, + "grad_norm": 0.7086022777841936, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 31855 + }, + { + "epoch": 0.31856, + "grad_norm": 0.7351087587614781, + "learning_rate": 0.003, + "loss": 4.027, + "step": 31856 + }, + { + "epoch": 0.31857, + "grad_norm": 0.7266546245266704, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 31857 + }, + { + "epoch": 0.31858, + "grad_norm": 0.744009910017858, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 31858 + }, + { + "epoch": 0.31859, + "grad_norm": 0.7809740345878033, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 31859 + }, + { + "epoch": 0.3186, + "grad_norm": 0.7778039148190007, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 31860 + }, + { + "epoch": 0.31861, + "grad_norm": 0.7150061711429323, + "learning_rate": 0.003, + "loss": 3.9965, + "step": 31861 + }, + { + "epoch": 0.31862, + "grad_norm": 0.644799035742603, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 31862 + }, + { + "epoch": 0.31863, + "grad_norm": 0.6126949802215979, + "learning_rate": 0.003, + "loss": 4.035, + "step": 31863 + }, + { + "epoch": 0.31864, + "grad_norm": 0.636018137498393, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 31864 + }, + { + "epoch": 0.31865, + "grad_norm": 0.7569269767137659, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 31865 + }, + { + "epoch": 0.31866, + "grad_norm": 0.9729876961048175, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 31866 + }, + { + "epoch": 0.31867, + "grad_norm": 1.1953277764355161, + "learning_rate": 0.003, + "loss": 4.077, + "step": 31867 + }, + { + "epoch": 0.31868, + "grad_norm": 0.7253513343920129, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 31868 + }, + { + "epoch": 0.31869, + "grad_norm": 0.7241339929545165, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 31869 + }, + { + "epoch": 0.3187, + "grad_norm": 0.7606228143120478, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 31870 + }, + { + "epoch": 0.31871, + "grad_norm": 1.0278337448483394, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 31871 + }, + { + "epoch": 0.31872, + "grad_norm": 1.148506643356017, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 31872 + }, + { + "epoch": 0.31873, + "grad_norm": 0.8803192898809925, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 31873 + }, + { + "epoch": 0.31874, + "grad_norm": 0.935921801561099, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 31874 + }, + { + "epoch": 0.31875, + "grad_norm": 0.8294268018534112, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 31875 + }, + { + "epoch": 0.31876, + "grad_norm": 0.870504744694578, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 31876 + }, + { + "epoch": 0.31877, + "grad_norm": 0.9819034957616358, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 31877 + }, + { + "epoch": 0.31878, + "grad_norm": 1.2297107231441387, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 31878 + }, + { + "epoch": 0.31879, + "grad_norm": 0.7977295834568685, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 31879 + }, + { + "epoch": 0.3188, + "grad_norm": 0.9444682432242328, + "learning_rate": 0.003, + "loss": 4.072, + "step": 31880 + }, + { + "epoch": 0.31881, + "grad_norm": 0.9603971893538169, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 31881 + }, + { + "epoch": 0.31882, + "grad_norm": 0.9146825209963074, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 31882 + }, + { + "epoch": 0.31883, + "grad_norm": 0.9947232258316071, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 31883 + }, + { + "epoch": 0.31884, + "grad_norm": 1.0714214145737804, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 31884 + }, + { + "epoch": 0.31885, + "grad_norm": 0.8610196917402594, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31885 + }, + { + "epoch": 0.31886, + "grad_norm": 0.7501043688858714, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 31886 + }, + { + "epoch": 0.31887, + "grad_norm": 0.8532363873883472, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 31887 + }, + { + "epoch": 0.31888, + "grad_norm": 0.9179891817940065, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 31888 + }, + { + "epoch": 0.31889, + "grad_norm": 0.8141065635257053, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 31889 + }, + { + "epoch": 0.3189, + "grad_norm": 0.7740498787229562, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 31890 + }, + { + "epoch": 0.31891, + "grad_norm": 0.726347293847583, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 31891 + }, + { + "epoch": 0.31892, + "grad_norm": 0.682267283870932, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 31892 + }, + { + "epoch": 0.31893, + "grad_norm": 0.7880407825475153, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 31893 + }, + { + "epoch": 0.31894, + "grad_norm": 0.7971800775877619, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 31894 + }, + { + "epoch": 0.31895, + "grad_norm": 0.7432508734361885, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 31895 + }, + { + "epoch": 0.31896, + "grad_norm": 0.8030601661175567, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 31896 + }, + { + "epoch": 0.31897, + "grad_norm": 0.7918071698226915, + "learning_rate": 0.003, + "loss": 4.057, + "step": 31897 + }, + { + "epoch": 0.31898, + "grad_norm": 0.7323831223777469, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 31898 + }, + { + "epoch": 0.31899, + "grad_norm": 0.6252765559134325, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 31899 + }, + { + "epoch": 0.319, + "grad_norm": 0.6237808591411749, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 31900 + }, + { + "epoch": 0.31901, + "grad_norm": 0.6316253014947295, + "learning_rate": 0.003, + "loss": 4.033, + "step": 31901 + }, + { + "epoch": 0.31902, + "grad_norm": 0.6599030080662733, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31902 + }, + { + "epoch": 0.31903, + "grad_norm": 0.7019997153506355, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 31903 + }, + { + "epoch": 0.31904, + "grad_norm": 1.0287377635358812, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 31904 + }, + { + "epoch": 0.31905, + "grad_norm": 1.4557453400171543, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31905 + }, + { + "epoch": 0.31906, + "grad_norm": 0.4650270959278082, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 31906 + }, + { + "epoch": 0.31907, + "grad_norm": 0.9024237864340765, + "learning_rate": 0.003, + "loss": 4.068, + "step": 31907 + }, + { + "epoch": 0.31908, + "grad_norm": 1.0455128937368943, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 31908 + }, + { + "epoch": 0.31909, + "grad_norm": 0.909997028275504, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 31909 + }, + { + "epoch": 0.3191, + "grad_norm": 0.8968564276745242, + "learning_rate": 0.003, + "loss": 4.053, + "step": 31910 + }, + { + "epoch": 0.31911, + "grad_norm": 0.8784667503563353, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31911 + }, + { + "epoch": 0.31912, + "grad_norm": 0.7921184072901972, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 31912 + }, + { + "epoch": 0.31913, + "grad_norm": 0.6897666211196851, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31913 + }, + { + "epoch": 0.31914, + "grad_norm": 0.647771668483397, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 31914 + }, + { + "epoch": 0.31915, + "grad_norm": 0.7268458730937448, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 31915 + }, + { + "epoch": 0.31916, + "grad_norm": 0.845508764329991, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 31916 + }, + { + "epoch": 0.31917, + "grad_norm": 1.1616235133580697, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 31917 + }, + { + "epoch": 0.31918, + "grad_norm": 0.9374671244888748, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 31918 + }, + { + "epoch": 0.31919, + "grad_norm": 0.7963473771294272, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 31919 + }, + { + "epoch": 0.3192, + "grad_norm": 0.7485844335205557, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 31920 + }, + { + "epoch": 0.31921, + "grad_norm": 0.766799389457276, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 31921 + }, + { + "epoch": 0.31922, + "grad_norm": 0.7377989774261214, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31922 + }, + { + "epoch": 0.31923, + "grad_norm": 0.7098106538463989, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 31923 + }, + { + "epoch": 0.31924, + "grad_norm": 0.6852794619243266, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 31924 + }, + { + "epoch": 0.31925, + "grad_norm": 0.8898638814841711, + "learning_rate": 0.003, + "loss": 4.041, + "step": 31925 + }, + { + "epoch": 0.31926, + "grad_norm": 1.1011865426529608, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 31926 + }, + { + "epoch": 0.31927, + "grad_norm": 0.9805539855453584, + "learning_rate": 0.003, + "loss": 4.02, + "step": 31927 + }, + { + "epoch": 0.31928, + "grad_norm": 0.9390630574992901, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 31928 + }, + { + "epoch": 0.31929, + "grad_norm": 0.8820689706047953, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 31929 + }, + { + "epoch": 0.3193, + "grad_norm": 0.966870621329203, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 31930 + }, + { + "epoch": 0.31931, + "grad_norm": 0.816414801442235, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 31931 + }, + { + "epoch": 0.31932, + "grad_norm": 0.7819685647382922, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 31932 + }, + { + "epoch": 0.31933, + "grad_norm": 0.7446792355799742, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 31933 + }, + { + "epoch": 0.31934, + "grad_norm": 0.7310166143431301, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 31934 + }, + { + "epoch": 0.31935, + "grad_norm": 0.742723102377141, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 31935 + }, + { + "epoch": 0.31936, + "grad_norm": 0.8960097343645532, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 31936 + }, + { + "epoch": 0.31937, + "grad_norm": 1.1144163132169604, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 31937 + }, + { + "epoch": 0.31938, + "grad_norm": 1.166727186821888, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 31938 + }, + { + "epoch": 0.31939, + "grad_norm": 0.8322959088244993, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 31939 + }, + { + "epoch": 0.3194, + "grad_norm": 0.6499166343754063, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 31940 + }, + { + "epoch": 0.31941, + "grad_norm": 0.6762872507013268, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 31941 + }, + { + "epoch": 0.31942, + "grad_norm": 0.7360632590097111, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 31942 + }, + { + "epoch": 0.31943, + "grad_norm": 0.8995858736423183, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 31943 + }, + { + "epoch": 0.31944, + "grad_norm": 0.9958386505954763, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 31944 + }, + { + "epoch": 0.31945, + "grad_norm": 1.0567270409490133, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 31945 + }, + { + "epoch": 0.31946, + "grad_norm": 0.7778097893841952, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 31946 + }, + { + "epoch": 0.31947, + "grad_norm": 0.7347048817329094, + "learning_rate": 0.003, + "loss": 4.038, + "step": 31947 + }, + { + "epoch": 0.31948, + "grad_norm": 0.751947048501362, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 31948 + }, + { + "epoch": 0.31949, + "grad_norm": 0.7483546704206897, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 31949 + }, + { + "epoch": 0.3195, + "grad_norm": 0.8985366964581385, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 31950 + }, + { + "epoch": 0.31951, + "grad_norm": 1.1174277150462961, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 31951 + }, + { + "epoch": 0.31952, + "grad_norm": 0.783229845699387, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 31952 + }, + { + "epoch": 0.31953, + "grad_norm": 0.7086187501330949, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 31953 + }, + { + "epoch": 0.31954, + "grad_norm": 0.753200063198879, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 31954 + }, + { + "epoch": 0.31955, + "grad_norm": 0.7130914308031854, + "learning_rate": 0.003, + "loss": 4.037, + "step": 31955 + }, + { + "epoch": 0.31956, + "grad_norm": 0.715177737475376, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 31956 + }, + { + "epoch": 0.31957, + "grad_norm": 0.9306238073941161, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 31957 + }, + { + "epoch": 0.31958, + "grad_norm": 1.1424727918683524, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 31958 + }, + { + "epoch": 0.31959, + "grad_norm": 0.7984644889502183, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 31959 + }, + { + "epoch": 0.3196, + "grad_norm": 0.7409439322315312, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 31960 + }, + { + "epoch": 0.31961, + "grad_norm": 0.6104016610407806, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 31961 + }, + { + "epoch": 0.31962, + "grad_norm": 0.6082768451244568, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 31962 + }, + { + "epoch": 0.31963, + "grad_norm": 0.6410355093824731, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 31963 + }, + { + "epoch": 0.31964, + "grad_norm": 0.6514978851839643, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 31964 + }, + { + "epoch": 0.31965, + "grad_norm": 0.760157681534477, + "learning_rate": 0.003, + "loss": 4.088, + "step": 31965 + }, + { + "epoch": 0.31966, + "grad_norm": 0.7916998017448065, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 31966 + }, + { + "epoch": 0.31967, + "grad_norm": 0.7310848997868826, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 31967 + }, + { + "epoch": 0.31968, + "grad_norm": 0.758928172355569, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 31968 + }, + { + "epoch": 0.31969, + "grad_norm": 0.7843576241612726, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 31969 + }, + { + "epoch": 0.3197, + "grad_norm": 0.7893309438032472, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 31970 + }, + { + "epoch": 0.31971, + "grad_norm": 0.7623347897582147, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 31971 + }, + { + "epoch": 0.31972, + "grad_norm": 0.7283270775986949, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 31972 + }, + { + "epoch": 0.31973, + "grad_norm": 0.7759202820621282, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 31973 + }, + { + "epoch": 0.31974, + "grad_norm": 0.9569202192117368, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 31974 + }, + { + "epoch": 0.31975, + "grad_norm": 1.1699870708580904, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 31975 + }, + { + "epoch": 0.31976, + "grad_norm": 1.1598244945937317, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 31976 + }, + { + "epoch": 0.31977, + "grad_norm": 1.0714472782539628, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 31977 + }, + { + "epoch": 0.31978, + "grad_norm": 0.9585752891570171, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 31978 + }, + { + "epoch": 0.31979, + "grad_norm": 0.9935764844332361, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 31979 + }, + { + "epoch": 0.3198, + "grad_norm": 0.917970692055569, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 31980 + }, + { + "epoch": 0.31981, + "grad_norm": 0.8948464076222182, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 31981 + }, + { + "epoch": 0.31982, + "grad_norm": 0.9420091259137481, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 31982 + }, + { + "epoch": 0.31983, + "grad_norm": 0.9080774246774516, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 31983 + }, + { + "epoch": 0.31984, + "grad_norm": 0.8882614337397354, + "learning_rate": 0.003, + "loss": 4.041, + "step": 31984 + }, + { + "epoch": 0.31985, + "grad_norm": 0.9289677986427315, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 31985 + }, + { + "epoch": 0.31986, + "grad_norm": 0.9078188305859279, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 31986 + }, + { + "epoch": 0.31987, + "grad_norm": 0.9063446703135518, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 31987 + }, + { + "epoch": 0.31988, + "grad_norm": 0.9184602061902861, + "learning_rate": 0.003, + "loss": 4.051, + "step": 31988 + }, + { + "epoch": 0.31989, + "grad_norm": 0.8558604591744582, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 31989 + }, + { + "epoch": 0.3199, + "grad_norm": 0.8343724839043628, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 31990 + }, + { + "epoch": 0.31991, + "grad_norm": 0.8710951068245222, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 31991 + }, + { + "epoch": 0.31992, + "grad_norm": 0.9902186178784111, + "learning_rate": 0.003, + "loss": 4.052, + "step": 31992 + }, + { + "epoch": 0.31993, + "grad_norm": 1.0243591008733897, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 31993 + }, + { + "epoch": 0.31994, + "grad_norm": 0.957746057148872, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 31994 + }, + { + "epoch": 0.31995, + "grad_norm": 0.9123419283478792, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 31995 + }, + { + "epoch": 0.31996, + "grad_norm": 1.0690989717824446, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 31996 + }, + { + "epoch": 0.31997, + "grad_norm": 1.0593332050725643, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 31997 + }, + { + "epoch": 0.31998, + "grad_norm": 0.9381075966847782, + "learning_rate": 0.003, + "loss": 4.066, + "step": 31998 + }, + { + "epoch": 0.31999, + "grad_norm": 1.027614119195915, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 31999 + }, + { + "epoch": 0.32, + "grad_norm": 0.9937547999191643, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 32000 + }, + { + "epoch": 0.32001, + "grad_norm": 0.8905727663611102, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 32001 + }, + { + "epoch": 0.32002, + "grad_norm": 0.9283239791206751, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 32002 + }, + { + "epoch": 0.32003, + "grad_norm": 1.0712964649809305, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 32003 + }, + { + "epoch": 0.32004, + "grad_norm": 0.9842351369792838, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 32004 + }, + { + "epoch": 0.32005, + "grad_norm": 1.0342623115897605, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 32005 + }, + { + "epoch": 0.32006, + "grad_norm": 1.0297046518059014, + "learning_rate": 0.003, + "loss": 4.08, + "step": 32006 + }, + { + "epoch": 0.32007, + "grad_norm": 1.0348028632206066, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32007 + }, + { + "epoch": 0.32008, + "grad_norm": 0.862864699643544, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 32008 + }, + { + "epoch": 0.32009, + "grad_norm": 0.7739315000977706, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 32009 + }, + { + "epoch": 0.3201, + "grad_norm": 0.7595435751321571, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 32010 + }, + { + "epoch": 0.32011, + "grad_norm": 0.8608867843721222, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 32011 + }, + { + "epoch": 0.32012, + "grad_norm": 0.8328745602274971, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 32012 + }, + { + "epoch": 0.32013, + "grad_norm": 0.7177502500998435, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 32013 + }, + { + "epoch": 0.32014, + "grad_norm": 0.552308721647943, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32014 + }, + { + "epoch": 0.32015, + "grad_norm": 0.5345347648465099, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 32015 + }, + { + "epoch": 0.32016, + "grad_norm": 0.5636578262000033, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 32016 + }, + { + "epoch": 0.32017, + "grad_norm": 0.6157988675474264, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 32017 + }, + { + "epoch": 0.32018, + "grad_norm": 0.6951204648170406, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 32018 + }, + { + "epoch": 0.32019, + "grad_norm": 0.813312603394071, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 32019 + }, + { + "epoch": 0.3202, + "grad_norm": 0.9478989511370306, + "learning_rate": 0.003, + "loss": 4.039, + "step": 32020 + }, + { + "epoch": 0.32021, + "grad_norm": 1.125575212766914, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 32021 + }, + { + "epoch": 0.32022, + "grad_norm": 0.7704911596245103, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 32022 + }, + { + "epoch": 0.32023, + "grad_norm": 0.6673891044289912, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 32023 + }, + { + "epoch": 0.32024, + "grad_norm": 0.5042939130741306, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 32024 + }, + { + "epoch": 0.32025, + "grad_norm": 0.45621597283630866, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 32025 + }, + { + "epoch": 0.32026, + "grad_norm": 0.454114493943505, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 32026 + }, + { + "epoch": 0.32027, + "grad_norm": 0.4850994891568993, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 32027 + }, + { + "epoch": 0.32028, + "grad_norm": 0.47933033162671634, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 32028 + }, + { + "epoch": 0.32029, + "grad_norm": 0.5415419166312868, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 32029 + }, + { + "epoch": 0.3203, + "grad_norm": 0.5856828794473122, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 32030 + }, + { + "epoch": 0.32031, + "grad_norm": 0.7433198055512185, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 32031 + }, + { + "epoch": 0.32032, + "grad_norm": 1.0725019398351328, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 32032 + }, + { + "epoch": 0.32033, + "grad_norm": 0.9178820247270171, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 32033 + }, + { + "epoch": 0.32034, + "grad_norm": 0.6766540669992053, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 32034 + }, + { + "epoch": 0.32035, + "grad_norm": 0.5881938728460938, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 32035 + }, + { + "epoch": 0.32036, + "grad_norm": 0.7511388806391386, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 32036 + }, + { + "epoch": 0.32037, + "grad_norm": 1.0097299657858265, + "learning_rate": 0.003, + "loss": 4.004, + "step": 32037 + }, + { + "epoch": 0.32038, + "grad_norm": 1.1085428214065751, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 32038 + }, + { + "epoch": 0.32039, + "grad_norm": 0.8595820754968798, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 32039 + }, + { + "epoch": 0.3204, + "grad_norm": 0.8859817443084181, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 32040 + }, + { + "epoch": 0.32041, + "grad_norm": 0.8161877203939851, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 32041 + }, + { + "epoch": 0.32042, + "grad_norm": 0.7726546588569224, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 32042 + }, + { + "epoch": 0.32043, + "grad_norm": 0.8550862182864043, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 32043 + }, + { + "epoch": 0.32044, + "grad_norm": 0.8593122043528518, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32044 + }, + { + "epoch": 0.32045, + "grad_norm": 0.9016315772440688, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 32045 + }, + { + "epoch": 0.32046, + "grad_norm": 0.9641285966818746, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 32046 + }, + { + "epoch": 0.32047, + "grad_norm": 0.8240340616146082, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 32047 + }, + { + "epoch": 0.32048, + "grad_norm": 0.8257327154403905, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 32048 + }, + { + "epoch": 0.32049, + "grad_norm": 0.7431354495647136, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 32049 + }, + { + "epoch": 0.3205, + "grad_norm": 0.8331736631628603, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 32050 + }, + { + "epoch": 0.32051, + "grad_norm": 0.9813366636502296, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 32051 + }, + { + "epoch": 0.32052, + "grad_norm": 1.1221775774811966, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 32052 + }, + { + "epoch": 0.32053, + "grad_norm": 0.8054868597704876, + "learning_rate": 0.003, + "loss": 4.032, + "step": 32053 + }, + { + "epoch": 0.32054, + "grad_norm": 0.706791238466667, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 32054 + }, + { + "epoch": 0.32055, + "grad_norm": 0.6960522115682027, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 32055 + }, + { + "epoch": 0.32056, + "grad_norm": 0.8509576114290087, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32056 + }, + { + "epoch": 0.32057, + "grad_norm": 1.0794484715577326, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 32057 + }, + { + "epoch": 0.32058, + "grad_norm": 1.111290368615515, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 32058 + }, + { + "epoch": 0.32059, + "grad_norm": 0.9880047681132434, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 32059 + }, + { + "epoch": 0.3206, + "grad_norm": 1.0896215378464862, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 32060 + }, + { + "epoch": 0.32061, + "grad_norm": 0.9015748358552653, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 32061 + }, + { + "epoch": 0.32062, + "grad_norm": 0.8370710672603827, + "learning_rate": 0.003, + "loss": 4.038, + "step": 32062 + }, + { + "epoch": 0.32063, + "grad_norm": 0.6789203115178115, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 32063 + }, + { + "epoch": 0.32064, + "grad_norm": 0.5744165149204089, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 32064 + }, + { + "epoch": 0.32065, + "grad_norm": 0.5923086862454859, + "learning_rate": 0.003, + "loss": 4.043, + "step": 32065 + }, + { + "epoch": 0.32066, + "grad_norm": 0.5769409259527578, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 32066 + }, + { + "epoch": 0.32067, + "grad_norm": 0.5394694322475562, + "learning_rate": 0.003, + "loss": 4.036, + "step": 32067 + }, + { + "epoch": 0.32068, + "grad_norm": 0.6265894815291089, + "learning_rate": 0.003, + "loss": 4.063, + "step": 32068 + }, + { + "epoch": 0.32069, + "grad_norm": 0.7129656932181448, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 32069 + }, + { + "epoch": 0.3207, + "grad_norm": 0.8413683669794733, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 32070 + }, + { + "epoch": 0.32071, + "grad_norm": 1.084772940164286, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 32071 + }, + { + "epoch": 0.32072, + "grad_norm": 1.06087527649488, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 32072 + }, + { + "epoch": 0.32073, + "grad_norm": 1.1109637223846296, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 32073 + }, + { + "epoch": 0.32074, + "grad_norm": 0.8431063578273923, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 32074 + }, + { + "epoch": 0.32075, + "grad_norm": 0.6849595773444788, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 32075 + }, + { + "epoch": 0.32076, + "grad_norm": 0.6116047493926089, + "learning_rate": 0.003, + "loss": 4.014, + "step": 32076 + }, + { + "epoch": 0.32077, + "grad_norm": 0.6902054443464168, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 32077 + }, + { + "epoch": 0.32078, + "grad_norm": 0.7339449794102021, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 32078 + }, + { + "epoch": 0.32079, + "grad_norm": 0.7340248541272907, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 32079 + }, + { + "epoch": 0.3208, + "grad_norm": 0.8109322069769744, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 32080 + }, + { + "epoch": 0.32081, + "grad_norm": 0.85212664842251, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 32081 + }, + { + "epoch": 0.32082, + "grad_norm": 0.9696166006524929, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 32082 + }, + { + "epoch": 0.32083, + "grad_norm": 1.100816221844497, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 32083 + }, + { + "epoch": 0.32084, + "grad_norm": 0.8610194984483136, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 32084 + }, + { + "epoch": 0.32085, + "grad_norm": 0.8404777189188042, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 32085 + }, + { + "epoch": 0.32086, + "grad_norm": 0.8208337509444262, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 32086 + }, + { + "epoch": 0.32087, + "grad_norm": 0.9331090670080582, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 32087 + }, + { + "epoch": 0.32088, + "grad_norm": 1.1745248102925971, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 32088 + }, + { + "epoch": 0.32089, + "grad_norm": 0.90800290391079, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 32089 + }, + { + "epoch": 0.3209, + "grad_norm": 0.803760798579035, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 32090 + }, + { + "epoch": 0.32091, + "grad_norm": 0.7298883312138952, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 32091 + }, + { + "epoch": 0.32092, + "grad_norm": 0.8000463583581157, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 32092 + }, + { + "epoch": 0.32093, + "grad_norm": 1.0098026179270745, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 32093 + }, + { + "epoch": 0.32094, + "grad_norm": 0.9871673356263472, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 32094 + }, + { + "epoch": 0.32095, + "grad_norm": 0.9244930998870973, + "learning_rate": 0.003, + "loss": 4.07, + "step": 32095 + }, + { + "epoch": 0.32096, + "grad_norm": 0.8390389165978216, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 32096 + }, + { + "epoch": 0.32097, + "grad_norm": 0.9046044260634625, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 32097 + }, + { + "epoch": 0.32098, + "grad_norm": 0.9711921274365544, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 32098 + }, + { + "epoch": 0.32099, + "grad_norm": 0.882981613880237, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 32099 + }, + { + "epoch": 0.321, + "grad_norm": 0.9215891946754206, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 32100 + }, + { + "epoch": 0.32101, + "grad_norm": 0.9419135824789112, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 32101 + }, + { + "epoch": 0.32102, + "grad_norm": 0.8762218318332125, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 32102 + }, + { + "epoch": 0.32103, + "grad_norm": 0.9633899480890369, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 32103 + }, + { + "epoch": 0.32104, + "grad_norm": 1.0866969231590793, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 32104 + }, + { + "epoch": 0.32105, + "grad_norm": 0.8925659852896375, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 32105 + }, + { + "epoch": 0.32106, + "grad_norm": 0.8519269665352203, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 32106 + }, + { + "epoch": 0.32107, + "grad_norm": 0.8586792653479003, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 32107 + }, + { + "epoch": 0.32108, + "grad_norm": 0.8868928651513859, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 32108 + }, + { + "epoch": 0.32109, + "grad_norm": 1.0358699712309498, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 32109 + }, + { + "epoch": 0.3211, + "grad_norm": 1.0209306985413853, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32110 + }, + { + "epoch": 0.32111, + "grad_norm": 1.1961841303130454, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 32111 + }, + { + "epoch": 0.32112, + "grad_norm": 0.8890401509451059, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 32112 + }, + { + "epoch": 0.32113, + "grad_norm": 0.7257698596482696, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 32113 + }, + { + "epoch": 0.32114, + "grad_norm": 0.7600299616133965, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 32114 + }, + { + "epoch": 0.32115, + "grad_norm": 0.8066728727239163, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 32115 + }, + { + "epoch": 0.32116, + "grad_norm": 0.7775506323895115, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 32116 + }, + { + "epoch": 0.32117, + "grad_norm": 0.799621343871532, + "learning_rate": 0.003, + "loss": 4.034, + "step": 32117 + }, + { + "epoch": 0.32118, + "grad_norm": 0.7836591909383303, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 32118 + }, + { + "epoch": 0.32119, + "grad_norm": 0.7575670372501762, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 32119 + }, + { + "epoch": 0.3212, + "grad_norm": 0.6859761449611931, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 32120 + }, + { + "epoch": 0.32121, + "grad_norm": 0.6530805590337593, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 32121 + }, + { + "epoch": 0.32122, + "grad_norm": 0.6686021483111871, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 32122 + }, + { + "epoch": 0.32123, + "grad_norm": 0.6783093662036036, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 32123 + }, + { + "epoch": 0.32124, + "grad_norm": 0.6840653642252628, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 32124 + }, + { + "epoch": 0.32125, + "grad_norm": 0.7274586638600474, + "learning_rate": 0.003, + "loss": 4.055, + "step": 32125 + }, + { + "epoch": 0.32126, + "grad_norm": 0.7482039171449799, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 32126 + }, + { + "epoch": 0.32127, + "grad_norm": 0.748509753445551, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 32127 + }, + { + "epoch": 0.32128, + "grad_norm": 0.7683615222643407, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 32128 + }, + { + "epoch": 0.32129, + "grad_norm": 0.7865344886340901, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 32129 + }, + { + "epoch": 0.3213, + "grad_norm": 0.7955579252103503, + "learning_rate": 0.003, + "loss": 3.9908, + "step": 32130 + }, + { + "epoch": 0.32131, + "grad_norm": 0.8543216598714775, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 32131 + }, + { + "epoch": 0.32132, + "grad_norm": 0.9790653943017321, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 32132 + }, + { + "epoch": 0.32133, + "grad_norm": 1.0591000207726944, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 32133 + }, + { + "epoch": 0.32134, + "grad_norm": 0.905965588264445, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 32134 + }, + { + "epoch": 0.32135, + "grad_norm": 0.7624284926349406, + "learning_rate": 0.003, + "loss": 4.032, + "step": 32135 + }, + { + "epoch": 0.32136, + "grad_norm": 0.8409851542076126, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 32136 + }, + { + "epoch": 0.32137, + "grad_norm": 0.8487114988418342, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 32137 + }, + { + "epoch": 0.32138, + "grad_norm": 0.9158504983676331, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 32138 + }, + { + "epoch": 0.32139, + "grad_norm": 1.0736450333376792, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 32139 + }, + { + "epoch": 0.3214, + "grad_norm": 1.0195628717716176, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 32140 + }, + { + "epoch": 0.32141, + "grad_norm": 0.9963059262121619, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 32141 + }, + { + "epoch": 0.32142, + "grad_norm": 0.9216265487146964, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 32142 + }, + { + "epoch": 0.32143, + "grad_norm": 0.9165688487642132, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 32143 + }, + { + "epoch": 0.32144, + "grad_norm": 0.9263856305943822, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 32144 + }, + { + "epoch": 0.32145, + "grad_norm": 1.0283616564988651, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 32145 + }, + { + "epoch": 0.32146, + "grad_norm": 1.0672700688422256, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 32146 + }, + { + "epoch": 0.32147, + "grad_norm": 1.0271712466774654, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 32147 + }, + { + "epoch": 0.32148, + "grad_norm": 0.9855380513077183, + "learning_rate": 0.003, + "loss": 4.009, + "step": 32148 + }, + { + "epoch": 0.32149, + "grad_norm": 1.0045653473201643, + "learning_rate": 0.003, + "loss": 3.9872, + "step": 32149 + }, + { + "epoch": 0.3215, + "grad_norm": 0.975240632406502, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 32150 + }, + { + "epoch": 0.32151, + "grad_norm": 0.8976111961558524, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 32151 + }, + { + "epoch": 0.32152, + "grad_norm": 0.899894650641608, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 32152 + }, + { + "epoch": 0.32153, + "grad_norm": 0.9430697230794242, + "learning_rate": 0.003, + "loss": 4.053, + "step": 32153 + }, + { + "epoch": 0.32154, + "grad_norm": 0.95495918562649, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32154 + }, + { + "epoch": 0.32155, + "grad_norm": 0.998530802639755, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32155 + }, + { + "epoch": 0.32156, + "grad_norm": 1.0339729858959528, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32156 + }, + { + "epoch": 0.32157, + "grad_norm": 0.8324672480654042, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 32157 + }, + { + "epoch": 0.32158, + "grad_norm": 0.6754175273029288, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 32158 + }, + { + "epoch": 0.32159, + "grad_norm": 0.7061527675052528, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 32159 + }, + { + "epoch": 0.3216, + "grad_norm": 0.682645671276685, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 32160 + }, + { + "epoch": 0.32161, + "grad_norm": 0.7226975796243015, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 32161 + }, + { + "epoch": 0.32162, + "grad_norm": 0.6423839759714424, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 32162 + }, + { + "epoch": 0.32163, + "grad_norm": 0.6171529861686078, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 32163 + }, + { + "epoch": 0.32164, + "grad_norm": 0.5535345037800211, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32164 + }, + { + "epoch": 0.32165, + "grad_norm": 0.6144168666594476, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 32165 + }, + { + "epoch": 0.32166, + "grad_norm": 0.6993477491479677, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 32166 + }, + { + "epoch": 0.32167, + "grad_norm": 0.8569248077675105, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 32167 + }, + { + "epoch": 0.32168, + "grad_norm": 0.9530898602539767, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 32168 + }, + { + "epoch": 0.32169, + "grad_norm": 0.9685158671700538, + "learning_rate": 0.003, + "loss": 4.0756, + "step": 32169 + }, + { + "epoch": 0.3217, + "grad_norm": 1.036377177468917, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 32170 + }, + { + "epoch": 0.32171, + "grad_norm": 0.971099752842579, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 32171 + }, + { + "epoch": 0.32172, + "grad_norm": 1.005443241965482, + "learning_rate": 0.003, + "loss": 4.072, + "step": 32172 + }, + { + "epoch": 0.32173, + "grad_norm": 0.8936870380582859, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 32173 + }, + { + "epoch": 0.32174, + "grad_norm": 0.8432079011872359, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32174 + }, + { + "epoch": 0.32175, + "grad_norm": 0.8515503027872529, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 32175 + }, + { + "epoch": 0.32176, + "grad_norm": 0.7896117432751558, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 32176 + }, + { + "epoch": 0.32177, + "grad_norm": 0.8345284469269142, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 32177 + }, + { + "epoch": 0.32178, + "grad_norm": 0.8153377890763066, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 32178 + }, + { + "epoch": 0.32179, + "grad_norm": 0.6914299117044597, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 32179 + }, + { + "epoch": 0.3218, + "grad_norm": 0.6469985704339442, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 32180 + }, + { + "epoch": 0.32181, + "grad_norm": 0.6382553203869028, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 32181 + }, + { + "epoch": 0.32182, + "grad_norm": 0.6481811022517221, + "learning_rate": 0.003, + "loss": 4.033, + "step": 32182 + }, + { + "epoch": 0.32183, + "grad_norm": 0.7689969069219974, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 32183 + }, + { + "epoch": 0.32184, + "grad_norm": 0.9185585922855081, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 32184 + }, + { + "epoch": 0.32185, + "grad_norm": 0.9686786411759901, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 32185 + }, + { + "epoch": 0.32186, + "grad_norm": 0.9153408590140227, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 32186 + }, + { + "epoch": 0.32187, + "grad_norm": 0.7854690370017651, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 32187 + }, + { + "epoch": 0.32188, + "grad_norm": 0.6547452956881445, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 32188 + }, + { + "epoch": 0.32189, + "grad_norm": 0.6018545522644374, + "learning_rate": 0.003, + "loss": 4.025, + "step": 32189 + }, + { + "epoch": 0.3219, + "grad_norm": 0.5654801667369025, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 32190 + }, + { + "epoch": 0.32191, + "grad_norm": 0.5998660720510379, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 32191 + }, + { + "epoch": 0.32192, + "grad_norm": 0.6835821034539953, + "learning_rate": 0.003, + "loss": 4.024, + "step": 32192 + }, + { + "epoch": 0.32193, + "grad_norm": 0.8830123389916543, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 32193 + }, + { + "epoch": 0.32194, + "grad_norm": 1.0605787209653907, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 32194 + }, + { + "epoch": 0.32195, + "grad_norm": 0.8424570982848152, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 32195 + }, + { + "epoch": 0.32196, + "grad_norm": 0.7803156647175802, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 32196 + }, + { + "epoch": 0.32197, + "grad_norm": 0.8265753317923835, + "learning_rate": 0.003, + "loss": 4.058, + "step": 32197 + }, + { + "epoch": 0.32198, + "grad_norm": 0.9310412078309459, + "learning_rate": 0.003, + "loss": 4.0869, + "step": 32198 + }, + { + "epoch": 0.32199, + "grad_norm": 1.0017448024794426, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 32199 + }, + { + "epoch": 0.322, + "grad_norm": 0.9041079476170394, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 32200 + }, + { + "epoch": 0.32201, + "grad_norm": 1.1173892516809483, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 32201 + }, + { + "epoch": 0.32202, + "grad_norm": 1.0807424930459617, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 32202 + }, + { + "epoch": 0.32203, + "grad_norm": 0.9509635180401249, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 32203 + }, + { + "epoch": 0.32204, + "grad_norm": 0.9882575237381347, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 32204 + }, + { + "epoch": 0.32205, + "grad_norm": 0.8903634729363463, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 32205 + }, + { + "epoch": 0.32206, + "grad_norm": 0.715512013027559, + "learning_rate": 0.003, + "loss": 4.062, + "step": 32206 + }, + { + "epoch": 0.32207, + "grad_norm": 0.6368173781783356, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 32207 + }, + { + "epoch": 0.32208, + "grad_norm": 0.571687560947578, + "learning_rate": 0.003, + "loss": 4.049, + "step": 32208 + }, + { + "epoch": 0.32209, + "grad_norm": 0.48536364745086374, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 32209 + }, + { + "epoch": 0.3221, + "grad_norm": 0.5017539330898301, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32210 + }, + { + "epoch": 0.32211, + "grad_norm": 0.5651514044797669, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 32211 + }, + { + "epoch": 0.32212, + "grad_norm": 0.6390221402605062, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 32212 + }, + { + "epoch": 0.32213, + "grad_norm": 0.6826391376508518, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 32213 + }, + { + "epoch": 0.32214, + "grad_norm": 0.7487190103251911, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 32214 + }, + { + "epoch": 0.32215, + "grad_norm": 0.8442246955186923, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 32215 + }, + { + "epoch": 0.32216, + "grad_norm": 1.094984354421479, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32216 + }, + { + "epoch": 0.32217, + "grad_norm": 1.038931952573693, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 32217 + }, + { + "epoch": 0.32218, + "grad_norm": 0.8935098924138787, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32218 + }, + { + "epoch": 0.32219, + "grad_norm": 0.8249351776654388, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 32219 + }, + { + "epoch": 0.3222, + "grad_norm": 0.8168983319342156, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 32220 + }, + { + "epoch": 0.32221, + "grad_norm": 0.7735788835321203, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 32221 + }, + { + "epoch": 0.32222, + "grad_norm": 0.8034233429912587, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 32222 + }, + { + "epoch": 0.32223, + "grad_norm": 1.005386476597215, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 32223 + }, + { + "epoch": 0.32224, + "grad_norm": 1.2644133675662077, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 32224 + }, + { + "epoch": 0.32225, + "grad_norm": 0.7334006188486157, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 32225 + }, + { + "epoch": 0.32226, + "grad_norm": 0.6514463860960846, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 32226 + }, + { + "epoch": 0.32227, + "grad_norm": 0.7512507739703294, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 32227 + }, + { + "epoch": 0.32228, + "grad_norm": 0.7550455186600876, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 32228 + }, + { + "epoch": 0.32229, + "grad_norm": 0.7218245832051776, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 32229 + }, + { + "epoch": 0.3223, + "grad_norm": 0.7698463998195858, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 32230 + }, + { + "epoch": 0.32231, + "grad_norm": 0.8168956262351593, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 32231 + }, + { + "epoch": 0.32232, + "grad_norm": 0.7339963852839679, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 32232 + }, + { + "epoch": 0.32233, + "grad_norm": 0.7360059569554366, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32233 + }, + { + "epoch": 0.32234, + "grad_norm": 0.746988054502412, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 32234 + }, + { + "epoch": 0.32235, + "grad_norm": 0.8784104458965614, + "learning_rate": 0.003, + "loss": 3.9938, + "step": 32235 + }, + { + "epoch": 0.32236, + "grad_norm": 0.9218926344024032, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 32236 + }, + { + "epoch": 0.32237, + "grad_norm": 1.0026539694055925, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 32237 + }, + { + "epoch": 0.32238, + "grad_norm": 1.2020239460106188, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 32238 + }, + { + "epoch": 0.32239, + "grad_norm": 0.775546711058531, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 32239 + }, + { + "epoch": 0.3224, + "grad_norm": 0.6462218624913312, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 32240 + }, + { + "epoch": 0.32241, + "grad_norm": 0.7854321609735561, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 32241 + }, + { + "epoch": 0.32242, + "grad_norm": 0.8734837347510679, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 32242 + }, + { + "epoch": 0.32243, + "grad_norm": 0.7635065962445767, + "learning_rate": 0.003, + "loss": 4.057, + "step": 32243 + }, + { + "epoch": 0.32244, + "grad_norm": 0.8199533440039678, + "learning_rate": 0.003, + "loss": 4.068, + "step": 32244 + }, + { + "epoch": 0.32245, + "grad_norm": 0.8425384495622505, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 32245 + }, + { + "epoch": 0.32246, + "grad_norm": 0.9037748150721758, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 32246 + }, + { + "epoch": 0.32247, + "grad_norm": 1.0834000111593438, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 32247 + }, + { + "epoch": 0.32248, + "grad_norm": 1.1907425594444014, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 32248 + }, + { + "epoch": 0.32249, + "grad_norm": 0.9383884373978388, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 32249 + }, + { + "epoch": 0.3225, + "grad_norm": 0.9833057176108768, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32250 + }, + { + "epoch": 0.32251, + "grad_norm": 0.9738031767988453, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 32251 + }, + { + "epoch": 0.32252, + "grad_norm": 0.9636566641228305, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 32252 + }, + { + "epoch": 0.32253, + "grad_norm": 1.0201503942677856, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 32253 + }, + { + "epoch": 0.32254, + "grad_norm": 0.9424680182651337, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 32254 + }, + { + "epoch": 0.32255, + "grad_norm": 0.9547605731866455, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 32255 + }, + { + "epoch": 0.32256, + "grad_norm": 0.859976616674784, + "learning_rate": 0.003, + "loss": 4.076, + "step": 32256 + }, + { + "epoch": 0.32257, + "grad_norm": 1.0016660211474069, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 32257 + }, + { + "epoch": 0.32258, + "grad_norm": 1.3184395917959293, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 32258 + }, + { + "epoch": 0.32259, + "grad_norm": 0.6678786243340351, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 32259 + }, + { + "epoch": 0.3226, + "grad_norm": 0.7463943905771643, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 32260 + }, + { + "epoch": 0.32261, + "grad_norm": 0.8661045049533692, + "learning_rate": 0.003, + "loss": 4.061, + "step": 32261 + }, + { + "epoch": 0.32262, + "grad_norm": 0.9178604289463683, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 32262 + }, + { + "epoch": 0.32263, + "grad_norm": 0.8620594528146983, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 32263 + }, + { + "epoch": 0.32264, + "grad_norm": 0.7628013365617935, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 32264 + }, + { + "epoch": 0.32265, + "grad_norm": 0.756130514301219, + "learning_rate": 0.003, + "loss": 4.04, + "step": 32265 + }, + { + "epoch": 0.32266, + "grad_norm": 0.834626731311854, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 32266 + }, + { + "epoch": 0.32267, + "grad_norm": 0.8306501767475094, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 32267 + }, + { + "epoch": 0.32268, + "grad_norm": 0.7783538083204286, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 32268 + }, + { + "epoch": 0.32269, + "grad_norm": 0.6832771857852484, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 32269 + }, + { + "epoch": 0.3227, + "grad_norm": 0.6279022135986003, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 32270 + }, + { + "epoch": 0.32271, + "grad_norm": 0.7990760164346076, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 32271 + }, + { + "epoch": 0.32272, + "grad_norm": 1.1266145440985458, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 32272 + }, + { + "epoch": 0.32273, + "grad_norm": 1.2711223108728127, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 32273 + }, + { + "epoch": 0.32274, + "grad_norm": 0.7119963235200429, + "learning_rate": 0.003, + "loss": 4.013, + "step": 32274 + }, + { + "epoch": 0.32275, + "grad_norm": 0.7682000239029506, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 32275 + }, + { + "epoch": 0.32276, + "grad_norm": 0.71769357951076, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 32276 + }, + { + "epoch": 0.32277, + "grad_norm": 0.6932969427499185, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 32277 + }, + { + "epoch": 0.32278, + "grad_norm": 0.6570708939940232, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 32278 + }, + { + "epoch": 0.32279, + "grad_norm": 0.5824043989436387, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 32279 + }, + { + "epoch": 0.3228, + "grad_norm": 0.5444073045830781, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 32280 + }, + { + "epoch": 0.32281, + "grad_norm": 0.6581936598450012, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32281 + }, + { + "epoch": 0.32282, + "grad_norm": 0.9393683566391791, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 32282 + }, + { + "epoch": 0.32283, + "grad_norm": 1.138155339324421, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 32283 + }, + { + "epoch": 0.32284, + "grad_norm": 0.8428264184836244, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 32284 + }, + { + "epoch": 0.32285, + "grad_norm": 0.8812750638218401, + "learning_rate": 0.003, + "loss": 4.032, + "step": 32285 + }, + { + "epoch": 0.32286, + "grad_norm": 0.9279909368478223, + "learning_rate": 0.003, + "loss": 4.044, + "step": 32286 + }, + { + "epoch": 0.32287, + "grad_norm": 1.0236116765055276, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 32287 + }, + { + "epoch": 0.32288, + "grad_norm": 0.9126246407422599, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 32288 + }, + { + "epoch": 0.32289, + "grad_norm": 0.8336149842992552, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 32289 + }, + { + "epoch": 0.3229, + "grad_norm": 0.8977170598224049, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 32290 + }, + { + "epoch": 0.32291, + "grad_norm": 0.9226282466663561, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 32291 + }, + { + "epoch": 0.32292, + "grad_norm": 0.8691613370896669, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 32292 + }, + { + "epoch": 0.32293, + "grad_norm": 0.7741567960866625, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 32293 + }, + { + "epoch": 0.32294, + "grad_norm": 0.8100276953842075, + "learning_rate": 0.003, + "loss": 4.062, + "step": 32294 + }, + { + "epoch": 0.32295, + "grad_norm": 0.7670113540564968, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 32295 + }, + { + "epoch": 0.32296, + "grad_norm": 0.7065423049337619, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 32296 + }, + { + "epoch": 0.32297, + "grad_norm": 0.7762058151688797, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 32297 + }, + { + "epoch": 0.32298, + "grad_norm": 0.7300176619704652, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 32298 + }, + { + "epoch": 0.32299, + "grad_norm": 0.6708670946918538, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 32299 + }, + { + "epoch": 0.323, + "grad_norm": 0.700138469446253, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32300 + }, + { + "epoch": 0.32301, + "grad_norm": 0.8009185701203718, + "learning_rate": 0.003, + "loss": 4.014, + "step": 32301 + }, + { + "epoch": 0.32302, + "grad_norm": 0.9167397325643818, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 32302 + }, + { + "epoch": 0.32303, + "grad_norm": 1.101083733716767, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 32303 + }, + { + "epoch": 0.32304, + "grad_norm": 1.0676385010938045, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 32304 + }, + { + "epoch": 0.32305, + "grad_norm": 0.8468035259911453, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 32305 + }, + { + "epoch": 0.32306, + "grad_norm": 0.7593810368585876, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 32306 + }, + { + "epoch": 0.32307, + "grad_norm": 0.7474168188498049, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 32307 + }, + { + "epoch": 0.32308, + "grad_norm": 0.8604325683152758, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32308 + }, + { + "epoch": 0.32309, + "grad_norm": 0.9743420119431495, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 32309 + }, + { + "epoch": 0.3231, + "grad_norm": 1.0430454088754981, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 32310 + }, + { + "epoch": 0.32311, + "grad_norm": 1.1703433789662439, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 32311 + }, + { + "epoch": 0.32312, + "grad_norm": 0.8182212921381029, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 32312 + }, + { + "epoch": 0.32313, + "grad_norm": 0.6888783538732546, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 32313 + }, + { + "epoch": 0.32314, + "grad_norm": 0.8264579931993787, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 32314 + }, + { + "epoch": 0.32315, + "grad_norm": 0.8145009516368978, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 32315 + }, + { + "epoch": 0.32316, + "grad_norm": 0.9069230108730862, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 32316 + }, + { + "epoch": 0.32317, + "grad_norm": 1.0212741283976756, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 32317 + }, + { + "epoch": 0.32318, + "grad_norm": 0.8522968841919638, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 32318 + }, + { + "epoch": 0.32319, + "grad_norm": 0.8855478760880819, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 32319 + }, + { + "epoch": 0.3232, + "grad_norm": 0.8817897215981032, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 32320 + }, + { + "epoch": 0.32321, + "grad_norm": 0.7216924611504967, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 32321 + }, + { + "epoch": 0.32322, + "grad_norm": 0.6449237058642023, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 32322 + }, + { + "epoch": 0.32323, + "grad_norm": 0.7448239582745684, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 32323 + }, + { + "epoch": 0.32324, + "grad_norm": 0.8856571268079174, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 32324 + }, + { + "epoch": 0.32325, + "grad_norm": 0.9705643939177859, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 32325 + }, + { + "epoch": 0.32326, + "grad_norm": 1.0406156815874361, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 32326 + }, + { + "epoch": 0.32327, + "grad_norm": 0.8922499633547784, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 32327 + }, + { + "epoch": 0.32328, + "grad_norm": 0.8257915581360175, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32328 + }, + { + "epoch": 0.32329, + "grad_norm": 0.8189930751473851, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32329 + }, + { + "epoch": 0.3233, + "grad_norm": 1.1155201794963976, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 32330 + }, + { + "epoch": 0.32331, + "grad_norm": 1.1746602607208694, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 32331 + }, + { + "epoch": 0.32332, + "grad_norm": 0.9523337206799993, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 32332 + }, + { + "epoch": 0.32333, + "grad_norm": 0.962626129183462, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 32333 + }, + { + "epoch": 0.32334, + "grad_norm": 0.9314461493164828, + "learning_rate": 0.003, + "loss": 4.029, + "step": 32334 + }, + { + "epoch": 0.32335, + "grad_norm": 0.7536293531586414, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 32335 + }, + { + "epoch": 0.32336, + "grad_norm": 0.6867427069768643, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 32336 + }, + { + "epoch": 0.32337, + "grad_norm": 0.6561471057554228, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 32337 + }, + { + "epoch": 0.32338, + "grad_norm": 0.6679139771988902, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 32338 + }, + { + "epoch": 0.32339, + "grad_norm": 0.6970902407564015, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 32339 + }, + { + "epoch": 0.3234, + "grad_norm": 0.6461613969036247, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 32340 + }, + { + "epoch": 0.32341, + "grad_norm": 0.6775312452786895, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 32341 + }, + { + "epoch": 0.32342, + "grad_norm": 0.7712636293850984, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 32342 + }, + { + "epoch": 0.32343, + "grad_norm": 0.8373951220628817, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 32343 + }, + { + "epoch": 0.32344, + "grad_norm": 0.8704866617007142, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 32344 + }, + { + "epoch": 0.32345, + "grad_norm": 0.7823367022192289, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32345 + }, + { + "epoch": 0.32346, + "grad_norm": 0.6420866673437643, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 32346 + }, + { + "epoch": 0.32347, + "grad_norm": 0.6193222894922366, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 32347 + }, + { + "epoch": 0.32348, + "grad_norm": 0.5890217712363472, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 32348 + }, + { + "epoch": 0.32349, + "grad_norm": 0.5653786811237625, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 32349 + }, + { + "epoch": 0.3235, + "grad_norm": 0.5455095076404681, + "learning_rate": 0.003, + "loss": 4.046, + "step": 32350 + }, + { + "epoch": 0.32351, + "grad_norm": 0.5684783099105587, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 32351 + }, + { + "epoch": 0.32352, + "grad_norm": 0.5679456273729846, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 32352 + }, + { + "epoch": 0.32353, + "grad_norm": 0.6745113843365564, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 32353 + }, + { + "epoch": 0.32354, + "grad_norm": 0.6761177252948698, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 32354 + }, + { + "epoch": 0.32355, + "grad_norm": 0.7791159911491643, + "learning_rate": 0.003, + "loss": 4.04, + "step": 32355 + }, + { + "epoch": 0.32356, + "grad_norm": 1.0139570939564477, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 32356 + }, + { + "epoch": 0.32357, + "grad_norm": 1.293244369561617, + "learning_rate": 0.003, + "loss": 4.079, + "step": 32357 + }, + { + "epoch": 0.32358, + "grad_norm": 0.6584405051639541, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 32358 + }, + { + "epoch": 0.32359, + "grad_norm": 0.6727183254827315, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 32359 + }, + { + "epoch": 0.3236, + "grad_norm": 0.7664880022610624, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 32360 + }, + { + "epoch": 0.32361, + "grad_norm": 0.8120060101276529, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 32361 + }, + { + "epoch": 0.32362, + "grad_norm": 0.8451679524243048, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 32362 + }, + { + "epoch": 0.32363, + "grad_norm": 0.7601203280413358, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 32363 + }, + { + "epoch": 0.32364, + "grad_norm": 0.7520953416943923, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 32364 + }, + { + "epoch": 0.32365, + "grad_norm": 0.8464035136546044, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 32365 + }, + { + "epoch": 0.32366, + "grad_norm": 0.8860531317365918, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 32366 + }, + { + "epoch": 0.32367, + "grad_norm": 0.9056158035887893, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32367 + }, + { + "epoch": 0.32368, + "grad_norm": 1.002249721020873, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 32368 + }, + { + "epoch": 0.32369, + "grad_norm": 1.118616940436004, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 32369 + }, + { + "epoch": 0.3237, + "grad_norm": 1.0579637306002452, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 32370 + }, + { + "epoch": 0.32371, + "grad_norm": 1.0490792022585964, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 32371 + }, + { + "epoch": 0.32372, + "grad_norm": 1.1171385781613343, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 32372 + }, + { + "epoch": 0.32373, + "grad_norm": 1.0230382376870235, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 32373 + }, + { + "epoch": 0.32374, + "grad_norm": 1.1772239733294187, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 32374 + }, + { + "epoch": 0.32375, + "grad_norm": 0.9701301214444298, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 32375 + }, + { + "epoch": 0.32376, + "grad_norm": 0.9407467209597434, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 32376 + }, + { + "epoch": 0.32377, + "grad_norm": 0.8615311887723703, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 32377 + }, + { + "epoch": 0.32378, + "grad_norm": 0.7787250639625689, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 32378 + }, + { + "epoch": 0.32379, + "grad_norm": 0.7593865515816075, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 32379 + }, + { + "epoch": 0.3238, + "grad_norm": 0.7605450701845645, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 32380 + }, + { + "epoch": 0.32381, + "grad_norm": 0.7219943621367391, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 32381 + }, + { + "epoch": 0.32382, + "grad_norm": 0.6559584563119266, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 32382 + }, + { + "epoch": 0.32383, + "grad_norm": 0.6982615909167791, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 32383 + }, + { + "epoch": 0.32384, + "grad_norm": 0.8372686169003345, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 32384 + }, + { + "epoch": 0.32385, + "grad_norm": 0.9943234170786358, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 32385 + }, + { + "epoch": 0.32386, + "grad_norm": 0.9797029653688999, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 32386 + }, + { + "epoch": 0.32387, + "grad_norm": 0.8894974745721331, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 32387 + }, + { + "epoch": 0.32388, + "grad_norm": 0.9177078182331543, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 32388 + }, + { + "epoch": 0.32389, + "grad_norm": 0.924833662187325, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 32389 + }, + { + "epoch": 0.3239, + "grad_norm": 0.9495545063290977, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 32390 + }, + { + "epoch": 0.32391, + "grad_norm": 0.9719076719725539, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 32391 + }, + { + "epoch": 0.32392, + "grad_norm": 1.0310010019892792, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 32392 + }, + { + "epoch": 0.32393, + "grad_norm": 1.0116185247720566, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32393 + }, + { + "epoch": 0.32394, + "grad_norm": 0.9235086129935368, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 32394 + }, + { + "epoch": 0.32395, + "grad_norm": 0.8386486816405722, + "learning_rate": 0.003, + "loss": 4.058, + "step": 32395 + }, + { + "epoch": 0.32396, + "grad_norm": 0.7823133825296625, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32396 + }, + { + "epoch": 0.32397, + "grad_norm": 0.8079520393848514, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 32397 + }, + { + "epoch": 0.32398, + "grad_norm": 0.8157863464018793, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 32398 + }, + { + "epoch": 0.32399, + "grad_norm": 0.8350030055320209, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 32399 + }, + { + "epoch": 0.324, + "grad_norm": 0.9797250201811051, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 32400 + }, + { + "epoch": 0.32401, + "grad_norm": 1.1273662009011625, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 32401 + }, + { + "epoch": 0.32402, + "grad_norm": 0.8096029942371695, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32402 + }, + { + "epoch": 0.32403, + "grad_norm": 0.6668564405196714, + "learning_rate": 0.003, + "loss": 4.049, + "step": 32403 + }, + { + "epoch": 0.32404, + "grad_norm": 0.6721271658434185, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 32404 + }, + { + "epoch": 0.32405, + "grad_norm": 0.6740651942228963, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 32405 + }, + { + "epoch": 0.32406, + "grad_norm": 0.7371625302860473, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 32406 + }, + { + "epoch": 0.32407, + "grad_norm": 0.8571869973070066, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 32407 + }, + { + "epoch": 0.32408, + "grad_norm": 1.0757907004249518, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32408 + }, + { + "epoch": 0.32409, + "grad_norm": 0.8630219760202921, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 32409 + }, + { + "epoch": 0.3241, + "grad_norm": 0.6058049018252737, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 32410 + }, + { + "epoch": 0.32411, + "grad_norm": 0.5818891282596081, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 32411 + }, + { + "epoch": 0.32412, + "grad_norm": 0.6050632525656291, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 32412 + }, + { + "epoch": 0.32413, + "grad_norm": 0.6526553256802238, + "learning_rate": 0.003, + "loss": 4.059, + "step": 32413 + }, + { + "epoch": 0.32414, + "grad_norm": 0.651872876229171, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 32414 + }, + { + "epoch": 0.32415, + "grad_norm": 0.6540530871445872, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 32415 + }, + { + "epoch": 0.32416, + "grad_norm": 0.6574382732666066, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 32416 + }, + { + "epoch": 0.32417, + "grad_norm": 0.6097716019132938, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 32417 + }, + { + "epoch": 0.32418, + "grad_norm": 0.5955253668299725, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 32418 + }, + { + "epoch": 0.32419, + "grad_norm": 0.6560988848031566, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 32419 + }, + { + "epoch": 0.3242, + "grad_norm": 0.7748700116349909, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 32420 + }, + { + "epoch": 0.32421, + "grad_norm": 0.8842548857297221, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 32421 + }, + { + "epoch": 0.32422, + "grad_norm": 1.008816486167834, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 32422 + }, + { + "epoch": 0.32423, + "grad_norm": 1.2043388149518008, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 32423 + }, + { + "epoch": 0.32424, + "grad_norm": 1.131819484550399, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32424 + }, + { + "epoch": 0.32425, + "grad_norm": 0.7525013442716076, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 32425 + }, + { + "epoch": 0.32426, + "grad_norm": 0.8417635683290521, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32426 + }, + { + "epoch": 0.32427, + "grad_norm": 0.8872114675093807, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 32427 + }, + { + "epoch": 0.32428, + "grad_norm": 0.8177402936458994, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 32428 + }, + { + "epoch": 0.32429, + "grad_norm": 0.8061473810521187, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 32429 + }, + { + "epoch": 0.3243, + "grad_norm": 0.7979180966816082, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 32430 + }, + { + "epoch": 0.32431, + "grad_norm": 0.9406104183134433, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 32431 + }, + { + "epoch": 0.32432, + "grad_norm": 1.1706785518644378, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 32432 + }, + { + "epoch": 0.32433, + "grad_norm": 0.9204040336502325, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 32433 + }, + { + "epoch": 0.32434, + "grad_norm": 0.8480485495319959, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 32434 + }, + { + "epoch": 0.32435, + "grad_norm": 0.78424934053959, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 32435 + }, + { + "epoch": 0.32436, + "grad_norm": 0.6617567979613356, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32436 + }, + { + "epoch": 0.32437, + "grad_norm": 0.6341200918053179, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 32437 + }, + { + "epoch": 0.32438, + "grad_norm": 0.560160292452773, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 32438 + }, + { + "epoch": 0.32439, + "grad_norm": 0.6611119797878421, + "learning_rate": 0.003, + "loss": 4.016, + "step": 32439 + }, + { + "epoch": 0.3244, + "grad_norm": 0.8011911855796873, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 32440 + }, + { + "epoch": 0.32441, + "grad_norm": 0.9568399056004658, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 32441 + }, + { + "epoch": 0.32442, + "grad_norm": 1.081974197471788, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 32442 + }, + { + "epoch": 0.32443, + "grad_norm": 0.9474284962697, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 32443 + }, + { + "epoch": 0.32444, + "grad_norm": 0.946480447449302, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 32444 + }, + { + "epoch": 0.32445, + "grad_norm": 0.8314780218085125, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 32445 + }, + { + "epoch": 0.32446, + "grad_norm": 0.7354593369045038, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 32446 + }, + { + "epoch": 0.32447, + "grad_norm": 0.7420538227228154, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 32447 + }, + { + "epoch": 0.32448, + "grad_norm": 0.7314345987547949, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 32448 + }, + { + "epoch": 0.32449, + "grad_norm": 0.8517977352026128, + "learning_rate": 0.003, + "loss": 4.042, + "step": 32449 + }, + { + "epoch": 0.3245, + "grad_norm": 0.8777509225249325, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 32450 + }, + { + "epoch": 0.32451, + "grad_norm": 0.8615559113619968, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 32451 + }, + { + "epoch": 0.32452, + "grad_norm": 0.990937090091958, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 32452 + }, + { + "epoch": 0.32453, + "grad_norm": 1.1731555510835727, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 32453 + }, + { + "epoch": 0.32454, + "grad_norm": 0.797749326571795, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 32454 + }, + { + "epoch": 0.32455, + "grad_norm": 0.8559594645003353, + "learning_rate": 0.003, + "loss": 4.0831, + "step": 32455 + }, + { + "epoch": 0.32456, + "grad_norm": 0.9064903507231489, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 32456 + }, + { + "epoch": 0.32457, + "grad_norm": 0.816951819575074, + "learning_rate": 0.003, + "loss": 4.066, + "step": 32457 + }, + { + "epoch": 0.32458, + "grad_norm": 0.8267650975129449, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 32458 + }, + { + "epoch": 0.32459, + "grad_norm": 0.8221406596182723, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 32459 + }, + { + "epoch": 0.3246, + "grad_norm": 0.96660614998581, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 32460 + }, + { + "epoch": 0.32461, + "grad_norm": 1.0104880855751777, + "learning_rate": 0.003, + "loss": 4.068, + "step": 32461 + }, + { + "epoch": 0.32462, + "grad_norm": 0.9785938381122375, + "learning_rate": 0.003, + "loss": 4.023, + "step": 32462 + }, + { + "epoch": 0.32463, + "grad_norm": 0.9911124277645665, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 32463 + }, + { + "epoch": 0.32464, + "grad_norm": 0.8342693971009238, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 32464 + }, + { + "epoch": 0.32465, + "grad_norm": 0.7638744435924395, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 32465 + }, + { + "epoch": 0.32466, + "grad_norm": 0.6585887336018129, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 32466 + }, + { + "epoch": 0.32467, + "grad_norm": 0.6786482903148481, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32467 + }, + { + "epoch": 0.32468, + "grad_norm": 0.7462272688086826, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 32468 + }, + { + "epoch": 0.32469, + "grad_norm": 0.8402071689120815, + "learning_rate": 0.003, + "loss": 4.037, + "step": 32469 + }, + { + "epoch": 0.3247, + "grad_norm": 0.9419210745109278, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 32470 + }, + { + "epoch": 0.32471, + "grad_norm": 1.1349618248796147, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 32471 + }, + { + "epoch": 0.32472, + "grad_norm": 0.7739083411568652, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 32472 + }, + { + "epoch": 0.32473, + "grad_norm": 0.7534394848629673, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 32473 + }, + { + "epoch": 0.32474, + "grad_norm": 0.7538361034758573, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 32474 + }, + { + "epoch": 0.32475, + "grad_norm": 0.8317476640101414, + "learning_rate": 0.003, + "loss": 4.036, + "step": 32475 + }, + { + "epoch": 0.32476, + "grad_norm": 0.962108479555826, + "learning_rate": 0.003, + "loss": 4.036, + "step": 32476 + }, + { + "epoch": 0.32477, + "grad_norm": 1.0076634509523166, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 32477 + }, + { + "epoch": 0.32478, + "grad_norm": 0.9524713160013971, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 32478 + }, + { + "epoch": 0.32479, + "grad_norm": 0.8938946145068402, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 32479 + }, + { + "epoch": 0.3248, + "grad_norm": 0.866598713696185, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32480 + }, + { + "epoch": 0.32481, + "grad_norm": 0.9518292706137309, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 32481 + }, + { + "epoch": 0.32482, + "grad_norm": 1.0706677475728912, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 32482 + }, + { + "epoch": 0.32483, + "grad_norm": 0.9377819750666391, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 32483 + }, + { + "epoch": 0.32484, + "grad_norm": 0.8795501248471351, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 32484 + }, + { + "epoch": 0.32485, + "grad_norm": 0.8313086996567106, + "learning_rate": 0.003, + "loss": 4.06, + "step": 32485 + }, + { + "epoch": 0.32486, + "grad_norm": 0.8114500049966729, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 32486 + }, + { + "epoch": 0.32487, + "grad_norm": 0.7933047784511851, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 32487 + }, + { + "epoch": 0.32488, + "grad_norm": 0.8284654799040798, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 32488 + }, + { + "epoch": 0.32489, + "grad_norm": 0.8210404517957974, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 32489 + }, + { + "epoch": 0.3249, + "grad_norm": 0.8459774536816805, + "learning_rate": 0.003, + "loss": 4.054, + "step": 32490 + }, + { + "epoch": 0.32491, + "grad_norm": 0.9110390501635937, + "learning_rate": 0.003, + "loss": 4.0799, + "step": 32491 + }, + { + "epoch": 0.32492, + "grad_norm": 0.9012254038577967, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 32492 + }, + { + "epoch": 0.32493, + "grad_norm": 0.9101919667441966, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 32493 + }, + { + "epoch": 0.32494, + "grad_norm": 0.993720091782351, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 32494 + }, + { + "epoch": 0.32495, + "grad_norm": 0.9721134862123603, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32495 + }, + { + "epoch": 0.32496, + "grad_norm": 0.9936382312282888, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 32496 + }, + { + "epoch": 0.32497, + "grad_norm": 0.9546097052850503, + "learning_rate": 0.003, + "loss": 4.051, + "step": 32497 + }, + { + "epoch": 0.32498, + "grad_norm": 0.8523461580101721, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 32498 + }, + { + "epoch": 0.32499, + "grad_norm": 0.9402993849427704, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 32499 + }, + { + "epoch": 0.325, + "grad_norm": 0.8599630797629967, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 32500 + }, + { + "epoch": 0.32501, + "grad_norm": 0.8505517313609147, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 32501 + }, + { + "epoch": 0.32502, + "grad_norm": 0.8695682316241959, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 32502 + }, + { + "epoch": 0.32503, + "grad_norm": 0.9859328747547556, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 32503 + }, + { + "epoch": 0.32504, + "grad_norm": 0.9527466740339436, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 32504 + }, + { + "epoch": 0.32505, + "grad_norm": 0.8359264286724243, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 32505 + }, + { + "epoch": 0.32506, + "grad_norm": 0.8839760459143862, + "learning_rate": 0.003, + "loss": 4.061, + "step": 32506 + }, + { + "epoch": 0.32507, + "grad_norm": 0.8822061749609003, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 32507 + }, + { + "epoch": 0.32508, + "grad_norm": 0.8653949652705336, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 32508 + }, + { + "epoch": 0.32509, + "grad_norm": 0.9037035648499723, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 32509 + }, + { + "epoch": 0.3251, + "grad_norm": 0.9159744382892694, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 32510 + }, + { + "epoch": 0.32511, + "grad_norm": 0.977121533159995, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 32511 + }, + { + "epoch": 0.32512, + "grad_norm": 1.0465938638963157, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32512 + }, + { + "epoch": 0.32513, + "grad_norm": 1.0311322826718887, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 32513 + }, + { + "epoch": 0.32514, + "grad_norm": 0.8029986298815406, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 32514 + }, + { + "epoch": 0.32515, + "grad_norm": 0.7241715100164978, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 32515 + }, + { + "epoch": 0.32516, + "grad_norm": 0.7662160623473624, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 32516 + }, + { + "epoch": 0.32517, + "grad_norm": 0.7749811847695717, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 32517 + }, + { + "epoch": 0.32518, + "grad_norm": 0.7929216707209552, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 32518 + }, + { + "epoch": 0.32519, + "grad_norm": 0.6921712562372632, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 32519 + }, + { + "epoch": 0.3252, + "grad_norm": 0.6202446185540131, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32520 + }, + { + "epoch": 0.32521, + "grad_norm": 0.6068708335721467, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 32521 + }, + { + "epoch": 0.32522, + "grad_norm": 0.5791522484779299, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 32522 + }, + { + "epoch": 0.32523, + "grad_norm": 0.6359440606138411, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 32523 + }, + { + "epoch": 0.32524, + "grad_norm": 0.6300473151207283, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 32524 + }, + { + "epoch": 0.32525, + "grad_norm": 0.5875943235496611, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 32525 + }, + { + "epoch": 0.32526, + "grad_norm": 0.6460274321883364, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 32526 + }, + { + "epoch": 0.32527, + "grad_norm": 0.7176503979657928, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 32527 + }, + { + "epoch": 0.32528, + "grad_norm": 0.7569674709344677, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 32528 + }, + { + "epoch": 0.32529, + "grad_norm": 0.7634946695355567, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 32529 + }, + { + "epoch": 0.3253, + "grad_norm": 0.8185039735871804, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 32530 + }, + { + "epoch": 0.32531, + "grad_norm": 0.8879626382820008, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 32531 + }, + { + "epoch": 0.32532, + "grad_norm": 0.9563229441586824, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 32532 + }, + { + "epoch": 0.32533, + "grad_norm": 1.0005716778938123, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 32533 + }, + { + "epoch": 0.32534, + "grad_norm": 1.1631716319156706, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 32534 + }, + { + "epoch": 0.32535, + "grad_norm": 0.9404326691036433, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 32535 + }, + { + "epoch": 0.32536, + "grad_norm": 0.9024408873572741, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 32536 + }, + { + "epoch": 0.32537, + "grad_norm": 0.932940203114261, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32537 + }, + { + "epoch": 0.32538, + "grad_norm": 1.0129567466035616, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 32538 + }, + { + "epoch": 0.32539, + "grad_norm": 0.8735338061609483, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 32539 + }, + { + "epoch": 0.3254, + "grad_norm": 0.7569360225968319, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 32540 + }, + { + "epoch": 0.32541, + "grad_norm": 0.7531630462745532, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 32541 + }, + { + "epoch": 0.32542, + "grad_norm": 0.6753138375391916, + "learning_rate": 0.003, + "loss": 4.066, + "step": 32542 + }, + { + "epoch": 0.32543, + "grad_norm": 0.6912591709437553, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 32543 + }, + { + "epoch": 0.32544, + "grad_norm": 0.6378119202634904, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 32544 + }, + { + "epoch": 0.32545, + "grad_norm": 0.5736197603656417, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 32545 + }, + { + "epoch": 0.32546, + "grad_norm": 0.567805166502217, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32546 + }, + { + "epoch": 0.32547, + "grad_norm": 0.570924134773479, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 32547 + }, + { + "epoch": 0.32548, + "grad_norm": 0.6161073908471861, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 32548 + }, + { + "epoch": 0.32549, + "grad_norm": 0.7255282757326842, + "learning_rate": 0.003, + "loss": 4.0044, + "step": 32549 + }, + { + "epoch": 0.3255, + "grad_norm": 1.0696041458316223, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 32550 + }, + { + "epoch": 0.32551, + "grad_norm": 1.1798883898243353, + "learning_rate": 0.003, + "loss": 4.021, + "step": 32551 + }, + { + "epoch": 0.32552, + "grad_norm": 0.8727814034711785, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 32552 + }, + { + "epoch": 0.32553, + "grad_norm": 0.8226178698220777, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 32553 + }, + { + "epoch": 0.32554, + "grad_norm": 0.7775382204150688, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 32554 + }, + { + "epoch": 0.32555, + "grad_norm": 0.8981569102385294, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 32555 + }, + { + "epoch": 0.32556, + "grad_norm": 0.8666164341413275, + "learning_rate": 0.003, + "loss": 4.051, + "step": 32556 + }, + { + "epoch": 0.32557, + "grad_norm": 0.8765939746411787, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 32557 + }, + { + "epoch": 0.32558, + "grad_norm": 0.8992140367605779, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 32558 + }, + { + "epoch": 0.32559, + "grad_norm": 0.9739244730448949, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 32559 + }, + { + "epoch": 0.3256, + "grad_norm": 1.0792799810872777, + "learning_rate": 0.003, + "loss": 4.069, + "step": 32560 + }, + { + "epoch": 0.32561, + "grad_norm": 1.1153840983791787, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 32561 + }, + { + "epoch": 0.32562, + "grad_norm": 0.8527095903119498, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 32562 + }, + { + "epoch": 0.32563, + "grad_norm": 0.7073353847577559, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 32563 + }, + { + "epoch": 0.32564, + "grad_norm": 0.7018040334115425, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32564 + }, + { + "epoch": 0.32565, + "grad_norm": 0.6825879397660947, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32565 + }, + { + "epoch": 0.32566, + "grad_norm": 0.7902643142545418, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 32566 + }, + { + "epoch": 0.32567, + "grad_norm": 0.9439713931740356, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 32567 + }, + { + "epoch": 0.32568, + "grad_norm": 1.0969949600602686, + "learning_rate": 0.003, + "loss": 4.059, + "step": 32568 + }, + { + "epoch": 0.32569, + "grad_norm": 1.0292360273393177, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 32569 + }, + { + "epoch": 0.3257, + "grad_norm": 0.8395614529442065, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 32570 + }, + { + "epoch": 0.32571, + "grad_norm": 0.7642680841081277, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 32571 + }, + { + "epoch": 0.32572, + "grad_norm": 0.7343925241461134, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 32572 + }, + { + "epoch": 0.32573, + "grad_norm": 0.8178668749519237, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32573 + }, + { + "epoch": 0.32574, + "grad_norm": 0.8871834598925148, + "learning_rate": 0.003, + "loss": 4.04, + "step": 32574 + }, + { + "epoch": 0.32575, + "grad_norm": 0.8157414758277594, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 32575 + }, + { + "epoch": 0.32576, + "grad_norm": 0.7082762149351927, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 32576 + }, + { + "epoch": 0.32577, + "grad_norm": 0.7311944200233875, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 32577 + }, + { + "epoch": 0.32578, + "grad_norm": 0.7471532571374218, + "learning_rate": 0.003, + "loss": 4.029, + "step": 32578 + }, + { + "epoch": 0.32579, + "grad_norm": 0.9053340254333775, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 32579 + }, + { + "epoch": 0.3258, + "grad_norm": 1.0563485693937398, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 32580 + }, + { + "epoch": 0.32581, + "grad_norm": 1.0730855603732499, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 32581 + }, + { + "epoch": 0.32582, + "grad_norm": 0.8222737850350875, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 32582 + }, + { + "epoch": 0.32583, + "grad_norm": 0.9230544805793801, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 32583 + }, + { + "epoch": 0.32584, + "grad_norm": 1.0902773122066156, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 32584 + }, + { + "epoch": 0.32585, + "grad_norm": 1.001979374489625, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32585 + }, + { + "epoch": 0.32586, + "grad_norm": 0.9612509735145369, + "learning_rate": 0.003, + "loss": 4.048, + "step": 32586 + }, + { + "epoch": 0.32587, + "grad_norm": 0.9155126535035115, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 32587 + }, + { + "epoch": 0.32588, + "grad_norm": 0.9090094495265286, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 32588 + }, + { + "epoch": 0.32589, + "grad_norm": 0.8870907521648331, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 32589 + }, + { + "epoch": 0.3259, + "grad_norm": 0.7950450459390722, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 32590 + }, + { + "epoch": 0.32591, + "grad_norm": 0.7385842791518953, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 32591 + }, + { + "epoch": 0.32592, + "grad_norm": 0.6941728578943309, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32592 + }, + { + "epoch": 0.32593, + "grad_norm": 0.6918214189760913, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 32593 + }, + { + "epoch": 0.32594, + "grad_norm": 0.770830115065779, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 32594 + }, + { + "epoch": 0.32595, + "grad_norm": 0.8530177736738423, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32595 + }, + { + "epoch": 0.32596, + "grad_norm": 1.02183969715601, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 32596 + }, + { + "epoch": 0.32597, + "grad_norm": 0.8430391367372441, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32597 + }, + { + "epoch": 0.32598, + "grad_norm": 0.7469850574350272, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 32598 + }, + { + "epoch": 0.32599, + "grad_norm": 0.8301538375710558, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 32599 + }, + { + "epoch": 0.326, + "grad_norm": 0.8997271591832265, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 32600 + }, + { + "epoch": 0.32601, + "grad_norm": 0.9684148711140967, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 32601 + }, + { + "epoch": 0.32602, + "grad_norm": 1.2299429624540272, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32602 + }, + { + "epoch": 0.32603, + "grad_norm": 1.0057955640830745, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 32603 + }, + { + "epoch": 0.32604, + "grad_norm": 0.900385274695718, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 32604 + }, + { + "epoch": 0.32605, + "grad_norm": 0.8151598244475973, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 32605 + }, + { + "epoch": 0.32606, + "grad_norm": 0.783905914434776, + "learning_rate": 0.003, + "loss": 4.048, + "step": 32606 + }, + { + "epoch": 0.32607, + "grad_norm": 0.844341922226867, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 32607 + }, + { + "epoch": 0.32608, + "grad_norm": 0.8542859851316374, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 32608 + }, + { + "epoch": 0.32609, + "grad_norm": 0.974555891891903, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 32609 + }, + { + "epoch": 0.3261, + "grad_norm": 1.0126762032812027, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 32610 + }, + { + "epoch": 0.32611, + "grad_norm": 0.9876399508552836, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 32611 + }, + { + "epoch": 0.32612, + "grad_norm": 0.9750974939779021, + "learning_rate": 0.003, + "loss": 4.039, + "step": 32612 + }, + { + "epoch": 0.32613, + "grad_norm": 0.9660282178438745, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32613 + }, + { + "epoch": 0.32614, + "grad_norm": 0.8295218995529398, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 32614 + }, + { + "epoch": 0.32615, + "grad_norm": 0.7612581110362032, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 32615 + }, + { + "epoch": 0.32616, + "grad_norm": 0.7779945136489033, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 32616 + }, + { + "epoch": 0.32617, + "grad_norm": 0.6708131965208577, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 32617 + }, + { + "epoch": 0.32618, + "grad_norm": 0.7142469237214132, + "learning_rate": 0.003, + "loss": 4.058, + "step": 32618 + }, + { + "epoch": 0.32619, + "grad_norm": 0.6393939557439822, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 32619 + }, + { + "epoch": 0.3262, + "grad_norm": 0.5527806307421838, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 32620 + }, + { + "epoch": 0.32621, + "grad_norm": 0.5295230904221768, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 32621 + }, + { + "epoch": 0.32622, + "grad_norm": 0.5477791232621022, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 32622 + }, + { + "epoch": 0.32623, + "grad_norm": 0.5362808847597705, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 32623 + }, + { + "epoch": 0.32624, + "grad_norm": 0.538065666078473, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 32624 + }, + { + "epoch": 0.32625, + "grad_norm": 0.6193450727026222, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 32625 + }, + { + "epoch": 0.32626, + "grad_norm": 0.8345618380605104, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 32626 + }, + { + "epoch": 0.32627, + "grad_norm": 1.0550190643870263, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 32627 + }, + { + "epoch": 0.32628, + "grad_norm": 0.9985699048602811, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 32628 + }, + { + "epoch": 0.32629, + "grad_norm": 1.1110949474623533, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 32629 + }, + { + "epoch": 0.3263, + "grad_norm": 0.8758576027769994, + "learning_rate": 0.003, + "loss": 4.039, + "step": 32630 + }, + { + "epoch": 0.32631, + "grad_norm": 0.7954153776303534, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 32631 + }, + { + "epoch": 0.32632, + "grad_norm": 0.7236414386816926, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 32632 + }, + { + "epoch": 0.32633, + "grad_norm": 0.7530452027683179, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 32633 + }, + { + "epoch": 0.32634, + "grad_norm": 0.8438880254603348, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 32634 + }, + { + "epoch": 0.32635, + "grad_norm": 0.8721834088253939, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 32635 + }, + { + "epoch": 0.32636, + "grad_norm": 0.9264135502218175, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 32636 + }, + { + "epoch": 0.32637, + "grad_norm": 1.0075506190400882, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 32637 + }, + { + "epoch": 0.32638, + "grad_norm": 1.0410723311340915, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 32638 + }, + { + "epoch": 0.32639, + "grad_norm": 1.0364610469616946, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 32639 + }, + { + "epoch": 0.3264, + "grad_norm": 0.9252616639099874, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 32640 + }, + { + "epoch": 0.32641, + "grad_norm": 0.9706459880949335, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 32641 + }, + { + "epoch": 0.32642, + "grad_norm": 0.8801712483191553, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 32642 + }, + { + "epoch": 0.32643, + "grad_norm": 0.7759356203642023, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 32643 + }, + { + "epoch": 0.32644, + "grad_norm": 0.7258853963312211, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 32644 + }, + { + "epoch": 0.32645, + "grad_norm": 0.7517368081529687, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 32645 + }, + { + "epoch": 0.32646, + "grad_norm": 0.7749757729682596, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32646 + }, + { + "epoch": 0.32647, + "grad_norm": 0.7256552981453709, + "learning_rate": 0.003, + "loss": 4.016, + "step": 32647 + }, + { + "epoch": 0.32648, + "grad_norm": 0.7080354367273922, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 32648 + }, + { + "epoch": 0.32649, + "grad_norm": 0.653708836781338, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32649 + }, + { + "epoch": 0.3265, + "grad_norm": 0.6980347320692092, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 32650 + }, + { + "epoch": 0.32651, + "grad_norm": 0.7180027788283422, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 32651 + }, + { + "epoch": 0.32652, + "grad_norm": 0.6837123250410853, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 32652 + }, + { + "epoch": 0.32653, + "grad_norm": 0.6319153449119808, + "learning_rate": 0.003, + "loss": 4.023, + "step": 32653 + }, + { + "epoch": 0.32654, + "grad_norm": 0.6040908463326176, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 32654 + }, + { + "epoch": 0.32655, + "grad_norm": 0.6489830899567638, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 32655 + }, + { + "epoch": 0.32656, + "grad_norm": 0.7955126444185715, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 32656 + }, + { + "epoch": 0.32657, + "grad_norm": 1.0162980219700033, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 32657 + }, + { + "epoch": 0.32658, + "grad_norm": 1.2259544457286908, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 32658 + }, + { + "epoch": 0.32659, + "grad_norm": 0.8254030540123612, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 32659 + }, + { + "epoch": 0.3266, + "grad_norm": 0.763334812252952, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 32660 + }, + { + "epoch": 0.32661, + "grad_norm": 0.8867958982704471, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 32661 + }, + { + "epoch": 0.32662, + "grad_norm": 1.0904808310847616, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 32662 + }, + { + "epoch": 0.32663, + "grad_norm": 1.0517896808670215, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 32663 + }, + { + "epoch": 0.32664, + "grad_norm": 0.9930361747053034, + "learning_rate": 0.003, + "loss": 4.043, + "step": 32664 + }, + { + "epoch": 0.32665, + "grad_norm": 0.987262923600527, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 32665 + }, + { + "epoch": 0.32666, + "grad_norm": 0.874765296228934, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 32666 + }, + { + "epoch": 0.32667, + "grad_norm": 0.7678586836557916, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 32667 + }, + { + "epoch": 0.32668, + "grad_norm": 0.8181598242740705, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 32668 + }, + { + "epoch": 0.32669, + "grad_norm": 0.7656016343731741, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 32669 + }, + { + "epoch": 0.3267, + "grad_norm": 0.8619763462925668, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 32670 + }, + { + "epoch": 0.32671, + "grad_norm": 0.9582301948223413, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 32671 + }, + { + "epoch": 0.32672, + "grad_norm": 0.9652139678211961, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 32672 + }, + { + "epoch": 0.32673, + "grad_norm": 1.0367779438442388, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 32673 + }, + { + "epoch": 0.32674, + "grad_norm": 0.9659788614990488, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 32674 + }, + { + "epoch": 0.32675, + "grad_norm": 0.881939562643299, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 32675 + }, + { + "epoch": 0.32676, + "grad_norm": 0.7724174547998682, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 32676 + }, + { + "epoch": 0.32677, + "grad_norm": 0.7575952845649925, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 32677 + }, + { + "epoch": 0.32678, + "grad_norm": 0.8084802480084667, + "learning_rate": 0.003, + "loss": 4.034, + "step": 32678 + }, + { + "epoch": 0.32679, + "grad_norm": 0.8625707916712021, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 32679 + }, + { + "epoch": 0.3268, + "grad_norm": 0.9580479265298185, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 32680 + }, + { + "epoch": 0.32681, + "grad_norm": 1.0402644671104464, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 32681 + }, + { + "epoch": 0.32682, + "grad_norm": 0.9472377883004115, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 32682 + }, + { + "epoch": 0.32683, + "grad_norm": 0.8317943075599579, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 32683 + }, + { + "epoch": 0.32684, + "grad_norm": 0.7632418106706249, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 32684 + }, + { + "epoch": 0.32685, + "grad_norm": 0.7534942842463856, + "learning_rate": 0.003, + "loss": 4.065, + "step": 32685 + }, + { + "epoch": 0.32686, + "grad_norm": 0.8284583472894079, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 32686 + }, + { + "epoch": 0.32687, + "grad_norm": 0.9306673896031317, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 32687 + }, + { + "epoch": 0.32688, + "grad_norm": 0.8304650538465979, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 32688 + }, + { + "epoch": 0.32689, + "grad_norm": 0.785430550340263, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 32689 + }, + { + "epoch": 0.3269, + "grad_norm": 0.6266096363208098, + "learning_rate": 0.003, + "loss": 4.026, + "step": 32690 + }, + { + "epoch": 0.32691, + "grad_norm": 0.557663418961099, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 32691 + }, + { + "epoch": 0.32692, + "grad_norm": 0.6189671394753006, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32692 + }, + { + "epoch": 0.32693, + "grad_norm": 0.6267912499930638, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 32693 + }, + { + "epoch": 0.32694, + "grad_norm": 0.6588836690483691, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 32694 + }, + { + "epoch": 0.32695, + "grad_norm": 0.8181272872686683, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 32695 + }, + { + "epoch": 0.32696, + "grad_norm": 1.0974425053942243, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 32696 + }, + { + "epoch": 0.32697, + "grad_norm": 1.050231939654034, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 32697 + }, + { + "epoch": 0.32698, + "grad_norm": 0.9951396110089871, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 32698 + }, + { + "epoch": 0.32699, + "grad_norm": 0.9862856437889773, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 32699 + }, + { + "epoch": 0.327, + "grad_norm": 0.7648720160015711, + "learning_rate": 0.003, + "loss": 4.031, + "step": 32700 + }, + { + "epoch": 0.32701, + "grad_norm": 0.667367667600846, + "learning_rate": 0.003, + "loss": 3.9832, + "step": 32701 + }, + { + "epoch": 0.32702, + "grad_norm": 0.6424079184842114, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 32702 + }, + { + "epoch": 0.32703, + "grad_norm": 0.5917322200430145, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 32703 + }, + { + "epoch": 0.32704, + "grad_norm": 0.5470828881701201, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 32704 + }, + { + "epoch": 0.32705, + "grad_norm": 0.5138043546739243, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 32705 + }, + { + "epoch": 0.32706, + "grad_norm": 0.5929314003412735, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 32706 + }, + { + "epoch": 0.32707, + "grad_norm": 0.6373304970622402, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 32707 + }, + { + "epoch": 0.32708, + "grad_norm": 0.703832100855302, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 32708 + }, + { + "epoch": 0.32709, + "grad_norm": 0.7549283104495754, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 32709 + }, + { + "epoch": 0.3271, + "grad_norm": 0.7620229290117326, + "learning_rate": 0.003, + "loss": 4.026, + "step": 32710 + }, + { + "epoch": 0.32711, + "grad_norm": 0.7832799994066906, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 32711 + }, + { + "epoch": 0.32712, + "grad_norm": 0.7580807393203416, + "learning_rate": 0.003, + "loss": 4.033, + "step": 32712 + }, + { + "epoch": 0.32713, + "grad_norm": 0.659404874892149, + "learning_rate": 0.003, + "loss": 4.029, + "step": 32713 + }, + { + "epoch": 0.32714, + "grad_norm": 0.7051771385456851, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 32714 + }, + { + "epoch": 0.32715, + "grad_norm": 0.7010914372282241, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 32715 + }, + { + "epoch": 0.32716, + "grad_norm": 0.6621773394646734, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 32716 + }, + { + "epoch": 0.32717, + "grad_norm": 0.6818721600205901, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 32717 + }, + { + "epoch": 0.32718, + "grad_norm": 0.9320988713645613, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 32718 + }, + { + "epoch": 0.32719, + "grad_norm": 1.3932583721752971, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 32719 + }, + { + "epoch": 0.3272, + "grad_norm": 0.9645857869591395, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32720 + }, + { + "epoch": 0.32721, + "grad_norm": 1.2428984740611644, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 32721 + }, + { + "epoch": 0.32722, + "grad_norm": 0.8873462594108565, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 32722 + }, + { + "epoch": 0.32723, + "grad_norm": 0.7022737471310789, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 32723 + }, + { + "epoch": 0.32724, + "grad_norm": 0.7211253269266785, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32724 + }, + { + "epoch": 0.32725, + "grad_norm": 0.8661369481518064, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 32725 + }, + { + "epoch": 0.32726, + "grad_norm": 0.9377716395987876, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 32726 + }, + { + "epoch": 0.32727, + "grad_norm": 1.0802130972570518, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32727 + }, + { + "epoch": 0.32728, + "grad_norm": 1.0363292223459, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 32728 + }, + { + "epoch": 0.32729, + "grad_norm": 1.0393686759906342, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 32729 + }, + { + "epoch": 0.3273, + "grad_norm": 0.8371285469121365, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 32730 + }, + { + "epoch": 0.32731, + "grad_norm": 0.7043283964179214, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 32731 + }, + { + "epoch": 0.32732, + "grad_norm": 0.778717349106579, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 32732 + }, + { + "epoch": 0.32733, + "grad_norm": 0.8882257557875933, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 32733 + }, + { + "epoch": 0.32734, + "grad_norm": 0.829367890903293, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 32734 + }, + { + "epoch": 0.32735, + "grad_norm": 0.7841934051389784, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 32735 + }, + { + "epoch": 0.32736, + "grad_norm": 0.9913421670205981, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 32736 + }, + { + "epoch": 0.32737, + "grad_norm": 1.202738267876105, + "learning_rate": 0.003, + "loss": 4.0899, + "step": 32737 + }, + { + "epoch": 0.32738, + "grad_norm": 0.6559853319127903, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 32738 + }, + { + "epoch": 0.32739, + "grad_norm": 0.6200160688716877, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 32739 + }, + { + "epoch": 0.3274, + "grad_norm": 0.6569301779717273, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32740 + }, + { + "epoch": 0.32741, + "grad_norm": 0.8037285825040849, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 32741 + }, + { + "epoch": 0.32742, + "grad_norm": 0.9896515288534776, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 32742 + }, + { + "epoch": 0.32743, + "grad_norm": 1.0469308554081531, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32743 + }, + { + "epoch": 0.32744, + "grad_norm": 0.9763320918160817, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 32744 + }, + { + "epoch": 0.32745, + "grad_norm": 0.9842291833961888, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 32745 + }, + { + "epoch": 0.32746, + "grad_norm": 0.8830098244107689, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 32746 + }, + { + "epoch": 0.32747, + "grad_norm": 0.9544515543146259, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32747 + }, + { + "epoch": 0.32748, + "grad_norm": 1.1130036547374291, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 32748 + }, + { + "epoch": 0.32749, + "grad_norm": 1.0951183372452218, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 32749 + }, + { + "epoch": 0.3275, + "grad_norm": 1.0124139267030214, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 32750 + }, + { + "epoch": 0.32751, + "grad_norm": 1.0064893755879278, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 32751 + }, + { + "epoch": 0.32752, + "grad_norm": 1.167731086372188, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 32752 + }, + { + "epoch": 0.32753, + "grad_norm": 0.9050129996156847, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 32753 + }, + { + "epoch": 0.32754, + "grad_norm": 0.7887033142312346, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 32754 + }, + { + "epoch": 0.32755, + "grad_norm": 0.8204010796983877, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 32755 + }, + { + "epoch": 0.32756, + "grad_norm": 0.7763758688653838, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 32756 + }, + { + "epoch": 0.32757, + "grad_norm": 0.9575028287853947, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 32757 + }, + { + "epoch": 0.32758, + "grad_norm": 1.0509666025082258, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 32758 + }, + { + "epoch": 0.32759, + "grad_norm": 1.0030692357601934, + "learning_rate": 0.003, + "loss": 4.0819, + "step": 32759 + }, + { + "epoch": 0.3276, + "grad_norm": 0.9734856504793199, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 32760 + }, + { + "epoch": 0.32761, + "grad_norm": 0.8833628174586879, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 32761 + }, + { + "epoch": 0.32762, + "grad_norm": 0.7937855341856025, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32762 + }, + { + "epoch": 0.32763, + "grad_norm": 0.6951011194511362, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 32763 + }, + { + "epoch": 0.32764, + "grad_norm": 0.7218502057546252, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 32764 + }, + { + "epoch": 0.32765, + "grad_norm": 0.8275231488756186, + "learning_rate": 0.003, + "loss": 4.019, + "step": 32765 + }, + { + "epoch": 0.32766, + "grad_norm": 0.9736101849150745, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 32766 + }, + { + "epoch": 0.32767, + "grad_norm": 1.0231586093677552, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 32767 + }, + { + "epoch": 0.32768, + "grad_norm": 0.861336173411444, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 32768 + }, + { + "epoch": 0.32769, + "grad_norm": 0.7341686431654569, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 32769 + }, + { + "epoch": 0.3277, + "grad_norm": 0.7664811683186009, + "learning_rate": 0.003, + "loss": 4.049, + "step": 32770 + }, + { + "epoch": 0.32771, + "grad_norm": 0.800284485795897, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 32771 + }, + { + "epoch": 0.32772, + "grad_norm": 0.6882773774808549, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 32772 + }, + { + "epoch": 0.32773, + "grad_norm": 0.7567310844584764, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 32773 + }, + { + "epoch": 0.32774, + "grad_norm": 0.9345205319672601, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 32774 + }, + { + "epoch": 0.32775, + "grad_norm": 1.1168300273513763, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 32775 + }, + { + "epoch": 0.32776, + "grad_norm": 0.9851451871273431, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 32776 + }, + { + "epoch": 0.32777, + "grad_norm": 0.809832888975116, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 32777 + }, + { + "epoch": 0.32778, + "grad_norm": 0.7114239327686078, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 32778 + }, + { + "epoch": 0.32779, + "grad_norm": 0.664959126672734, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 32779 + }, + { + "epoch": 0.3278, + "grad_norm": 0.6551251876227606, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 32780 + }, + { + "epoch": 0.32781, + "grad_norm": 0.6948834932724413, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 32781 + }, + { + "epoch": 0.32782, + "grad_norm": 0.6978697874465555, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32782 + }, + { + "epoch": 0.32783, + "grad_norm": 0.60805044380455, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 32783 + }, + { + "epoch": 0.32784, + "grad_norm": 0.6184721772228089, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 32784 + }, + { + "epoch": 0.32785, + "grad_norm": 0.6153589703558715, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 32785 + }, + { + "epoch": 0.32786, + "grad_norm": 0.6272600819447444, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32786 + }, + { + "epoch": 0.32787, + "grad_norm": 0.6548382203752734, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 32787 + }, + { + "epoch": 0.32788, + "grad_norm": 0.7113253310900927, + "learning_rate": 0.003, + "loss": 3.9954, + "step": 32788 + }, + { + "epoch": 0.32789, + "grad_norm": 0.6288826086523921, + "learning_rate": 0.003, + "loss": 4.043, + "step": 32789 + }, + { + "epoch": 0.3279, + "grad_norm": 0.7075308254836788, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 32790 + }, + { + "epoch": 0.32791, + "grad_norm": 0.9090959211105727, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 32791 + }, + { + "epoch": 0.32792, + "grad_norm": 1.1252312258679864, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 32792 + }, + { + "epoch": 0.32793, + "grad_norm": 0.9153018125687011, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 32793 + }, + { + "epoch": 0.32794, + "grad_norm": 0.9145029885204722, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 32794 + }, + { + "epoch": 0.32795, + "grad_norm": 0.8658092573008562, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 32795 + }, + { + "epoch": 0.32796, + "grad_norm": 0.8092578250150014, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 32796 + }, + { + "epoch": 0.32797, + "grad_norm": 0.7603594854822793, + "learning_rate": 0.003, + "loss": 3.9972, + "step": 32797 + }, + { + "epoch": 0.32798, + "grad_norm": 0.7855196408310438, + "learning_rate": 0.003, + "loss": 4.035, + "step": 32798 + }, + { + "epoch": 0.32799, + "grad_norm": 0.8468278100072724, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 32799 + }, + { + "epoch": 0.328, + "grad_norm": 0.8550529747746985, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 32800 + }, + { + "epoch": 0.32801, + "grad_norm": 0.8156413881538058, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 32801 + }, + { + "epoch": 0.32802, + "grad_norm": 0.7842351392087904, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 32802 + }, + { + "epoch": 0.32803, + "grad_norm": 0.8473227112000773, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 32803 + }, + { + "epoch": 0.32804, + "grad_norm": 0.9566836814705664, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 32804 + }, + { + "epoch": 0.32805, + "grad_norm": 1.0496157145740168, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 32805 + }, + { + "epoch": 0.32806, + "grad_norm": 1.0949280779621868, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 32806 + }, + { + "epoch": 0.32807, + "grad_norm": 1.1463239599347805, + "learning_rate": 0.003, + "loss": 4.0803, + "step": 32807 + }, + { + "epoch": 0.32808, + "grad_norm": 1.0031558213689882, + "learning_rate": 0.003, + "loss": 4.1252, + "step": 32808 + }, + { + "epoch": 0.32809, + "grad_norm": 1.013256744128986, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 32809 + }, + { + "epoch": 0.3281, + "grad_norm": 0.8858231018729169, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 32810 + }, + { + "epoch": 0.32811, + "grad_norm": 0.8208672329834767, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 32811 + }, + { + "epoch": 0.32812, + "grad_norm": 0.7790944077576955, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 32812 + }, + { + "epoch": 0.32813, + "grad_norm": 0.7128251901038437, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 32813 + }, + { + "epoch": 0.32814, + "grad_norm": 0.7331569463675273, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 32814 + }, + { + "epoch": 0.32815, + "grad_norm": 0.7246077819154009, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 32815 + }, + { + "epoch": 0.32816, + "grad_norm": 0.7339000471952554, + "learning_rate": 0.003, + "loss": 4.0846, + "step": 32816 + }, + { + "epoch": 0.32817, + "grad_norm": 0.7629191471134752, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 32817 + }, + { + "epoch": 0.32818, + "grad_norm": 0.8787444140475765, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 32818 + }, + { + "epoch": 0.32819, + "grad_norm": 0.9304823197642275, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 32819 + }, + { + "epoch": 0.3282, + "grad_norm": 0.8383766193968717, + "learning_rate": 0.003, + "loss": 4.041, + "step": 32820 + }, + { + "epoch": 0.32821, + "grad_norm": 0.7229392827429199, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32821 + }, + { + "epoch": 0.32822, + "grad_norm": 0.6221199408339271, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 32822 + }, + { + "epoch": 0.32823, + "grad_norm": 0.6677408195564506, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 32823 + }, + { + "epoch": 0.32824, + "grad_norm": 0.6836661735359091, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 32824 + }, + { + "epoch": 0.32825, + "grad_norm": 0.699900358183799, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 32825 + }, + { + "epoch": 0.32826, + "grad_norm": 0.7801314694170908, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 32826 + }, + { + "epoch": 0.32827, + "grad_norm": 0.8380769409537585, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 32827 + }, + { + "epoch": 0.32828, + "grad_norm": 1.0535551010951132, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 32828 + }, + { + "epoch": 0.32829, + "grad_norm": 1.2305865195060723, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 32829 + }, + { + "epoch": 0.3283, + "grad_norm": 0.7102852956524855, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 32830 + }, + { + "epoch": 0.32831, + "grad_norm": 0.7074637090567749, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 32831 + }, + { + "epoch": 0.32832, + "grad_norm": 0.7594337069188017, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 32832 + }, + { + "epoch": 0.32833, + "grad_norm": 0.7695622679845707, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 32833 + }, + { + "epoch": 0.32834, + "grad_norm": 0.807565039625592, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 32834 + }, + { + "epoch": 0.32835, + "grad_norm": 0.9013615683330852, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 32835 + }, + { + "epoch": 0.32836, + "grad_norm": 1.0746814591792715, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 32836 + }, + { + "epoch": 0.32837, + "grad_norm": 1.0622191416300537, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 32837 + }, + { + "epoch": 0.32838, + "grad_norm": 0.9410174559653293, + "learning_rate": 0.003, + "loss": 4.031, + "step": 32838 + }, + { + "epoch": 0.32839, + "grad_norm": 0.8977356456870198, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 32839 + }, + { + "epoch": 0.3284, + "grad_norm": 0.8150098056799706, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 32840 + }, + { + "epoch": 0.32841, + "grad_norm": 0.7356614590927222, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 32841 + }, + { + "epoch": 0.32842, + "grad_norm": 0.6357212271655129, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 32842 + }, + { + "epoch": 0.32843, + "grad_norm": 0.6973430456971517, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 32843 + }, + { + "epoch": 0.32844, + "grad_norm": 0.7740847534768226, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 32844 + }, + { + "epoch": 0.32845, + "grad_norm": 0.9419775722595225, + "learning_rate": 0.003, + "loss": 4.051, + "step": 32845 + }, + { + "epoch": 0.32846, + "grad_norm": 1.1018000004628943, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 32846 + }, + { + "epoch": 0.32847, + "grad_norm": 0.8494176195289611, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 32847 + }, + { + "epoch": 0.32848, + "grad_norm": 0.6334640000951307, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 32848 + }, + { + "epoch": 0.32849, + "grad_norm": 0.5593367295177553, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 32849 + }, + { + "epoch": 0.3285, + "grad_norm": 0.5195682674158729, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 32850 + }, + { + "epoch": 0.32851, + "grad_norm": 0.47280169689782997, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 32851 + }, + { + "epoch": 0.32852, + "grad_norm": 0.4553147953994516, + "learning_rate": 0.003, + "loss": 3.9964, + "step": 32852 + }, + { + "epoch": 0.32853, + "grad_norm": 0.5241443388032121, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 32853 + }, + { + "epoch": 0.32854, + "grad_norm": 0.6475695789677293, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 32854 + }, + { + "epoch": 0.32855, + "grad_norm": 0.7304299685805818, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 32855 + }, + { + "epoch": 0.32856, + "grad_norm": 0.8377203729794402, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 32856 + }, + { + "epoch": 0.32857, + "grad_norm": 1.0310215721881397, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 32857 + }, + { + "epoch": 0.32858, + "grad_norm": 1.2152538321588076, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 32858 + }, + { + "epoch": 0.32859, + "grad_norm": 0.7291529753919089, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 32859 + }, + { + "epoch": 0.3286, + "grad_norm": 0.8052154872052569, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 32860 + }, + { + "epoch": 0.32861, + "grad_norm": 0.7989826991642558, + "learning_rate": 0.003, + "loss": 3.9882, + "step": 32861 + }, + { + "epoch": 0.32862, + "grad_norm": 0.8289516377325735, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 32862 + }, + { + "epoch": 0.32863, + "grad_norm": 0.8380873165247007, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 32863 + }, + { + "epoch": 0.32864, + "grad_norm": 0.9162193404347158, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 32864 + }, + { + "epoch": 0.32865, + "grad_norm": 1.1517260715962037, + "learning_rate": 0.003, + "loss": 4.062, + "step": 32865 + }, + { + "epoch": 0.32866, + "grad_norm": 1.131957426916179, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 32866 + }, + { + "epoch": 0.32867, + "grad_norm": 1.0244120319104884, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 32867 + }, + { + "epoch": 0.32868, + "grad_norm": 0.8919543457532081, + "learning_rate": 0.003, + "loss": 4.042, + "step": 32868 + }, + { + "epoch": 0.32869, + "grad_norm": 0.9891846221221303, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 32869 + }, + { + "epoch": 0.3287, + "grad_norm": 1.046565328288606, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 32870 + }, + { + "epoch": 0.32871, + "grad_norm": 0.947701273631773, + "learning_rate": 0.003, + "loss": 4.062, + "step": 32871 + }, + { + "epoch": 0.32872, + "grad_norm": 1.0529242828476786, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 32872 + }, + { + "epoch": 0.32873, + "grad_norm": 1.0563269855849862, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 32873 + }, + { + "epoch": 0.32874, + "grad_norm": 1.0806617276353185, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 32874 + }, + { + "epoch": 0.32875, + "grad_norm": 0.9481785810853204, + "learning_rate": 0.003, + "loss": 4.0811, + "step": 32875 + }, + { + "epoch": 0.32876, + "grad_norm": 0.832662466974176, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 32876 + }, + { + "epoch": 0.32877, + "grad_norm": 0.856259294451329, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 32877 + }, + { + "epoch": 0.32878, + "grad_norm": 0.8522547378696904, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 32878 + }, + { + "epoch": 0.32879, + "grad_norm": 0.8535406273683419, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 32879 + }, + { + "epoch": 0.3288, + "grad_norm": 0.9512553732259182, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 32880 + }, + { + "epoch": 0.32881, + "grad_norm": 0.9931018712998594, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 32881 + }, + { + "epoch": 0.32882, + "grad_norm": 1.0299175120463493, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 32882 + }, + { + "epoch": 0.32883, + "grad_norm": 0.9199998764376913, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 32883 + }, + { + "epoch": 0.32884, + "grad_norm": 0.8800868299741419, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32884 + }, + { + "epoch": 0.32885, + "grad_norm": 1.0026752539669597, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 32885 + }, + { + "epoch": 0.32886, + "grad_norm": 1.282599294948141, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 32886 + }, + { + "epoch": 0.32887, + "grad_norm": 0.7950167391585516, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 32887 + }, + { + "epoch": 0.32888, + "grad_norm": 0.6825322821753992, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 32888 + }, + { + "epoch": 0.32889, + "grad_norm": 0.6783807698471133, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 32889 + }, + { + "epoch": 0.3289, + "grad_norm": 0.5986695208295352, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 32890 + }, + { + "epoch": 0.32891, + "grad_norm": 0.6750465113519022, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 32891 + }, + { + "epoch": 0.32892, + "grad_norm": 0.7778988312711068, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 32892 + }, + { + "epoch": 0.32893, + "grad_norm": 0.8767689743543904, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 32893 + }, + { + "epoch": 0.32894, + "grad_norm": 0.9046482255455346, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 32894 + }, + { + "epoch": 0.32895, + "grad_norm": 0.8832149933866696, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 32895 + }, + { + "epoch": 0.32896, + "grad_norm": 0.7482154066347795, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 32896 + }, + { + "epoch": 0.32897, + "grad_norm": 0.6285002657114515, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 32897 + }, + { + "epoch": 0.32898, + "grad_norm": 0.6061916874614466, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 32898 + }, + { + "epoch": 0.32899, + "grad_norm": 0.5265155990607695, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 32899 + }, + { + "epoch": 0.329, + "grad_norm": 0.45829071943881494, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 32900 + }, + { + "epoch": 0.32901, + "grad_norm": 0.4878095817938297, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32901 + }, + { + "epoch": 0.32902, + "grad_norm": 0.550599095671545, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 32902 + }, + { + "epoch": 0.32903, + "grad_norm": 0.5836665807120486, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 32903 + }, + { + "epoch": 0.32904, + "grad_norm": 0.770545039221921, + "learning_rate": 0.003, + "loss": 3.9961, + "step": 32904 + }, + { + "epoch": 0.32905, + "grad_norm": 1.099832132281905, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 32905 + }, + { + "epoch": 0.32906, + "grad_norm": 1.1132582469446257, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 32906 + }, + { + "epoch": 0.32907, + "grad_norm": 0.7806003274467955, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 32907 + }, + { + "epoch": 0.32908, + "grad_norm": 0.6640957696895113, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 32908 + }, + { + "epoch": 0.32909, + "grad_norm": 0.7425006316226405, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 32909 + }, + { + "epoch": 0.3291, + "grad_norm": 0.7915511300740715, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 32910 + }, + { + "epoch": 0.32911, + "grad_norm": 0.8840312936087314, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 32911 + }, + { + "epoch": 0.32912, + "grad_norm": 1.021036770325842, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 32912 + }, + { + "epoch": 0.32913, + "grad_norm": 0.9682714857754495, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 32913 + }, + { + "epoch": 0.32914, + "grad_norm": 0.8826008650793474, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 32914 + }, + { + "epoch": 0.32915, + "grad_norm": 0.848089142066145, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 32915 + }, + { + "epoch": 0.32916, + "grad_norm": 0.828881911317961, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 32916 + }, + { + "epoch": 0.32917, + "grad_norm": 0.7775530005306261, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 32917 + }, + { + "epoch": 0.32918, + "grad_norm": 0.9062100218694806, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 32918 + }, + { + "epoch": 0.32919, + "grad_norm": 0.9532959049446321, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 32919 + }, + { + "epoch": 0.3292, + "grad_norm": 1.0415259328889153, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 32920 + }, + { + "epoch": 0.32921, + "grad_norm": 1.068833788514106, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 32921 + }, + { + "epoch": 0.32922, + "grad_norm": 0.8911295654545001, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 32922 + }, + { + "epoch": 0.32923, + "grad_norm": 1.0177446975275957, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 32923 + }, + { + "epoch": 0.32924, + "grad_norm": 1.0747989379074088, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 32924 + }, + { + "epoch": 0.32925, + "grad_norm": 0.9789683598810396, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 32925 + }, + { + "epoch": 0.32926, + "grad_norm": 0.9775972281245453, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 32926 + }, + { + "epoch": 0.32927, + "grad_norm": 1.0653559273687836, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 32927 + }, + { + "epoch": 0.32928, + "grad_norm": 0.8535810462653094, + "learning_rate": 0.003, + "loss": 4.0762, + "step": 32928 + }, + { + "epoch": 0.32929, + "grad_norm": 0.7208469766119024, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 32929 + }, + { + "epoch": 0.3293, + "grad_norm": 0.697632945595685, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 32930 + }, + { + "epoch": 0.32931, + "grad_norm": 0.7403299444381116, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 32931 + }, + { + "epoch": 0.32932, + "grad_norm": 0.8034513289524816, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 32932 + }, + { + "epoch": 0.32933, + "grad_norm": 0.8739940049693071, + "learning_rate": 0.003, + "loss": 4.0748, + "step": 32933 + }, + { + "epoch": 0.32934, + "grad_norm": 0.8868919259366007, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 32934 + }, + { + "epoch": 0.32935, + "grad_norm": 1.0251203297051645, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 32935 + }, + { + "epoch": 0.32936, + "grad_norm": 1.1067275598297208, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 32936 + }, + { + "epoch": 0.32937, + "grad_norm": 0.8650980937687841, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 32937 + }, + { + "epoch": 0.32938, + "grad_norm": 0.8812372770906299, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 32938 + }, + { + "epoch": 0.32939, + "grad_norm": 0.9584707317925654, + "learning_rate": 0.003, + "loss": 4.0944, + "step": 32939 + }, + { + "epoch": 0.3294, + "grad_norm": 0.9956592200115973, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 32940 + }, + { + "epoch": 0.32941, + "grad_norm": 1.154067678980052, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 32941 + }, + { + "epoch": 0.32942, + "grad_norm": 1.0523857857797632, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 32942 + }, + { + "epoch": 0.32943, + "grad_norm": 0.892917706443352, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 32943 + }, + { + "epoch": 0.32944, + "grad_norm": 0.8480201439517843, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 32944 + }, + { + "epoch": 0.32945, + "grad_norm": 0.8357257780472986, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 32945 + }, + { + "epoch": 0.32946, + "grad_norm": 0.8789828001425375, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 32946 + }, + { + "epoch": 0.32947, + "grad_norm": 0.8085879264549248, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 32947 + }, + { + "epoch": 0.32948, + "grad_norm": 0.7198950995932402, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 32948 + }, + { + "epoch": 0.32949, + "grad_norm": 0.7034553120131015, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 32949 + }, + { + "epoch": 0.3295, + "grad_norm": 0.5573712916322202, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 32950 + }, + { + "epoch": 0.32951, + "grad_norm": 0.577346444423763, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 32951 + }, + { + "epoch": 0.32952, + "grad_norm": 0.5405754420085164, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 32952 + }, + { + "epoch": 0.32953, + "grad_norm": 0.5728345638485468, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 32953 + }, + { + "epoch": 0.32954, + "grad_norm": 0.5243170441269865, + "learning_rate": 0.003, + "loss": 3.9928, + "step": 32954 + }, + { + "epoch": 0.32955, + "grad_norm": 0.5388268020406098, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 32955 + }, + { + "epoch": 0.32956, + "grad_norm": 0.6098676786214638, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 32956 + }, + { + "epoch": 0.32957, + "grad_norm": 0.7932129683142589, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 32957 + }, + { + "epoch": 0.32958, + "grad_norm": 1.0465855761079197, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 32958 + }, + { + "epoch": 0.32959, + "grad_norm": 1.1591500231503091, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 32959 + }, + { + "epoch": 0.3296, + "grad_norm": 0.7010389428677801, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 32960 + }, + { + "epoch": 0.32961, + "grad_norm": 0.7108886402635417, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 32961 + }, + { + "epoch": 0.32962, + "grad_norm": 0.951959476885274, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 32962 + }, + { + "epoch": 0.32963, + "grad_norm": 0.9088738633470986, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 32963 + }, + { + "epoch": 0.32964, + "grad_norm": 0.7386644504758165, + "learning_rate": 0.003, + "loss": 4.033, + "step": 32964 + }, + { + "epoch": 0.32965, + "grad_norm": 0.6114699565755193, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 32965 + }, + { + "epoch": 0.32966, + "grad_norm": 0.6235121351844526, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 32966 + }, + { + "epoch": 0.32967, + "grad_norm": 0.6606396564616415, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 32967 + }, + { + "epoch": 0.32968, + "grad_norm": 0.693656386127757, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 32968 + }, + { + "epoch": 0.32969, + "grad_norm": 0.6710489270492578, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32969 + }, + { + "epoch": 0.3297, + "grad_norm": 0.6419747151612746, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 32970 + }, + { + "epoch": 0.32971, + "grad_norm": 0.728378133345311, + "learning_rate": 0.003, + "loss": 4.039, + "step": 32971 + }, + { + "epoch": 0.32972, + "grad_norm": 0.9399211035940511, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 32972 + }, + { + "epoch": 0.32973, + "grad_norm": 1.2331918347203468, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 32973 + }, + { + "epoch": 0.32974, + "grad_norm": 0.9817697625215905, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 32974 + }, + { + "epoch": 0.32975, + "grad_norm": 0.9762982668494186, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 32975 + }, + { + "epoch": 0.32976, + "grad_norm": 0.9963516397266479, + "learning_rate": 0.003, + "loss": 4.051, + "step": 32976 + }, + { + "epoch": 0.32977, + "grad_norm": 0.9031088791599937, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 32977 + }, + { + "epoch": 0.32978, + "grad_norm": 0.8011083457228265, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 32978 + }, + { + "epoch": 0.32979, + "grad_norm": 0.8664963973162351, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 32979 + }, + { + "epoch": 0.3298, + "grad_norm": 0.8986005786448901, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 32980 + }, + { + "epoch": 0.32981, + "grad_norm": 0.769223951727999, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 32981 + }, + { + "epoch": 0.32982, + "grad_norm": 0.7951305047954368, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 32982 + }, + { + "epoch": 0.32983, + "grad_norm": 0.7589346432436799, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 32983 + }, + { + "epoch": 0.32984, + "grad_norm": 0.7188597246609556, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 32984 + }, + { + "epoch": 0.32985, + "grad_norm": 0.7292978841448944, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 32985 + }, + { + "epoch": 0.32986, + "grad_norm": 0.7463183379443021, + "learning_rate": 0.003, + "loss": 4.019, + "step": 32986 + }, + { + "epoch": 0.32987, + "grad_norm": 0.9889820079680672, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 32987 + }, + { + "epoch": 0.32988, + "grad_norm": 1.4506107242380588, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 32988 + }, + { + "epoch": 0.32989, + "grad_norm": 0.6668609181117161, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 32989 + }, + { + "epoch": 0.3299, + "grad_norm": 0.6709974749042328, + "learning_rate": 0.003, + "loss": 3.987, + "step": 32990 + }, + { + "epoch": 0.32991, + "grad_norm": 0.7136704746114485, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 32991 + }, + { + "epoch": 0.32992, + "grad_norm": 0.8580625543067306, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 32992 + }, + { + "epoch": 0.32993, + "grad_norm": 0.9767770393558276, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 32993 + }, + { + "epoch": 0.32994, + "grad_norm": 0.8938353856851828, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 32994 + }, + { + "epoch": 0.32995, + "grad_norm": 0.7550990979270681, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 32995 + }, + { + "epoch": 0.32996, + "grad_norm": 0.6854508115335984, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 32996 + }, + { + "epoch": 0.32997, + "grad_norm": 0.7790134267869914, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 32997 + }, + { + "epoch": 0.32998, + "grad_norm": 0.8729925345457293, + "learning_rate": 0.003, + "loss": 4.037, + "step": 32998 + }, + { + "epoch": 0.32999, + "grad_norm": 0.932328067434735, + "learning_rate": 0.003, + "loss": 4.048, + "step": 32999 + }, + { + "epoch": 0.33, + "grad_norm": 0.9663453963823336, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33000 + }, + { + "epoch": 0.33001, + "grad_norm": 1.018007900779193, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 33001 + }, + { + "epoch": 0.33002, + "grad_norm": 1.0447281035563456, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 33002 + }, + { + "epoch": 0.33003, + "grad_norm": 0.8527313736970046, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 33003 + }, + { + "epoch": 0.33004, + "grad_norm": 0.8001387911045483, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 33004 + }, + { + "epoch": 0.33005, + "grad_norm": 0.6883411841498963, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 33005 + }, + { + "epoch": 0.33006, + "grad_norm": 0.5559548292191935, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 33006 + }, + { + "epoch": 0.33007, + "grad_norm": 0.6032598493981243, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 33007 + }, + { + "epoch": 0.33008, + "grad_norm": 0.7358384792324202, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 33008 + }, + { + "epoch": 0.33009, + "grad_norm": 0.8430423008530843, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 33009 + }, + { + "epoch": 0.3301, + "grad_norm": 0.973563549451449, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 33010 + }, + { + "epoch": 0.33011, + "grad_norm": 1.012210197434772, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 33011 + }, + { + "epoch": 0.33012, + "grad_norm": 1.0217254923546066, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 33012 + }, + { + "epoch": 0.33013, + "grad_norm": 1.0398783916194048, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 33013 + }, + { + "epoch": 0.33014, + "grad_norm": 0.9739463734757691, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 33014 + }, + { + "epoch": 0.33015, + "grad_norm": 1.0265481328054102, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 33015 + }, + { + "epoch": 0.33016, + "grad_norm": 1.0334095999935946, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 33016 + }, + { + "epoch": 0.33017, + "grad_norm": 0.9492895189269877, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 33017 + }, + { + "epoch": 0.33018, + "grad_norm": 0.8800428485791957, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33018 + }, + { + "epoch": 0.33019, + "grad_norm": 0.871331158872836, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 33019 + }, + { + "epoch": 0.3302, + "grad_norm": 0.7675812977087423, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 33020 + }, + { + "epoch": 0.33021, + "grad_norm": 0.7667892611459053, + "learning_rate": 0.003, + "loss": 4.05, + "step": 33021 + }, + { + "epoch": 0.33022, + "grad_norm": 0.7447056052660946, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33022 + }, + { + "epoch": 0.33023, + "grad_norm": 0.6928891980152762, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 33023 + }, + { + "epoch": 0.33024, + "grad_norm": 0.7068987995862442, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 33024 + }, + { + "epoch": 0.33025, + "grad_norm": 0.696461687536192, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 33025 + }, + { + "epoch": 0.33026, + "grad_norm": 0.7828605854844669, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 33026 + }, + { + "epoch": 0.33027, + "grad_norm": 0.6607250068496198, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 33027 + }, + { + "epoch": 0.33028, + "grad_norm": 0.6199331363031488, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 33028 + }, + { + "epoch": 0.33029, + "grad_norm": 0.6324892754030976, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 33029 + }, + { + "epoch": 0.3303, + "grad_norm": 0.8481556652516239, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33030 + }, + { + "epoch": 0.33031, + "grad_norm": 1.2606400689978698, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 33031 + }, + { + "epoch": 0.33032, + "grad_norm": 1.011375590370824, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 33032 + }, + { + "epoch": 0.33033, + "grad_norm": 0.9109505432784176, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33033 + }, + { + "epoch": 0.33034, + "grad_norm": 0.8615735021777486, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 33034 + }, + { + "epoch": 0.33035, + "grad_norm": 0.9595597796898825, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 33035 + }, + { + "epoch": 0.33036, + "grad_norm": 0.9633584486031049, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 33036 + }, + { + "epoch": 0.33037, + "grad_norm": 1.0164412896474802, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 33037 + }, + { + "epoch": 0.33038, + "grad_norm": 1.0021728546952704, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 33038 + }, + { + "epoch": 0.33039, + "grad_norm": 0.9869203671116209, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 33039 + }, + { + "epoch": 0.3304, + "grad_norm": 1.0170191408937705, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 33040 + }, + { + "epoch": 0.33041, + "grad_norm": 1.0080815105241663, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 33041 + }, + { + "epoch": 0.33042, + "grad_norm": 0.9254363443868511, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 33042 + }, + { + "epoch": 0.33043, + "grad_norm": 0.8044571363565541, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 33043 + }, + { + "epoch": 0.33044, + "grad_norm": 0.7563258487917408, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 33044 + }, + { + "epoch": 0.33045, + "grad_norm": 0.757706071134185, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 33045 + }, + { + "epoch": 0.33046, + "grad_norm": 0.8108357816523974, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 33046 + }, + { + "epoch": 0.33047, + "grad_norm": 0.8308322193690573, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 33047 + }, + { + "epoch": 0.33048, + "grad_norm": 0.9739862310875564, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 33048 + }, + { + "epoch": 0.33049, + "grad_norm": 1.1255379964493673, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 33049 + }, + { + "epoch": 0.3305, + "grad_norm": 0.8545191710983772, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33050 + }, + { + "epoch": 0.33051, + "grad_norm": 0.8375441550345032, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 33051 + }, + { + "epoch": 0.33052, + "grad_norm": 0.742451103499636, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 33052 + }, + { + "epoch": 0.33053, + "grad_norm": 0.7152433887194712, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 33053 + }, + { + "epoch": 0.33054, + "grad_norm": 0.7154163168892614, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 33054 + }, + { + "epoch": 0.33055, + "grad_norm": 0.7283853015195996, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 33055 + }, + { + "epoch": 0.33056, + "grad_norm": 0.659897050774051, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 33056 + }, + { + "epoch": 0.33057, + "grad_norm": 0.7192064488578238, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 33057 + }, + { + "epoch": 0.33058, + "grad_norm": 0.792366103412778, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 33058 + }, + { + "epoch": 0.33059, + "grad_norm": 0.7181267298710002, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 33059 + }, + { + "epoch": 0.3306, + "grad_norm": 0.7835064299380337, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 33060 + }, + { + "epoch": 0.33061, + "grad_norm": 0.8370702922409893, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 33061 + }, + { + "epoch": 0.33062, + "grad_norm": 0.8959150710592624, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33062 + }, + { + "epoch": 0.33063, + "grad_norm": 0.9958039848835833, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 33063 + }, + { + "epoch": 0.33064, + "grad_norm": 1.119185680312841, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 33064 + }, + { + "epoch": 0.33065, + "grad_norm": 0.8006562907848775, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 33065 + }, + { + "epoch": 0.33066, + "grad_norm": 0.6456812088251573, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 33066 + }, + { + "epoch": 0.33067, + "grad_norm": 0.5823895886695795, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 33067 + }, + { + "epoch": 0.33068, + "grad_norm": 0.5718903017268412, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33068 + }, + { + "epoch": 0.33069, + "grad_norm": 0.6596837769411772, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 33069 + }, + { + "epoch": 0.3307, + "grad_norm": 0.655389635099349, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 33070 + }, + { + "epoch": 0.33071, + "grad_norm": 0.703676628435894, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 33071 + }, + { + "epoch": 0.33072, + "grad_norm": 0.8001275423349887, + "learning_rate": 0.003, + "loss": 4.072, + "step": 33072 + }, + { + "epoch": 0.33073, + "grad_norm": 0.855745444626138, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 33073 + }, + { + "epoch": 0.33074, + "grad_norm": 0.8645823316164157, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 33074 + }, + { + "epoch": 0.33075, + "grad_norm": 0.8974268695616503, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33075 + }, + { + "epoch": 0.33076, + "grad_norm": 0.9130947547224789, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 33076 + }, + { + "epoch": 0.33077, + "grad_norm": 0.9699312695158779, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 33077 + }, + { + "epoch": 0.33078, + "grad_norm": 1.0361845788058084, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 33078 + }, + { + "epoch": 0.33079, + "grad_norm": 0.769096018504092, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 33079 + }, + { + "epoch": 0.3308, + "grad_norm": 0.7785444935317384, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 33080 + }, + { + "epoch": 0.33081, + "grad_norm": 0.8867992983069425, + "learning_rate": 0.003, + "loss": 4.025, + "step": 33081 + }, + { + "epoch": 0.33082, + "grad_norm": 1.0243221566467258, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 33082 + }, + { + "epoch": 0.33083, + "grad_norm": 1.2922101771469248, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 33083 + }, + { + "epoch": 0.33084, + "grad_norm": 0.8145206756566985, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 33084 + }, + { + "epoch": 0.33085, + "grad_norm": 0.6909658975921286, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 33085 + }, + { + "epoch": 0.33086, + "grad_norm": 0.6193606999604935, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 33086 + }, + { + "epoch": 0.33087, + "grad_norm": 0.5335183408392776, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 33087 + }, + { + "epoch": 0.33088, + "grad_norm": 0.5865069493072997, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 33088 + }, + { + "epoch": 0.33089, + "grad_norm": 0.5060344461536281, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 33089 + }, + { + "epoch": 0.3309, + "grad_norm": 0.5807368883608437, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33090 + }, + { + "epoch": 0.33091, + "grad_norm": 0.6896235885646109, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33091 + }, + { + "epoch": 0.33092, + "grad_norm": 0.9329965005808539, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 33092 + }, + { + "epoch": 0.33093, + "grad_norm": 1.154156465236188, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 33093 + }, + { + "epoch": 0.33094, + "grad_norm": 0.866038632102812, + "learning_rate": 0.003, + "loss": 4.049, + "step": 33094 + }, + { + "epoch": 0.33095, + "grad_norm": 0.785540574737168, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 33095 + }, + { + "epoch": 0.33096, + "grad_norm": 0.6803720498531979, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 33096 + }, + { + "epoch": 0.33097, + "grad_norm": 0.7272363362049685, + "learning_rate": 0.003, + "loss": 4.049, + "step": 33097 + }, + { + "epoch": 0.33098, + "grad_norm": 0.9686816138095062, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 33098 + }, + { + "epoch": 0.33099, + "grad_norm": 1.3058478638130415, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 33099 + }, + { + "epoch": 0.331, + "grad_norm": 0.771521968074124, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 33100 + }, + { + "epoch": 0.33101, + "grad_norm": 0.8544438415217132, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 33101 + }, + { + "epoch": 0.33102, + "grad_norm": 0.952976672153501, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 33102 + }, + { + "epoch": 0.33103, + "grad_norm": 0.9871427216098628, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 33103 + }, + { + "epoch": 0.33104, + "grad_norm": 0.9801996292997968, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 33104 + }, + { + "epoch": 0.33105, + "grad_norm": 0.9708329845872995, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 33105 + }, + { + "epoch": 0.33106, + "grad_norm": 0.821149343524782, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 33106 + }, + { + "epoch": 0.33107, + "grad_norm": 0.7082866938168073, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 33107 + }, + { + "epoch": 0.33108, + "grad_norm": 0.6877610431743001, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 33108 + }, + { + "epoch": 0.33109, + "grad_norm": 0.7250056190505494, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33109 + }, + { + "epoch": 0.3311, + "grad_norm": 0.7617119381085308, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 33110 + }, + { + "epoch": 0.33111, + "grad_norm": 0.8147962746434236, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 33111 + }, + { + "epoch": 0.33112, + "grad_norm": 0.9785031790878241, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 33112 + }, + { + "epoch": 0.33113, + "grad_norm": 1.1068639573230046, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33113 + }, + { + "epoch": 0.33114, + "grad_norm": 1.2158786828016956, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33114 + }, + { + "epoch": 0.33115, + "grad_norm": 1.0110611777529348, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 33115 + }, + { + "epoch": 0.33116, + "grad_norm": 0.9725810963220475, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 33116 + }, + { + "epoch": 0.33117, + "grad_norm": 0.9228978651730604, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 33117 + }, + { + "epoch": 0.33118, + "grad_norm": 1.010121431585825, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 33118 + }, + { + "epoch": 0.33119, + "grad_norm": 0.9969489418222564, + "learning_rate": 0.003, + "loss": 4.042, + "step": 33119 + }, + { + "epoch": 0.3312, + "grad_norm": 0.9018544564375202, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 33120 + }, + { + "epoch": 0.33121, + "grad_norm": 0.7792230044935711, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 33121 + }, + { + "epoch": 0.33122, + "grad_norm": 0.841664313337199, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 33122 + }, + { + "epoch": 0.33123, + "grad_norm": 0.808296212257901, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 33123 + }, + { + "epoch": 0.33124, + "grad_norm": 0.9064389374291256, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 33124 + }, + { + "epoch": 0.33125, + "grad_norm": 0.973353388930588, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 33125 + }, + { + "epoch": 0.33126, + "grad_norm": 1.1501809353623658, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33126 + }, + { + "epoch": 0.33127, + "grad_norm": 0.775657586989361, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 33127 + }, + { + "epoch": 0.33128, + "grad_norm": 0.6806491392147505, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33128 + }, + { + "epoch": 0.33129, + "grad_norm": 0.6717898991498761, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 33129 + }, + { + "epoch": 0.3313, + "grad_norm": 0.6871434820552076, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 33130 + }, + { + "epoch": 0.33131, + "grad_norm": 0.6231609134500362, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 33131 + }, + { + "epoch": 0.33132, + "grad_norm": 0.6233974829398137, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 33132 + }, + { + "epoch": 0.33133, + "grad_norm": 0.6557001004377007, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 33133 + }, + { + "epoch": 0.33134, + "grad_norm": 0.685775476261344, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 33134 + }, + { + "epoch": 0.33135, + "grad_norm": 0.6178724321201882, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 33135 + }, + { + "epoch": 0.33136, + "grad_norm": 0.7159426956464867, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 33136 + }, + { + "epoch": 0.33137, + "grad_norm": 0.7179011813900265, + "learning_rate": 0.003, + "loss": 4.044, + "step": 33137 + }, + { + "epoch": 0.33138, + "grad_norm": 0.5801292386739222, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 33138 + }, + { + "epoch": 0.33139, + "grad_norm": 0.6105317048676393, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 33139 + }, + { + "epoch": 0.3314, + "grad_norm": 0.5311100333000499, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 33140 + }, + { + "epoch": 0.33141, + "grad_norm": 0.5039277320792148, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 33141 + }, + { + "epoch": 0.33142, + "grad_norm": 0.6694178679194572, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 33142 + }, + { + "epoch": 0.33143, + "grad_norm": 0.9795044168635224, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 33143 + }, + { + "epoch": 0.33144, + "grad_norm": 1.3677241467029255, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 33144 + }, + { + "epoch": 0.33145, + "grad_norm": 0.7125268383919716, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 33145 + }, + { + "epoch": 0.33146, + "grad_norm": 0.6358488337253375, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 33146 + }, + { + "epoch": 0.33147, + "grad_norm": 0.6158784593054026, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 33147 + }, + { + "epoch": 0.33148, + "grad_norm": 0.6391827628613184, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 33148 + }, + { + "epoch": 0.33149, + "grad_norm": 0.6346361012793894, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 33149 + }, + { + "epoch": 0.3315, + "grad_norm": 0.6703251503146196, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 33150 + }, + { + "epoch": 0.33151, + "grad_norm": 0.794634277932375, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 33151 + }, + { + "epoch": 0.33152, + "grad_norm": 0.9779880063553521, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 33152 + }, + { + "epoch": 0.33153, + "grad_norm": 1.1408041544016274, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 33153 + }, + { + "epoch": 0.33154, + "grad_norm": 1.0407384773485815, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 33154 + }, + { + "epoch": 0.33155, + "grad_norm": 1.05828276740499, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 33155 + }, + { + "epoch": 0.33156, + "grad_norm": 1.1083851749660065, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33156 + }, + { + "epoch": 0.33157, + "grad_norm": 1.1751394921204286, + "learning_rate": 0.003, + "loss": 4.063, + "step": 33157 + }, + { + "epoch": 0.33158, + "grad_norm": 0.978994786091527, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 33158 + }, + { + "epoch": 0.33159, + "grad_norm": 1.0148970496697896, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 33159 + }, + { + "epoch": 0.3316, + "grad_norm": 1.143837045021433, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 33160 + }, + { + "epoch": 0.33161, + "grad_norm": 0.9839010544144808, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 33161 + }, + { + "epoch": 0.33162, + "grad_norm": 0.8699860386808788, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 33162 + }, + { + "epoch": 0.33163, + "grad_norm": 0.9067678402218485, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33163 + }, + { + "epoch": 0.33164, + "grad_norm": 0.8762856068963026, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 33164 + }, + { + "epoch": 0.33165, + "grad_norm": 0.9139473516750833, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 33165 + }, + { + "epoch": 0.33166, + "grad_norm": 1.0157947061219308, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33166 + }, + { + "epoch": 0.33167, + "grad_norm": 1.0236508851358188, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 33167 + }, + { + "epoch": 0.33168, + "grad_norm": 0.913101622726206, + "learning_rate": 0.003, + "loss": 4.039, + "step": 33168 + }, + { + "epoch": 0.33169, + "grad_norm": 0.8638677351200966, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 33169 + }, + { + "epoch": 0.3317, + "grad_norm": 0.8149921862438072, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 33170 + }, + { + "epoch": 0.33171, + "grad_norm": 0.8489259110360203, + "learning_rate": 0.003, + "loss": 4.071, + "step": 33171 + }, + { + "epoch": 0.33172, + "grad_norm": 0.8860259291597015, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33172 + }, + { + "epoch": 0.33173, + "grad_norm": 0.7694409305878398, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33173 + }, + { + "epoch": 0.33174, + "grad_norm": 0.7055082842834273, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 33174 + }, + { + "epoch": 0.33175, + "grad_norm": 0.8786300605427857, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 33175 + }, + { + "epoch": 0.33176, + "grad_norm": 1.067912465386333, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 33176 + }, + { + "epoch": 0.33177, + "grad_norm": 1.1822002781175989, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 33177 + }, + { + "epoch": 0.33178, + "grad_norm": 0.8500687705564448, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 33178 + }, + { + "epoch": 0.33179, + "grad_norm": 0.6073383634073098, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 33179 + }, + { + "epoch": 0.3318, + "grad_norm": 0.556474211126253, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 33180 + }, + { + "epoch": 0.33181, + "grad_norm": 0.5157811920398366, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 33181 + }, + { + "epoch": 0.33182, + "grad_norm": 0.5392580123634528, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 33182 + }, + { + "epoch": 0.33183, + "grad_norm": 0.5203498903468842, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 33183 + }, + { + "epoch": 0.33184, + "grad_norm": 0.5033203681131596, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 33184 + }, + { + "epoch": 0.33185, + "grad_norm": 0.5151286232331618, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 33185 + }, + { + "epoch": 0.33186, + "grad_norm": 0.5526939922818893, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 33186 + }, + { + "epoch": 0.33187, + "grad_norm": 0.5906380280286542, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 33187 + }, + { + "epoch": 0.33188, + "grad_norm": 0.5922046905994872, + "learning_rate": 0.003, + "loss": 4.019, + "step": 33188 + }, + { + "epoch": 0.33189, + "grad_norm": 0.5981129760485763, + "learning_rate": 0.003, + "loss": 4.02, + "step": 33189 + }, + { + "epoch": 0.3319, + "grad_norm": 0.7133378837268015, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 33190 + }, + { + "epoch": 0.33191, + "grad_norm": 1.003276936433062, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33191 + }, + { + "epoch": 0.33192, + "grad_norm": 1.2206001492506104, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 33192 + }, + { + "epoch": 0.33193, + "grad_norm": 0.8462541503113888, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 33193 + }, + { + "epoch": 0.33194, + "grad_norm": 0.852221025693233, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 33194 + }, + { + "epoch": 0.33195, + "grad_norm": 0.7874589015617214, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33195 + }, + { + "epoch": 0.33196, + "grad_norm": 0.8011800875675269, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 33196 + }, + { + "epoch": 0.33197, + "grad_norm": 0.8434308582523594, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 33197 + }, + { + "epoch": 0.33198, + "grad_norm": 0.803477282363193, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 33198 + }, + { + "epoch": 0.33199, + "grad_norm": 0.9037528117123197, + "learning_rate": 0.003, + "loss": 3.9853, + "step": 33199 + }, + { + "epoch": 0.332, + "grad_norm": 1.1962474299828751, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 33200 + }, + { + "epoch": 0.33201, + "grad_norm": 1.0264870204204917, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 33201 + }, + { + "epoch": 0.33202, + "grad_norm": 0.9054558366136043, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33202 + }, + { + "epoch": 0.33203, + "grad_norm": 0.830156389591633, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 33203 + }, + { + "epoch": 0.33204, + "grad_norm": 0.7647726542188915, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 33204 + }, + { + "epoch": 0.33205, + "grad_norm": 0.8167655059898276, + "learning_rate": 0.003, + "loss": 4.042, + "step": 33205 + }, + { + "epoch": 0.33206, + "grad_norm": 0.8796589046459141, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 33206 + }, + { + "epoch": 0.33207, + "grad_norm": 0.8290609837601434, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 33207 + }, + { + "epoch": 0.33208, + "grad_norm": 0.9685632306696346, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33208 + }, + { + "epoch": 0.33209, + "grad_norm": 0.9621543561253598, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 33209 + }, + { + "epoch": 0.3321, + "grad_norm": 0.9415134862184922, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 33210 + }, + { + "epoch": 0.33211, + "grad_norm": 0.8546166668341312, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 33211 + }, + { + "epoch": 0.33212, + "grad_norm": 0.899318427654502, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 33212 + }, + { + "epoch": 0.33213, + "grad_norm": 0.9486448304796464, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 33213 + }, + { + "epoch": 0.33214, + "grad_norm": 1.0257705673695832, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33214 + }, + { + "epoch": 0.33215, + "grad_norm": 0.9166575066269033, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 33215 + }, + { + "epoch": 0.33216, + "grad_norm": 0.8692365918909141, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 33216 + }, + { + "epoch": 0.33217, + "grad_norm": 0.8684166946030597, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 33217 + }, + { + "epoch": 0.33218, + "grad_norm": 0.9223398422329775, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 33218 + }, + { + "epoch": 0.33219, + "grad_norm": 0.7889970908158304, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 33219 + }, + { + "epoch": 0.3322, + "grad_norm": 0.6461976327828892, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 33220 + }, + { + "epoch": 0.33221, + "grad_norm": 0.7413033729665723, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 33221 + }, + { + "epoch": 0.33222, + "grad_norm": 0.7728775271150758, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 33222 + }, + { + "epoch": 0.33223, + "grad_norm": 0.7915593898506665, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 33223 + }, + { + "epoch": 0.33224, + "grad_norm": 0.9002962633302, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 33224 + }, + { + "epoch": 0.33225, + "grad_norm": 1.0952031138890588, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 33225 + }, + { + "epoch": 0.33226, + "grad_norm": 1.1001129997323829, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 33226 + }, + { + "epoch": 0.33227, + "grad_norm": 0.9245671908895527, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 33227 + }, + { + "epoch": 0.33228, + "grad_norm": 0.8507626729736291, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 33228 + }, + { + "epoch": 0.33229, + "grad_norm": 0.746898912404189, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 33229 + }, + { + "epoch": 0.3323, + "grad_norm": 0.7182759805047383, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 33230 + }, + { + "epoch": 0.33231, + "grad_norm": 0.7082838067710381, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 33231 + }, + { + "epoch": 0.33232, + "grad_norm": 0.7044310895173217, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 33232 + }, + { + "epoch": 0.33233, + "grad_norm": 0.6600811207065008, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33233 + }, + { + "epoch": 0.33234, + "grad_norm": 0.8632965392303131, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 33234 + }, + { + "epoch": 0.33235, + "grad_norm": 1.0105144234583896, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33235 + }, + { + "epoch": 0.33236, + "grad_norm": 1.0442538420908853, + "learning_rate": 0.003, + "loss": 4.0813, + "step": 33236 + }, + { + "epoch": 0.33237, + "grad_norm": 1.1419494112937287, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33237 + }, + { + "epoch": 0.33238, + "grad_norm": 0.8746545562793311, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33238 + }, + { + "epoch": 0.33239, + "grad_norm": 0.6912659951647822, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 33239 + }, + { + "epoch": 0.3324, + "grad_norm": 0.7139630083498203, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 33240 + }, + { + "epoch": 0.33241, + "grad_norm": 0.7064651915937736, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 33241 + }, + { + "epoch": 0.33242, + "grad_norm": 0.6440441697946089, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 33242 + }, + { + "epoch": 0.33243, + "grad_norm": 0.7330661626697833, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 33243 + }, + { + "epoch": 0.33244, + "grad_norm": 0.9474671528011014, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 33244 + }, + { + "epoch": 0.33245, + "grad_norm": 1.0071675675059995, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 33245 + }, + { + "epoch": 0.33246, + "grad_norm": 1.007942697562821, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 33246 + }, + { + "epoch": 0.33247, + "grad_norm": 0.9313910465991228, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 33247 + }, + { + "epoch": 0.33248, + "grad_norm": 0.8995941426652255, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 33248 + }, + { + "epoch": 0.33249, + "grad_norm": 0.697466525200447, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 33249 + }, + { + "epoch": 0.3325, + "grad_norm": 0.6837103290497701, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 33250 + }, + { + "epoch": 0.33251, + "grad_norm": 0.7751673571675906, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 33251 + }, + { + "epoch": 0.33252, + "grad_norm": 0.8000229901195006, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 33252 + }, + { + "epoch": 0.33253, + "grad_norm": 0.8093074999819367, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33253 + }, + { + "epoch": 0.33254, + "grad_norm": 0.9550229714314603, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 33254 + }, + { + "epoch": 0.33255, + "grad_norm": 1.2028363638500297, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 33255 + }, + { + "epoch": 0.33256, + "grad_norm": 0.9043159725479395, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 33256 + }, + { + "epoch": 0.33257, + "grad_norm": 0.7768335505233628, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 33257 + }, + { + "epoch": 0.33258, + "grad_norm": 0.8372921255509228, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33258 + }, + { + "epoch": 0.33259, + "grad_norm": 0.7534945536167506, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 33259 + }, + { + "epoch": 0.3326, + "grad_norm": 0.6585923088851325, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 33260 + }, + { + "epoch": 0.33261, + "grad_norm": 0.7914267420327626, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33261 + }, + { + "epoch": 0.33262, + "grad_norm": 0.7434969548860024, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33262 + }, + { + "epoch": 0.33263, + "grad_norm": 0.804755511238947, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33263 + }, + { + "epoch": 0.33264, + "grad_norm": 0.765303934071197, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 33264 + }, + { + "epoch": 0.33265, + "grad_norm": 0.7241109521896134, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 33265 + }, + { + "epoch": 0.33266, + "grad_norm": 0.7321214805645182, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 33266 + }, + { + "epoch": 0.33267, + "grad_norm": 0.8877356925801678, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 33267 + }, + { + "epoch": 0.33268, + "grad_norm": 0.9061846128008729, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 33268 + }, + { + "epoch": 0.33269, + "grad_norm": 0.8622369113085281, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33269 + }, + { + "epoch": 0.3327, + "grad_norm": 0.9063633656509638, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 33270 + }, + { + "epoch": 0.33271, + "grad_norm": 0.8999371861705816, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 33271 + }, + { + "epoch": 0.33272, + "grad_norm": 1.228776114901829, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 33272 + }, + { + "epoch": 0.33273, + "grad_norm": 0.9363687513479486, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 33273 + }, + { + "epoch": 0.33274, + "grad_norm": 0.8655539261671985, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 33274 + }, + { + "epoch": 0.33275, + "grad_norm": 0.9536726369018483, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 33275 + }, + { + "epoch": 0.33276, + "grad_norm": 0.9990851726069606, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 33276 + }, + { + "epoch": 0.33277, + "grad_norm": 0.8819330944187551, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 33277 + }, + { + "epoch": 0.33278, + "grad_norm": 0.8471979831515771, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 33278 + }, + { + "epoch": 0.33279, + "grad_norm": 0.72759799486653, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 33279 + }, + { + "epoch": 0.3328, + "grad_norm": 0.6667116586478672, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 33280 + }, + { + "epoch": 0.33281, + "grad_norm": 0.6547937082267524, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 33281 + }, + { + "epoch": 0.33282, + "grad_norm": 0.7905092865454293, + "learning_rate": 0.003, + "loss": 4.033, + "step": 33282 + }, + { + "epoch": 0.33283, + "grad_norm": 0.9274193839187903, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 33283 + }, + { + "epoch": 0.33284, + "grad_norm": 1.227190317388163, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 33284 + }, + { + "epoch": 0.33285, + "grad_norm": 0.8329195956922195, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 33285 + }, + { + "epoch": 0.33286, + "grad_norm": 0.7998024712539566, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 33286 + }, + { + "epoch": 0.33287, + "grad_norm": 0.9018474629170474, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 33287 + }, + { + "epoch": 0.33288, + "grad_norm": 0.958960561810725, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 33288 + }, + { + "epoch": 0.33289, + "grad_norm": 0.9298418415462505, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 33289 + }, + { + "epoch": 0.3329, + "grad_norm": 0.9298921624866663, + "learning_rate": 0.003, + "loss": 4.03, + "step": 33290 + }, + { + "epoch": 0.33291, + "grad_norm": 0.9843132733638666, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 33291 + }, + { + "epoch": 0.33292, + "grad_norm": 0.9477196777456502, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 33292 + }, + { + "epoch": 0.33293, + "grad_norm": 0.88624817612436, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 33293 + }, + { + "epoch": 0.33294, + "grad_norm": 0.8629305348558095, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 33294 + }, + { + "epoch": 0.33295, + "grad_norm": 0.9465481358434882, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 33295 + }, + { + "epoch": 0.33296, + "grad_norm": 1.0069659006708835, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 33296 + }, + { + "epoch": 0.33297, + "grad_norm": 1.0616510217805197, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 33297 + }, + { + "epoch": 0.33298, + "grad_norm": 0.7555026255764139, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 33298 + }, + { + "epoch": 0.33299, + "grad_norm": 0.7060965566417634, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 33299 + }, + { + "epoch": 0.333, + "grad_norm": 0.7717905322659935, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 33300 + }, + { + "epoch": 0.33301, + "grad_norm": 0.8272922601497754, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 33301 + }, + { + "epoch": 0.33302, + "grad_norm": 0.8136546518762648, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 33302 + }, + { + "epoch": 0.33303, + "grad_norm": 0.7286741488636451, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 33303 + }, + { + "epoch": 0.33304, + "grad_norm": 0.819613187146706, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 33304 + }, + { + "epoch": 0.33305, + "grad_norm": 0.9303796765665868, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 33305 + }, + { + "epoch": 0.33306, + "grad_norm": 1.0074278004664337, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 33306 + }, + { + "epoch": 0.33307, + "grad_norm": 1.0137661350047185, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 33307 + }, + { + "epoch": 0.33308, + "grad_norm": 0.9323007523893484, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33308 + }, + { + "epoch": 0.33309, + "grad_norm": 0.9286932282686795, + "learning_rate": 0.003, + "loss": 4.065, + "step": 33309 + }, + { + "epoch": 0.3331, + "grad_norm": 0.770354753097008, + "learning_rate": 0.003, + "loss": 4.0924, + "step": 33310 + }, + { + "epoch": 0.33311, + "grad_norm": 0.7382329705374477, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 33311 + }, + { + "epoch": 0.33312, + "grad_norm": 0.6238322085646718, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 33312 + }, + { + "epoch": 0.33313, + "grad_norm": 0.5526982004910219, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 33313 + }, + { + "epoch": 0.33314, + "grad_norm": 0.5792018822135384, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 33314 + }, + { + "epoch": 0.33315, + "grad_norm": 0.7205738865495327, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 33315 + }, + { + "epoch": 0.33316, + "grad_norm": 0.9604716331512959, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 33316 + }, + { + "epoch": 0.33317, + "grad_norm": 1.3073509809583759, + "learning_rate": 0.003, + "loss": 4.016, + "step": 33317 + }, + { + "epoch": 0.33318, + "grad_norm": 0.58850028006163, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 33318 + }, + { + "epoch": 0.33319, + "grad_norm": 0.7160154606202531, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 33319 + }, + { + "epoch": 0.3332, + "grad_norm": 1.048946026528417, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 33320 + }, + { + "epoch": 0.33321, + "grad_norm": 0.9174982722954346, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 33321 + }, + { + "epoch": 0.33322, + "grad_norm": 0.7553982053411437, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 33322 + }, + { + "epoch": 0.33323, + "grad_norm": 0.7150546012251735, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 33323 + }, + { + "epoch": 0.33324, + "grad_norm": 0.7102766500107569, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 33324 + }, + { + "epoch": 0.33325, + "grad_norm": 0.7245684909474662, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 33325 + }, + { + "epoch": 0.33326, + "grad_norm": 0.7490610560449205, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 33326 + }, + { + "epoch": 0.33327, + "grad_norm": 0.7738135572717182, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 33327 + }, + { + "epoch": 0.33328, + "grad_norm": 0.7514528703458535, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 33328 + }, + { + "epoch": 0.33329, + "grad_norm": 0.677799317624192, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 33329 + }, + { + "epoch": 0.3333, + "grad_norm": 0.7186352508241046, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 33330 + }, + { + "epoch": 0.33331, + "grad_norm": 0.6853635653553354, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33331 + }, + { + "epoch": 0.33332, + "grad_norm": 0.7718452823161797, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 33332 + }, + { + "epoch": 0.33333, + "grad_norm": 0.8624313771085232, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 33333 + }, + { + "epoch": 0.33334, + "grad_norm": 0.9720975979391973, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 33334 + }, + { + "epoch": 0.33335, + "grad_norm": 0.9846330486783315, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 33335 + }, + { + "epoch": 0.33336, + "grad_norm": 1.0076436789217027, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 33336 + }, + { + "epoch": 0.33337, + "grad_norm": 1.0401266000765022, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 33337 + }, + { + "epoch": 0.33338, + "grad_norm": 1.003350629312075, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33338 + }, + { + "epoch": 0.33339, + "grad_norm": 0.9959680916579017, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33339 + }, + { + "epoch": 0.3334, + "grad_norm": 0.9917617045813754, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 33340 + }, + { + "epoch": 0.33341, + "grad_norm": 0.8043459385303889, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 33341 + }, + { + "epoch": 0.33342, + "grad_norm": 0.8033768050217123, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 33342 + }, + { + "epoch": 0.33343, + "grad_norm": 0.7882052647208783, + "learning_rate": 0.003, + "loss": 4.065, + "step": 33343 + }, + { + "epoch": 0.33344, + "grad_norm": 0.900428299723834, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 33344 + }, + { + "epoch": 0.33345, + "grad_norm": 1.0307832153563465, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 33345 + }, + { + "epoch": 0.33346, + "grad_norm": 0.996884809073134, + "learning_rate": 0.003, + "loss": 4.034, + "step": 33346 + }, + { + "epoch": 0.33347, + "grad_norm": 0.9950379967464209, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 33347 + }, + { + "epoch": 0.33348, + "grad_norm": 0.9314530625794599, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 33348 + }, + { + "epoch": 0.33349, + "grad_norm": 0.9084031525776701, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 33349 + }, + { + "epoch": 0.3335, + "grad_norm": 0.8986614036959937, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 33350 + }, + { + "epoch": 0.33351, + "grad_norm": 0.8849763114539304, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 33351 + }, + { + "epoch": 0.33352, + "grad_norm": 0.7889523547667419, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 33352 + }, + { + "epoch": 0.33353, + "grad_norm": 0.8210755439825939, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 33353 + }, + { + "epoch": 0.33354, + "grad_norm": 0.7900765764962026, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 33354 + }, + { + "epoch": 0.33355, + "grad_norm": 0.81944029430372, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 33355 + }, + { + "epoch": 0.33356, + "grad_norm": 0.9833580940704708, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 33356 + }, + { + "epoch": 0.33357, + "grad_norm": 1.106402193024358, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 33357 + }, + { + "epoch": 0.33358, + "grad_norm": 0.9229854824922316, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 33358 + }, + { + "epoch": 0.33359, + "grad_norm": 0.8627374269215073, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 33359 + }, + { + "epoch": 0.3336, + "grad_norm": 0.9058011091454641, + "learning_rate": 0.003, + "loss": 4.0864, + "step": 33360 + }, + { + "epoch": 0.33361, + "grad_norm": 0.8426100277198227, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 33361 + }, + { + "epoch": 0.33362, + "grad_norm": 0.7306558176093666, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 33362 + }, + { + "epoch": 0.33363, + "grad_norm": 0.5835573813882036, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 33363 + }, + { + "epoch": 0.33364, + "grad_norm": 0.6175814669265302, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 33364 + }, + { + "epoch": 0.33365, + "grad_norm": 0.6710901286711127, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 33365 + }, + { + "epoch": 0.33366, + "grad_norm": 0.7269772762767898, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 33366 + }, + { + "epoch": 0.33367, + "grad_norm": 0.785864206367794, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33367 + }, + { + "epoch": 0.33368, + "grad_norm": 0.8822439158355975, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 33368 + }, + { + "epoch": 0.33369, + "grad_norm": 0.9537205744201658, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33369 + }, + { + "epoch": 0.3337, + "grad_norm": 0.9235509431199971, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 33370 + }, + { + "epoch": 0.33371, + "grad_norm": 0.7092882713777754, + "learning_rate": 0.003, + "loss": 4.044, + "step": 33371 + }, + { + "epoch": 0.33372, + "grad_norm": 0.7136674515729304, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33372 + }, + { + "epoch": 0.33373, + "grad_norm": 0.692342493707749, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 33373 + }, + { + "epoch": 0.33374, + "grad_norm": 0.6805295975417623, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 33374 + }, + { + "epoch": 0.33375, + "grad_norm": 0.7509553086876447, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 33375 + }, + { + "epoch": 0.33376, + "grad_norm": 0.8650374080244768, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 33376 + }, + { + "epoch": 0.33377, + "grad_norm": 0.9550921190549287, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 33377 + }, + { + "epoch": 0.33378, + "grad_norm": 0.9708827917983992, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 33378 + }, + { + "epoch": 0.33379, + "grad_norm": 0.9142487141191915, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 33379 + }, + { + "epoch": 0.3338, + "grad_norm": 0.9202660257083384, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 33380 + }, + { + "epoch": 0.33381, + "grad_norm": 0.7874397603158287, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 33381 + }, + { + "epoch": 0.33382, + "grad_norm": 0.6403194995470689, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 33382 + }, + { + "epoch": 0.33383, + "grad_norm": 0.6612253456899041, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 33383 + }, + { + "epoch": 0.33384, + "grad_norm": 0.834622769228584, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 33384 + }, + { + "epoch": 0.33385, + "grad_norm": 0.9384407847816049, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 33385 + }, + { + "epoch": 0.33386, + "grad_norm": 1.0372367610519904, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 33386 + }, + { + "epoch": 0.33387, + "grad_norm": 1.032922372300866, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 33387 + }, + { + "epoch": 0.33388, + "grad_norm": 0.8444379457647426, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 33388 + }, + { + "epoch": 0.33389, + "grad_norm": 0.6976530969223596, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 33389 + }, + { + "epoch": 0.3339, + "grad_norm": 0.6399301773457566, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 33390 + }, + { + "epoch": 0.33391, + "grad_norm": 0.6136997134599194, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 33391 + }, + { + "epoch": 0.33392, + "grad_norm": 0.6241371902268488, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 33392 + }, + { + "epoch": 0.33393, + "grad_norm": 0.6868410692337971, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 33393 + }, + { + "epoch": 0.33394, + "grad_norm": 0.7330244499953732, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 33394 + }, + { + "epoch": 0.33395, + "grad_norm": 0.8186830934409803, + "learning_rate": 0.003, + "loss": 4.046, + "step": 33395 + }, + { + "epoch": 0.33396, + "grad_norm": 1.0114948544369764, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33396 + }, + { + "epoch": 0.33397, + "grad_norm": 1.085815198855781, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 33397 + }, + { + "epoch": 0.33398, + "grad_norm": 0.9430671854639365, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 33398 + }, + { + "epoch": 0.33399, + "grad_norm": 0.9505432899801365, + "learning_rate": 0.003, + "loss": 4.048, + "step": 33399 + }, + { + "epoch": 0.334, + "grad_norm": 0.898644859300184, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 33400 + }, + { + "epoch": 0.33401, + "grad_norm": 0.8016723185043879, + "learning_rate": 0.003, + "loss": 4.0781, + "step": 33401 + }, + { + "epoch": 0.33402, + "grad_norm": 0.8178678837158063, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 33402 + }, + { + "epoch": 0.33403, + "grad_norm": 0.8171846780000046, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 33403 + }, + { + "epoch": 0.33404, + "grad_norm": 0.7637504805082586, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 33404 + }, + { + "epoch": 0.33405, + "grad_norm": 0.78490916268894, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 33405 + }, + { + "epoch": 0.33406, + "grad_norm": 0.7737647126991103, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 33406 + }, + { + "epoch": 0.33407, + "grad_norm": 0.830597861707749, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 33407 + }, + { + "epoch": 0.33408, + "grad_norm": 0.9068920520964764, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33408 + }, + { + "epoch": 0.33409, + "grad_norm": 0.8898916527700895, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 33409 + }, + { + "epoch": 0.3341, + "grad_norm": 0.9533014795270639, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 33410 + }, + { + "epoch": 0.33411, + "grad_norm": 1.0183902931181759, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 33411 + }, + { + "epoch": 0.33412, + "grad_norm": 1.0695337901114388, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 33412 + }, + { + "epoch": 0.33413, + "grad_norm": 0.9424585803980258, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 33413 + }, + { + "epoch": 0.33414, + "grad_norm": 0.7926427786197242, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 33414 + }, + { + "epoch": 0.33415, + "grad_norm": 0.7841684851632081, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 33415 + }, + { + "epoch": 0.33416, + "grad_norm": 0.7932362454450294, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 33416 + }, + { + "epoch": 0.33417, + "grad_norm": 0.8505726323249602, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 33417 + }, + { + "epoch": 0.33418, + "grad_norm": 0.8868499442190434, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 33418 + }, + { + "epoch": 0.33419, + "grad_norm": 0.8539995990251088, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 33419 + }, + { + "epoch": 0.3342, + "grad_norm": 0.808349081846885, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 33420 + }, + { + "epoch": 0.33421, + "grad_norm": 0.6488981615711122, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33421 + }, + { + "epoch": 0.33422, + "grad_norm": 0.5750322265093099, + "learning_rate": 0.003, + "loss": 4.034, + "step": 33422 + }, + { + "epoch": 0.33423, + "grad_norm": 0.5238950400625405, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 33423 + }, + { + "epoch": 0.33424, + "grad_norm": 0.5881696124622532, + "learning_rate": 0.003, + "loss": 4.043, + "step": 33424 + }, + { + "epoch": 0.33425, + "grad_norm": 0.5850745819719491, + "learning_rate": 0.003, + "loss": 4.054, + "step": 33425 + }, + { + "epoch": 0.33426, + "grad_norm": 0.654993031226601, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 33426 + }, + { + "epoch": 0.33427, + "grad_norm": 0.710482025132162, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 33427 + }, + { + "epoch": 0.33428, + "grad_norm": 0.7962968582896989, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 33428 + }, + { + "epoch": 0.33429, + "grad_norm": 1.0169009739627957, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 33429 + }, + { + "epoch": 0.3343, + "grad_norm": 1.166233969724563, + "learning_rate": 0.003, + "loss": 4.064, + "step": 33430 + }, + { + "epoch": 0.33431, + "grad_norm": 0.8417757833979131, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 33431 + }, + { + "epoch": 0.33432, + "grad_norm": 0.8384339708235556, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 33432 + }, + { + "epoch": 0.33433, + "grad_norm": 0.8459915354674105, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 33433 + }, + { + "epoch": 0.33434, + "grad_norm": 0.9423443365450178, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 33434 + }, + { + "epoch": 0.33435, + "grad_norm": 0.869912428363126, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 33435 + }, + { + "epoch": 0.33436, + "grad_norm": 0.9415144169549705, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 33436 + }, + { + "epoch": 0.33437, + "grad_norm": 1.0150367239400437, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 33437 + }, + { + "epoch": 0.33438, + "grad_norm": 1.0066312648284297, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 33438 + }, + { + "epoch": 0.33439, + "grad_norm": 1.1211641582453507, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 33439 + }, + { + "epoch": 0.3344, + "grad_norm": 0.9376414844849542, + "learning_rate": 0.003, + "loss": 4.062, + "step": 33440 + }, + { + "epoch": 0.33441, + "grad_norm": 1.0827740752440962, + "learning_rate": 0.003, + "loss": 4.0892, + "step": 33441 + }, + { + "epoch": 0.33442, + "grad_norm": 1.079851839398803, + "learning_rate": 0.003, + "loss": 4.076, + "step": 33442 + }, + { + "epoch": 0.33443, + "grad_norm": 0.8658830522646865, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 33443 + }, + { + "epoch": 0.33444, + "grad_norm": 0.8426996690883429, + "learning_rate": 0.003, + "loss": 4.068, + "step": 33444 + }, + { + "epoch": 0.33445, + "grad_norm": 0.8143941663425606, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 33445 + }, + { + "epoch": 0.33446, + "grad_norm": 0.8660839115511949, + "learning_rate": 0.003, + "loss": 4.037, + "step": 33446 + }, + { + "epoch": 0.33447, + "grad_norm": 0.9553266242749251, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 33447 + }, + { + "epoch": 0.33448, + "grad_norm": 1.2729477117881887, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 33448 + }, + { + "epoch": 0.33449, + "grad_norm": 0.788203285422488, + "learning_rate": 0.003, + "loss": 4.041, + "step": 33449 + }, + { + "epoch": 0.3345, + "grad_norm": 0.7154054206118292, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 33450 + }, + { + "epoch": 0.33451, + "grad_norm": 0.7566436019733688, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 33451 + }, + { + "epoch": 0.33452, + "grad_norm": 0.7477463443748265, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33452 + }, + { + "epoch": 0.33453, + "grad_norm": 0.7910376563083757, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 33453 + }, + { + "epoch": 0.33454, + "grad_norm": 0.8438335071022727, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 33454 + }, + { + "epoch": 0.33455, + "grad_norm": 0.7999201420421643, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 33455 + }, + { + "epoch": 0.33456, + "grad_norm": 0.8078301439137415, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 33456 + }, + { + "epoch": 0.33457, + "grad_norm": 0.9172369153755748, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 33457 + }, + { + "epoch": 0.33458, + "grad_norm": 1.076612886918343, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 33458 + }, + { + "epoch": 0.33459, + "grad_norm": 0.9361910937059847, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 33459 + }, + { + "epoch": 0.3346, + "grad_norm": 0.7832802137151167, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 33460 + }, + { + "epoch": 0.33461, + "grad_norm": 0.6441892825156816, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33461 + }, + { + "epoch": 0.33462, + "grad_norm": 0.7004834286542263, + "learning_rate": 0.003, + "loss": 4.069, + "step": 33462 + }, + { + "epoch": 0.33463, + "grad_norm": 0.6246980054397719, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 33463 + }, + { + "epoch": 0.33464, + "grad_norm": 0.7087798366136248, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 33464 + }, + { + "epoch": 0.33465, + "grad_norm": 0.6760655697340239, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 33465 + }, + { + "epoch": 0.33466, + "grad_norm": 0.6752056572128844, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 33466 + }, + { + "epoch": 0.33467, + "grad_norm": 0.8037842620489187, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 33467 + }, + { + "epoch": 0.33468, + "grad_norm": 0.9550500049232913, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 33468 + }, + { + "epoch": 0.33469, + "grad_norm": 0.9098043746304054, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 33469 + }, + { + "epoch": 0.3347, + "grad_norm": 0.6735099740264832, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 33470 + }, + { + "epoch": 0.33471, + "grad_norm": 0.6099054699074581, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 33471 + }, + { + "epoch": 0.33472, + "grad_norm": 0.7409331962393706, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 33472 + }, + { + "epoch": 0.33473, + "grad_norm": 0.7580832586106795, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 33473 + }, + { + "epoch": 0.33474, + "grad_norm": 0.788801196463899, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 33474 + }, + { + "epoch": 0.33475, + "grad_norm": 0.7675144571928182, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 33475 + }, + { + "epoch": 0.33476, + "grad_norm": 0.699106095156432, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33476 + }, + { + "epoch": 0.33477, + "grad_norm": 0.6534834925644141, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 33477 + }, + { + "epoch": 0.33478, + "grad_norm": 0.6172583332945806, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33478 + }, + { + "epoch": 0.33479, + "grad_norm": 0.6927279958322375, + "learning_rate": 0.003, + "loss": 4.0036, + "step": 33479 + }, + { + "epoch": 0.3348, + "grad_norm": 0.7499984993163773, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33480 + }, + { + "epoch": 0.33481, + "grad_norm": 0.77208004144386, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 33481 + }, + { + "epoch": 0.33482, + "grad_norm": 0.7166004348920456, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 33482 + }, + { + "epoch": 0.33483, + "grad_norm": 0.7311637946440613, + "learning_rate": 0.003, + "loss": 4.008, + "step": 33483 + }, + { + "epoch": 0.33484, + "grad_norm": 0.8314427975961974, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 33484 + }, + { + "epoch": 0.33485, + "grad_norm": 0.9612630020359775, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 33485 + }, + { + "epoch": 0.33486, + "grad_norm": 1.0865671897687097, + "learning_rate": 0.003, + "loss": 4.032, + "step": 33486 + }, + { + "epoch": 0.33487, + "grad_norm": 0.9207722187890302, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 33487 + }, + { + "epoch": 0.33488, + "grad_norm": 0.8772632750026157, + "learning_rate": 0.003, + "loss": 4.028, + "step": 33488 + }, + { + "epoch": 0.33489, + "grad_norm": 0.8945441773116348, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 33489 + }, + { + "epoch": 0.3349, + "grad_norm": 0.9418057766643548, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 33490 + }, + { + "epoch": 0.33491, + "grad_norm": 1.0364769133825553, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33491 + }, + { + "epoch": 0.33492, + "grad_norm": 1.047691807331402, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 33492 + }, + { + "epoch": 0.33493, + "grad_norm": 0.9758598841555752, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 33493 + }, + { + "epoch": 0.33494, + "grad_norm": 0.8666599567356137, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 33494 + }, + { + "epoch": 0.33495, + "grad_norm": 0.7837848736875653, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 33495 + }, + { + "epoch": 0.33496, + "grad_norm": 0.6720289867989876, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 33496 + }, + { + "epoch": 0.33497, + "grad_norm": 0.7969269255815048, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 33497 + }, + { + "epoch": 0.33498, + "grad_norm": 0.9371447038669312, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 33498 + }, + { + "epoch": 0.33499, + "grad_norm": 0.8813051562746631, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 33499 + }, + { + "epoch": 0.335, + "grad_norm": 0.7452557934591124, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 33500 + }, + { + "epoch": 0.33501, + "grad_norm": 0.7063572025967, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33501 + }, + { + "epoch": 0.33502, + "grad_norm": 0.6605384912455938, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 33502 + }, + { + "epoch": 0.33503, + "grad_norm": 0.7856760938378934, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 33503 + }, + { + "epoch": 0.33504, + "grad_norm": 0.990539165310162, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 33504 + }, + { + "epoch": 0.33505, + "grad_norm": 1.2271671672265865, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 33505 + }, + { + "epoch": 0.33506, + "grad_norm": 0.9756346989812787, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 33506 + }, + { + "epoch": 0.33507, + "grad_norm": 1.0521701460024862, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 33507 + }, + { + "epoch": 0.33508, + "grad_norm": 0.8636386032198867, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 33508 + }, + { + "epoch": 0.33509, + "grad_norm": 0.8902431998971329, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 33509 + }, + { + "epoch": 0.3351, + "grad_norm": 0.7891654485515646, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 33510 + }, + { + "epoch": 0.33511, + "grad_norm": 0.7098602474287389, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 33511 + }, + { + "epoch": 0.33512, + "grad_norm": 0.8104152766077731, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 33512 + }, + { + "epoch": 0.33513, + "grad_norm": 0.9225202176828405, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 33513 + }, + { + "epoch": 0.33514, + "grad_norm": 0.8525060964878511, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 33514 + }, + { + "epoch": 0.33515, + "grad_norm": 0.8396774825751079, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 33515 + }, + { + "epoch": 0.33516, + "grad_norm": 0.8523550270247341, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 33516 + }, + { + "epoch": 0.33517, + "grad_norm": 0.9038406081204027, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33517 + }, + { + "epoch": 0.33518, + "grad_norm": 0.9036610566089047, + "learning_rate": 0.003, + "loss": 4.028, + "step": 33518 + }, + { + "epoch": 0.33519, + "grad_norm": 0.8199293709236257, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 33519 + }, + { + "epoch": 0.3352, + "grad_norm": 0.981300614635479, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 33520 + }, + { + "epoch": 0.33521, + "grad_norm": 1.1074675098194209, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 33521 + }, + { + "epoch": 0.33522, + "grad_norm": 1.071947485630254, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33522 + }, + { + "epoch": 0.33523, + "grad_norm": 0.8517244491451923, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 33523 + }, + { + "epoch": 0.33524, + "grad_norm": 0.7104047852745636, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 33524 + }, + { + "epoch": 0.33525, + "grad_norm": 0.7429684524788617, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 33525 + }, + { + "epoch": 0.33526, + "grad_norm": 0.6862680593036469, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 33526 + }, + { + "epoch": 0.33527, + "grad_norm": 0.6051643274765951, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 33527 + }, + { + "epoch": 0.33528, + "grad_norm": 0.6414455771916885, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33528 + }, + { + "epoch": 0.33529, + "grad_norm": 0.6937150298815976, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 33529 + }, + { + "epoch": 0.3353, + "grad_norm": 0.687733090238632, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 33530 + }, + { + "epoch": 0.33531, + "grad_norm": 0.7286216729197353, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33531 + }, + { + "epoch": 0.33532, + "grad_norm": 0.7474821749271051, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 33532 + }, + { + "epoch": 0.33533, + "grad_norm": 0.7755996260099706, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 33533 + }, + { + "epoch": 0.33534, + "grad_norm": 0.887601695899262, + "learning_rate": 0.003, + "loss": 4.021, + "step": 33534 + }, + { + "epoch": 0.33535, + "grad_norm": 0.8679706282075552, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 33535 + }, + { + "epoch": 0.33536, + "grad_norm": 0.7469096714370186, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 33536 + }, + { + "epoch": 0.33537, + "grad_norm": 0.7306315425450887, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 33537 + }, + { + "epoch": 0.33538, + "grad_norm": 0.6994257134932458, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 33538 + }, + { + "epoch": 0.33539, + "grad_norm": 0.7170118195177576, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33539 + }, + { + "epoch": 0.3354, + "grad_norm": 0.9195514664696472, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 33540 + }, + { + "epoch": 0.33541, + "grad_norm": 1.2413082430290139, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33541 + }, + { + "epoch": 0.33542, + "grad_norm": 0.8220084764346933, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 33542 + }, + { + "epoch": 0.33543, + "grad_norm": 0.7021691356515798, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 33543 + }, + { + "epoch": 0.33544, + "grad_norm": 0.6708133080351312, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 33544 + }, + { + "epoch": 0.33545, + "grad_norm": 0.6937323111051921, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 33545 + }, + { + "epoch": 0.33546, + "grad_norm": 0.6405592964781028, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 33546 + }, + { + "epoch": 0.33547, + "grad_norm": 0.5923893310013788, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 33547 + }, + { + "epoch": 0.33548, + "grad_norm": 0.6567774079868601, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33548 + }, + { + "epoch": 0.33549, + "grad_norm": 0.778111505029056, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 33549 + }, + { + "epoch": 0.3355, + "grad_norm": 0.8425385677834523, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 33550 + }, + { + "epoch": 0.33551, + "grad_norm": 0.9126983654811395, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 33551 + }, + { + "epoch": 0.33552, + "grad_norm": 1.0420663376188255, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 33552 + }, + { + "epoch": 0.33553, + "grad_norm": 1.0413664816664334, + "learning_rate": 0.003, + "loss": 4.059, + "step": 33553 + }, + { + "epoch": 0.33554, + "grad_norm": 0.9265316712741981, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 33554 + }, + { + "epoch": 0.33555, + "grad_norm": 0.9322532493863386, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 33555 + }, + { + "epoch": 0.33556, + "grad_norm": 1.0512132143797417, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 33556 + }, + { + "epoch": 0.33557, + "grad_norm": 1.0864026306413104, + "learning_rate": 0.003, + "loss": 4.0833, + "step": 33557 + }, + { + "epoch": 0.33558, + "grad_norm": 1.0611270664298809, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 33558 + }, + { + "epoch": 0.33559, + "grad_norm": 0.9819778289622576, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 33559 + }, + { + "epoch": 0.3356, + "grad_norm": 0.9642082976140308, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 33560 + }, + { + "epoch": 0.33561, + "grad_norm": 0.9331276000729407, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33561 + }, + { + "epoch": 0.33562, + "grad_norm": 0.8913522847478555, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 33562 + }, + { + "epoch": 0.33563, + "grad_norm": 1.12234459691052, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 33563 + }, + { + "epoch": 0.33564, + "grad_norm": 1.0219880317515369, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 33564 + }, + { + "epoch": 0.33565, + "grad_norm": 0.9117341748612894, + "learning_rate": 0.003, + "loss": 4.043, + "step": 33565 + }, + { + "epoch": 0.33566, + "grad_norm": 1.026076247329884, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 33566 + }, + { + "epoch": 0.33567, + "grad_norm": 1.104171858915612, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 33567 + }, + { + "epoch": 0.33568, + "grad_norm": 0.9939210631570862, + "learning_rate": 0.003, + "loss": 4.035, + "step": 33568 + }, + { + "epoch": 0.33569, + "grad_norm": 1.0430473680432613, + "learning_rate": 0.003, + "loss": 4.066, + "step": 33569 + }, + { + "epoch": 0.3357, + "grad_norm": 0.9253520419346458, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 33570 + }, + { + "epoch": 0.33571, + "grad_norm": 0.8039423286251106, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 33571 + }, + { + "epoch": 0.33572, + "grad_norm": 0.7364980089281515, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 33572 + }, + { + "epoch": 0.33573, + "grad_norm": 0.7413715412335512, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 33573 + }, + { + "epoch": 0.33574, + "grad_norm": 0.7096705774255924, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 33574 + }, + { + "epoch": 0.33575, + "grad_norm": 0.6701532182622197, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 33575 + }, + { + "epoch": 0.33576, + "grad_norm": 0.6076870341469583, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33576 + }, + { + "epoch": 0.33577, + "grad_norm": 0.5813119915251628, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 33577 + }, + { + "epoch": 0.33578, + "grad_norm": 0.5336756142107969, + "learning_rate": 0.003, + "loss": 4.085, + "step": 33578 + }, + { + "epoch": 0.33579, + "grad_norm": 0.5116800094071948, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 33579 + }, + { + "epoch": 0.3358, + "grad_norm": 0.5586390034089612, + "learning_rate": 0.003, + "loss": 4.026, + "step": 33580 + }, + { + "epoch": 0.33581, + "grad_norm": 0.5494169250122006, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 33581 + }, + { + "epoch": 0.33582, + "grad_norm": 0.6241252380536063, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 33582 + }, + { + "epoch": 0.33583, + "grad_norm": 0.7883056592711065, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 33583 + }, + { + "epoch": 0.33584, + "grad_norm": 0.9625133168142012, + "learning_rate": 0.003, + "loss": 4.031, + "step": 33584 + }, + { + "epoch": 0.33585, + "grad_norm": 0.8874149010677069, + "learning_rate": 0.003, + "loss": 3.9974, + "step": 33585 + }, + { + "epoch": 0.33586, + "grad_norm": 0.7069268686628442, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 33586 + }, + { + "epoch": 0.33587, + "grad_norm": 0.6571131322707537, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 33587 + }, + { + "epoch": 0.33588, + "grad_norm": 0.6330083116229438, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 33588 + }, + { + "epoch": 0.33589, + "grad_norm": 0.6446224322959807, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 33589 + }, + { + "epoch": 0.3359, + "grad_norm": 0.6937806903911785, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 33590 + }, + { + "epoch": 0.33591, + "grad_norm": 0.7637299072414784, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 33591 + }, + { + "epoch": 0.33592, + "grad_norm": 0.7201971779386008, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33592 + }, + { + "epoch": 0.33593, + "grad_norm": 0.7469280616477116, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 33593 + }, + { + "epoch": 0.33594, + "grad_norm": 0.8502883924404253, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 33594 + }, + { + "epoch": 0.33595, + "grad_norm": 0.9508864628861705, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33595 + }, + { + "epoch": 0.33596, + "grad_norm": 0.8993536385986304, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33596 + }, + { + "epoch": 0.33597, + "grad_norm": 0.946571821614601, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 33597 + }, + { + "epoch": 0.33598, + "grad_norm": 0.9964047155552305, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 33598 + }, + { + "epoch": 0.33599, + "grad_norm": 1.0515835074207691, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 33599 + }, + { + "epoch": 0.336, + "grad_norm": 1.0316345538006748, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33600 + }, + { + "epoch": 0.33601, + "grad_norm": 1.0581854790699317, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 33601 + }, + { + "epoch": 0.33602, + "grad_norm": 1.0690935533164048, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 33602 + }, + { + "epoch": 0.33603, + "grad_norm": 1.0866548933742963, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 33603 + }, + { + "epoch": 0.33604, + "grad_norm": 0.9572430958557303, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 33604 + }, + { + "epoch": 0.33605, + "grad_norm": 1.0522957014862997, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 33605 + }, + { + "epoch": 0.33606, + "grad_norm": 0.9926888688482821, + "learning_rate": 0.003, + "loss": 4.066, + "step": 33606 + }, + { + "epoch": 0.33607, + "grad_norm": 1.076751246776696, + "learning_rate": 0.003, + "loss": 4.071, + "step": 33607 + }, + { + "epoch": 0.33608, + "grad_norm": 0.9915906862540832, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 33608 + }, + { + "epoch": 0.33609, + "grad_norm": 0.9371044354169651, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 33609 + }, + { + "epoch": 0.3361, + "grad_norm": 0.9645135972349086, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 33610 + }, + { + "epoch": 0.33611, + "grad_norm": 0.825670657187605, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 33611 + }, + { + "epoch": 0.33612, + "grad_norm": 0.7731019708277355, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 33612 + }, + { + "epoch": 0.33613, + "grad_norm": 0.8113392526737846, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 33613 + }, + { + "epoch": 0.33614, + "grad_norm": 0.7773188193018316, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 33614 + }, + { + "epoch": 0.33615, + "grad_norm": 0.7674169686530927, + "learning_rate": 0.003, + "loss": 4.059, + "step": 33615 + }, + { + "epoch": 0.33616, + "grad_norm": 0.7478962721218371, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 33616 + }, + { + "epoch": 0.33617, + "grad_norm": 0.6569174795283418, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 33617 + }, + { + "epoch": 0.33618, + "grad_norm": 0.7916644993925536, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 33618 + }, + { + "epoch": 0.33619, + "grad_norm": 1.0444404689157807, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 33619 + }, + { + "epoch": 0.3362, + "grad_norm": 1.3897323235681776, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 33620 + }, + { + "epoch": 0.33621, + "grad_norm": 0.5869293321695513, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 33621 + }, + { + "epoch": 0.33622, + "grad_norm": 0.808204560836411, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 33622 + }, + { + "epoch": 0.33623, + "grad_norm": 0.873072596370925, + "learning_rate": 0.003, + "loss": 4.058, + "step": 33623 + }, + { + "epoch": 0.33624, + "grad_norm": 0.7823250984489073, + "learning_rate": 0.003, + "loss": 4.029, + "step": 33624 + }, + { + "epoch": 0.33625, + "grad_norm": 0.8199875435623101, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 33625 + }, + { + "epoch": 0.33626, + "grad_norm": 0.7675983790034979, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 33626 + }, + { + "epoch": 0.33627, + "grad_norm": 0.7638764630141989, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 33627 + }, + { + "epoch": 0.33628, + "grad_norm": 0.7984148238622651, + "learning_rate": 0.003, + "loss": 4.067, + "step": 33628 + }, + { + "epoch": 0.33629, + "grad_norm": 0.7167186465908104, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 33629 + }, + { + "epoch": 0.3363, + "grad_norm": 0.5899983030177005, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 33630 + }, + { + "epoch": 0.33631, + "grad_norm": 0.5856574105830651, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 33631 + }, + { + "epoch": 0.33632, + "grad_norm": 0.5706383355837534, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 33632 + }, + { + "epoch": 0.33633, + "grad_norm": 0.6110297849270651, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 33633 + }, + { + "epoch": 0.33634, + "grad_norm": 0.6345777839586034, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 33634 + }, + { + "epoch": 0.33635, + "grad_norm": 0.6636477086617412, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 33635 + }, + { + "epoch": 0.33636, + "grad_norm": 0.7361149268051024, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33636 + }, + { + "epoch": 0.33637, + "grad_norm": 0.8860355190386732, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 33637 + }, + { + "epoch": 0.33638, + "grad_norm": 1.1958196045003993, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33638 + }, + { + "epoch": 0.33639, + "grad_norm": 1.0366855651951992, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 33639 + }, + { + "epoch": 0.3364, + "grad_norm": 0.7903674522209951, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 33640 + }, + { + "epoch": 0.33641, + "grad_norm": 0.7909345835777578, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 33641 + }, + { + "epoch": 0.33642, + "grad_norm": 0.874671546363745, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 33642 + }, + { + "epoch": 0.33643, + "grad_norm": 0.8767178439344934, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 33643 + }, + { + "epoch": 0.33644, + "grad_norm": 0.7661993858943851, + "learning_rate": 0.003, + "loss": 4.029, + "step": 33644 + }, + { + "epoch": 0.33645, + "grad_norm": 0.7987692670303714, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 33645 + }, + { + "epoch": 0.33646, + "grad_norm": 0.8741513571790588, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33646 + }, + { + "epoch": 0.33647, + "grad_norm": 0.9145512880201334, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 33647 + }, + { + "epoch": 0.33648, + "grad_norm": 0.8829331282614442, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33648 + }, + { + "epoch": 0.33649, + "grad_norm": 0.7420776974336835, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 33649 + }, + { + "epoch": 0.3365, + "grad_norm": 0.7857909331910413, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33650 + }, + { + "epoch": 0.33651, + "grad_norm": 0.8613129125354198, + "learning_rate": 0.003, + "loss": 4.023, + "step": 33651 + }, + { + "epoch": 0.33652, + "grad_norm": 0.993851430434911, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 33652 + }, + { + "epoch": 0.33653, + "grad_norm": 0.9995989247863574, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 33653 + }, + { + "epoch": 0.33654, + "grad_norm": 0.9636129022636271, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 33654 + }, + { + "epoch": 0.33655, + "grad_norm": 0.8158394071431365, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 33655 + }, + { + "epoch": 0.33656, + "grad_norm": 0.8175438940916879, + "learning_rate": 0.003, + "loss": 4.051, + "step": 33656 + }, + { + "epoch": 0.33657, + "grad_norm": 0.9134112886931331, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 33657 + }, + { + "epoch": 0.33658, + "grad_norm": 0.8977398535858316, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 33658 + }, + { + "epoch": 0.33659, + "grad_norm": 0.938007301479686, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 33659 + }, + { + "epoch": 0.3366, + "grad_norm": 0.9014220950115539, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 33660 + }, + { + "epoch": 0.33661, + "grad_norm": 0.8931790975158279, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33661 + }, + { + "epoch": 0.33662, + "grad_norm": 0.9453495245386333, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33662 + }, + { + "epoch": 0.33663, + "grad_norm": 0.7771897120815943, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 33663 + }, + { + "epoch": 0.33664, + "grad_norm": 0.6816267950435487, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 33664 + }, + { + "epoch": 0.33665, + "grad_norm": 0.7519933077388464, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 33665 + }, + { + "epoch": 0.33666, + "grad_norm": 0.670810083297287, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 33666 + }, + { + "epoch": 0.33667, + "grad_norm": 0.6485022638629593, + "learning_rate": 0.003, + "loss": 4.032, + "step": 33667 + }, + { + "epoch": 0.33668, + "grad_norm": 0.7126986455539552, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 33668 + }, + { + "epoch": 0.33669, + "grad_norm": 1.0060010221335662, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 33669 + }, + { + "epoch": 0.3367, + "grad_norm": 1.3738223183805245, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 33670 + }, + { + "epoch": 0.33671, + "grad_norm": 0.6155110726688291, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 33671 + }, + { + "epoch": 0.33672, + "grad_norm": 0.6552756859273646, + "learning_rate": 0.003, + "loss": 4.023, + "step": 33672 + }, + { + "epoch": 0.33673, + "grad_norm": 0.6331318239781898, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 33673 + }, + { + "epoch": 0.33674, + "grad_norm": 0.6199653128102041, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 33674 + }, + { + "epoch": 0.33675, + "grad_norm": 0.6463716723007972, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 33675 + }, + { + "epoch": 0.33676, + "grad_norm": 0.6865905547730407, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 33676 + }, + { + "epoch": 0.33677, + "grad_norm": 0.6644797220877094, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 33677 + }, + { + "epoch": 0.33678, + "grad_norm": 0.6129449638610547, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 33678 + }, + { + "epoch": 0.33679, + "grad_norm": 0.6996823077271874, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 33679 + }, + { + "epoch": 0.3368, + "grad_norm": 0.9462330542137047, + "learning_rate": 0.003, + "loss": 4.026, + "step": 33680 + }, + { + "epoch": 0.33681, + "grad_norm": 1.2048944988596013, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 33681 + }, + { + "epoch": 0.33682, + "grad_norm": 0.984115128170861, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 33682 + }, + { + "epoch": 0.33683, + "grad_norm": 0.7998684697696333, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 33683 + }, + { + "epoch": 0.33684, + "grad_norm": 0.6805048124053439, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 33684 + }, + { + "epoch": 0.33685, + "grad_norm": 0.5751741105359596, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 33685 + }, + { + "epoch": 0.33686, + "grad_norm": 0.5448355358246327, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 33686 + }, + { + "epoch": 0.33687, + "grad_norm": 0.5440085183920897, + "learning_rate": 0.003, + "loss": 4.0021, + "step": 33687 + }, + { + "epoch": 0.33688, + "grad_norm": 0.6644786693562702, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 33688 + }, + { + "epoch": 0.33689, + "grad_norm": 0.7788718592854356, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 33689 + }, + { + "epoch": 0.3369, + "grad_norm": 0.8263109832876955, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 33690 + }, + { + "epoch": 0.33691, + "grad_norm": 0.8749051722820884, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 33691 + }, + { + "epoch": 0.33692, + "grad_norm": 0.9497159616559773, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 33692 + }, + { + "epoch": 0.33693, + "grad_norm": 1.0547912898490093, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 33693 + }, + { + "epoch": 0.33694, + "grad_norm": 1.0700180046678145, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 33694 + }, + { + "epoch": 0.33695, + "grad_norm": 0.9393820806815416, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 33695 + }, + { + "epoch": 0.33696, + "grad_norm": 1.022833564704935, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 33696 + }, + { + "epoch": 0.33697, + "grad_norm": 1.081619061855931, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33697 + }, + { + "epoch": 0.33698, + "grad_norm": 1.2224473216288065, + "learning_rate": 0.003, + "loss": 4.0834, + "step": 33698 + }, + { + "epoch": 0.33699, + "grad_norm": 1.0296145550556874, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 33699 + }, + { + "epoch": 0.337, + "grad_norm": 0.7948861975686476, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 33700 + }, + { + "epoch": 0.33701, + "grad_norm": 0.7016093421820723, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 33701 + }, + { + "epoch": 0.33702, + "grad_norm": 0.6192690036724552, + "learning_rate": 0.003, + "loss": 4.035, + "step": 33702 + }, + { + "epoch": 0.33703, + "grad_norm": 0.6283667662964441, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 33703 + }, + { + "epoch": 0.33704, + "grad_norm": 0.6601439284802328, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 33704 + }, + { + "epoch": 0.33705, + "grad_norm": 0.6387464488781871, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 33705 + }, + { + "epoch": 0.33706, + "grad_norm": 0.7184624215387317, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 33706 + }, + { + "epoch": 0.33707, + "grad_norm": 0.9146334160884503, + "learning_rate": 0.003, + "loss": 4.05, + "step": 33707 + }, + { + "epoch": 0.33708, + "grad_norm": 1.1030600560273256, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 33708 + }, + { + "epoch": 0.33709, + "grad_norm": 0.866243458237003, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 33709 + }, + { + "epoch": 0.3371, + "grad_norm": 0.7828058969550554, + "learning_rate": 0.003, + "loss": 4.044, + "step": 33710 + }, + { + "epoch": 0.33711, + "grad_norm": 0.7171777944216865, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 33711 + }, + { + "epoch": 0.33712, + "grad_norm": 0.7091630800835255, + "learning_rate": 0.003, + "loss": 4.024, + "step": 33712 + }, + { + "epoch": 0.33713, + "grad_norm": 0.7129814720576563, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 33713 + }, + { + "epoch": 0.33714, + "grad_norm": 0.7443836434509767, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33714 + }, + { + "epoch": 0.33715, + "grad_norm": 0.7978504803587637, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 33715 + }, + { + "epoch": 0.33716, + "grad_norm": 1.0511832784668238, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33716 + }, + { + "epoch": 0.33717, + "grad_norm": 1.050841625546212, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 33717 + }, + { + "epoch": 0.33718, + "grad_norm": 1.0144876774409533, + "learning_rate": 0.003, + "loss": 4.047, + "step": 33718 + }, + { + "epoch": 0.33719, + "grad_norm": 0.9539808069223594, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 33719 + }, + { + "epoch": 0.3372, + "grad_norm": 0.9191649905101167, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 33720 + }, + { + "epoch": 0.33721, + "grad_norm": 1.0479468287899436, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 33721 + }, + { + "epoch": 0.33722, + "grad_norm": 0.8579074380774641, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 33722 + }, + { + "epoch": 0.33723, + "grad_norm": 0.7049855400400556, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 33723 + }, + { + "epoch": 0.33724, + "grad_norm": 0.624082936470447, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 33724 + }, + { + "epoch": 0.33725, + "grad_norm": 0.7087209720191705, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 33725 + }, + { + "epoch": 0.33726, + "grad_norm": 0.7502743865520788, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 33726 + }, + { + "epoch": 0.33727, + "grad_norm": 0.6627149796570143, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 33727 + }, + { + "epoch": 0.33728, + "grad_norm": 0.5346839962942674, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33728 + }, + { + "epoch": 0.33729, + "grad_norm": 0.5361997695426446, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 33729 + }, + { + "epoch": 0.3373, + "grad_norm": 0.5693474006982953, + "learning_rate": 0.003, + "loss": 4.03, + "step": 33730 + }, + { + "epoch": 0.33731, + "grad_norm": 0.6642634981053736, + "learning_rate": 0.003, + "loss": 4.027, + "step": 33731 + }, + { + "epoch": 0.33732, + "grad_norm": 0.878114166881482, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 33732 + }, + { + "epoch": 0.33733, + "grad_norm": 1.3233062076376578, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 33733 + }, + { + "epoch": 0.33734, + "grad_norm": 0.6038469268795158, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 33734 + }, + { + "epoch": 0.33735, + "grad_norm": 0.696028703666814, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 33735 + }, + { + "epoch": 0.33736, + "grad_norm": 1.0810904032179156, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 33736 + }, + { + "epoch": 0.33737, + "grad_norm": 1.0671751705768573, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 33737 + }, + { + "epoch": 0.33738, + "grad_norm": 0.8449499775676197, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 33738 + }, + { + "epoch": 0.33739, + "grad_norm": 0.7887234417312652, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 33739 + }, + { + "epoch": 0.3374, + "grad_norm": 0.9723895713583691, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 33740 + }, + { + "epoch": 0.33741, + "grad_norm": 1.162317984052887, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 33741 + }, + { + "epoch": 0.33742, + "grad_norm": 1.0167555695562032, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33742 + }, + { + "epoch": 0.33743, + "grad_norm": 0.9762673040015455, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 33743 + }, + { + "epoch": 0.33744, + "grad_norm": 0.9420215015286822, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 33744 + }, + { + "epoch": 0.33745, + "grad_norm": 0.9974545233684636, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33745 + }, + { + "epoch": 0.33746, + "grad_norm": 1.047316122945228, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 33746 + }, + { + "epoch": 0.33747, + "grad_norm": 1.02788321708909, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33747 + }, + { + "epoch": 0.33748, + "grad_norm": 0.9621453639986354, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 33748 + }, + { + "epoch": 0.33749, + "grad_norm": 0.7753507389702278, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 33749 + }, + { + "epoch": 0.3375, + "grad_norm": 0.7498129684899302, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 33750 + }, + { + "epoch": 0.33751, + "grad_norm": 0.8573142913748312, + "learning_rate": 0.003, + "loss": 4.0947, + "step": 33751 + }, + { + "epoch": 0.33752, + "grad_norm": 0.9865724859907175, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 33752 + }, + { + "epoch": 0.33753, + "grad_norm": 1.0244092243291494, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 33753 + }, + { + "epoch": 0.33754, + "grad_norm": 0.8906472731390455, + "learning_rate": 0.003, + "loss": 4.069, + "step": 33754 + }, + { + "epoch": 0.33755, + "grad_norm": 0.9583044904772331, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 33755 + }, + { + "epoch": 0.33756, + "grad_norm": 1.0169105579582782, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 33756 + }, + { + "epoch": 0.33757, + "grad_norm": 1.031396706040273, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 33757 + }, + { + "epoch": 0.33758, + "grad_norm": 0.865718027429324, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 33758 + }, + { + "epoch": 0.33759, + "grad_norm": 0.7855548780303279, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 33759 + }, + { + "epoch": 0.3376, + "grad_norm": 0.8459300000118278, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 33760 + }, + { + "epoch": 0.33761, + "grad_norm": 1.0958415695589119, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 33761 + }, + { + "epoch": 0.33762, + "grad_norm": 1.020360733991124, + "learning_rate": 0.003, + "loss": 4.074, + "step": 33762 + }, + { + "epoch": 0.33763, + "grad_norm": 1.012402323547838, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 33763 + }, + { + "epoch": 0.33764, + "grad_norm": 0.864281530533913, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 33764 + }, + { + "epoch": 0.33765, + "grad_norm": 0.7253130091408841, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 33765 + }, + { + "epoch": 0.33766, + "grad_norm": 0.6422790038199969, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 33766 + }, + { + "epoch": 0.33767, + "grad_norm": 0.6274937730802178, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 33767 + }, + { + "epoch": 0.33768, + "grad_norm": 0.6524088958063811, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 33768 + }, + { + "epoch": 0.33769, + "grad_norm": 0.6233708698861566, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 33769 + }, + { + "epoch": 0.3377, + "grad_norm": 0.6706095291185132, + "learning_rate": 0.003, + "loss": 3.9987, + "step": 33770 + }, + { + "epoch": 0.33771, + "grad_norm": 0.6600965645514117, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 33771 + }, + { + "epoch": 0.33772, + "grad_norm": 0.62100640408022, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 33772 + }, + { + "epoch": 0.33773, + "grad_norm": 0.6640184891560466, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33773 + }, + { + "epoch": 0.33774, + "grad_norm": 0.6402224688503386, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 33774 + }, + { + "epoch": 0.33775, + "grad_norm": 0.6782496588249532, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 33775 + }, + { + "epoch": 0.33776, + "grad_norm": 0.9506969353423204, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 33776 + }, + { + "epoch": 0.33777, + "grad_norm": 1.392708177996448, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 33777 + }, + { + "epoch": 0.33778, + "grad_norm": 0.5795193119261093, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 33778 + }, + { + "epoch": 0.33779, + "grad_norm": 0.7761015423260159, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 33779 + }, + { + "epoch": 0.3378, + "grad_norm": 0.9514346302334404, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 33780 + }, + { + "epoch": 0.33781, + "grad_norm": 0.9336378997415741, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 33781 + }, + { + "epoch": 0.33782, + "grad_norm": 0.9094389881063851, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 33782 + }, + { + "epoch": 0.33783, + "grad_norm": 0.7847867544099536, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 33783 + }, + { + "epoch": 0.33784, + "grad_norm": 0.6865547692224602, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33784 + }, + { + "epoch": 0.33785, + "grad_norm": 0.6318388651387125, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 33785 + }, + { + "epoch": 0.33786, + "grad_norm": 0.6640374444864019, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 33786 + }, + { + "epoch": 0.33787, + "grad_norm": 0.7648249150247822, + "learning_rate": 0.003, + "loss": 4.042, + "step": 33787 + }, + { + "epoch": 0.33788, + "grad_norm": 0.766325275480788, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 33788 + }, + { + "epoch": 0.33789, + "grad_norm": 0.858532639174059, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 33789 + }, + { + "epoch": 0.3379, + "grad_norm": 0.951045205251094, + "learning_rate": 0.003, + "loss": 4.035, + "step": 33790 + }, + { + "epoch": 0.33791, + "grad_norm": 0.9298190962377589, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 33791 + }, + { + "epoch": 0.33792, + "grad_norm": 0.9441326909092653, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 33792 + }, + { + "epoch": 0.33793, + "grad_norm": 0.9721437550451898, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 33793 + }, + { + "epoch": 0.33794, + "grad_norm": 0.8942603192329401, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 33794 + }, + { + "epoch": 0.33795, + "grad_norm": 0.8612172935880857, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 33795 + }, + { + "epoch": 0.33796, + "grad_norm": 0.882955891135359, + "learning_rate": 0.003, + "loss": 4.058, + "step": 33796 + }, + { + "epoch": 0.33797, + "grad_norm": 0.8824934084216972, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 33797 + }, + { + "epoch": 0.33798, + "grad_norm": 0.9883406607215025, + "learning_rate": 0.003, + "loss": 4.0797, + "step": 33798 + }, + { + "epoch": 0.33799, + "grad_norm": 1.0566472667122047, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 33799 + }, + { + "epoch": 0.338, + "grad_norm": 0.8839962310425998, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 33800 + }, + { + "epoch": 0.33801, + "grad_norm": 0.7845811748934701, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 33801 + }, + { + "epoch": 0.33802, + "grad_norm": 0.7770584710378017, + "learning_rate": 0.003, + "loss": 4.051, + "step": 33802 + }, + { + "epoch": 0.33803, + "grad_norm": 0.7406397409570852, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 33803 + }, + { + "epoch": 0.33804, + "grad_norm": 0.7352998325867052, + "learning_rate": 0.003, + "loss": 3.9835, + "step": 33804 + }, + { + "epoch": 0.33805, + "grad_norm": 0.791367794128499, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 33805 + }, + { + "epoch": 0.33806, + "grad_norm": 0.8487970159084213, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 33806 + }, + { + "epoch": 0.33807, + "grad_norm": 0.9297580414380384, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 33807 + }, + { + "epoch": 0.33808, + "grad_norm": 0.9863059192325285, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 33808 + }, + { + "epoch": 0.33809, + "grad_norm": 1.0595019645137196, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 33809 + }, + { + "epoch": 0.3381, + "grad_norm": 0.8549935067457187, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 33810 + }, + { + "epoch": 0.33811, + "grad_norm": 0.7434150034506393, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 33811 + }, + { + "epoch": 0.33812, + "grad_norm": 0.732443026899655, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 33812 + }, + { + "epoch": 0.33813, + "grad_norm": 0.6394441098591139, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 33813 + }, + { + "epoch": 0.33814, + "grad_norm": 0.6593133801120066, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 33814 + }, + { + "epoch": 0.33815, + "grad_norm": 0.5546760161598798, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 33815 + }, + { + "epoch": 0.33816, + "grad_norm": 0.551799298160508, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 33816 + }, + { + "epoch": 0.33817, + "grad_norm": 0.6308094122877197, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 33817 + }, + { + "epoch": 0.33818, + "grad_norm": 0.6738498287449948, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 33818 + }, + { + "epoch": 0.33819, + "grad_norm": 0.7803895787172441, + "learning_rate": 0.003, + "loss": 4.053, + "step": 33819 + }, + { + "epoch": 0.3382, + "grad_norm": 0.886281732424119, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 33820 + }, + { + "epoch": 0.33821, + "grad_norm": 0.8765366089642417, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 33821 + }, + { + "epoch": 0.33822, + "grad_norm": 0.8468068345465231, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 33822 + }, + { + "epoch": 0.33823, + "grad_norm": 0.8140039293249891, + "learning_rate": 0.003, + "loss": 4.029, + "step": 33823 + }, + { + "epoch": 0.33824, + "grad_norm": 0.9490528531709089, + "learning_rate": 0.003, + "loss": 4.022, + "step": 33824 + }, + { + "epoch": 0.33825, + "grad_norm": 1.127976542735466, + "learning_rate": 0.003, + "loss": 4.033, + "step": 33825 + }, + { + "epoch": 0.33826, + "grad_norm": 0.8872414865733492, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 33826 + }, + { + "epoch": 0.33827, + "grad_norm": 0.8125460762829133, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 33827 + }, + { + "epoch": 0.33828, + "grad_norm": 0.8597610011335546, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 33828 + }, + { + "epoch": 0.33829, + "grad_norm": 0.7659670046658855, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 33829 + }, + { + "epoch": 0.3383, + "grad_norm": 0.6466119535673419, + "learning_rate": 0.003, + "loss": 4.005, + "step": 33830 + }, + { + "epoch": 0.33831, + "grad_norm": 0.668991323831408, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 33831 + }, + { + "epoch": 0.33832, + "grad_norm": 0.6399733485191847, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 33832 + }, + { + "epoch": 0.33833, + "grad_norm": 0.6870153089244928, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 33833 + }, + { + "epoch": 0.33834, + "grad_norm": 0.8011481327802343, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 33834 + }, + { + "epoch": 0.33835, + "grad_norm": 0.9422141196154757, + "learning_rate": 0.003, + "loss": 4.045, + "step": 33835 + }, + { + "epoch": 0.33836, + "grad_norm": 1.066269169322221, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 33836 + }, + { + "epoch": 0.33837, + "grad_norm": 1.0474269708866115, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 33837 + }, + { + "epoch": 0.33838, + "grad_norm": 0.8981477826465813, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 33838 + }, + { + "epoch": 0.33839, + "grad_norm": 0.716292037558805, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 33839 + }, + { + "epoch": 0.3384, + "grad_norm": 0.6982057676058101, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 33840 + }, + { + "epoch": 0.33841, + "grad_norm": 0.7708414426348612, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 33841 + }, + { + "epoch": 0.33842, + "grad_norm": 0.9415533452580352, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 33842 + }, + { + "epoch": 0.33843, + "grad_norm": 1.1446530133067467, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 33843 + }, + { + "epoch": 0.33844, + "grad_norm": 0.952355254581042, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 33844 + }, + { + "epoch": 0.33845, + "grad_norm": 1.0192463098079565, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 33845 + }, + { + "epoch": 0.33846, + "grad_norm": 1.0353314678797443, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 33846 + }, + { + "epoch": 0.33847, + "grad_norm": 1.0163923848392722, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 33847 + }, + { + "epoch": 0.33848, + "grad_norm": 0.8638210697912252, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33848 + }, + { + "epoch": 0.33849, + "grad_norm": 0.7865636205890819, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 33849 + }, + { + "epoch": 0.3385, + "grad_norm": 0.733418168445731, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 33850 + }, + { + "epoch": 0.33851, + "grad_norm": 0.7999615393289004, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 33851 + }, + { + "epoch": 0.33852, + "grad_norm": 0.9279038084876917, + "learning_rate": 0.003, + "loss": 4.017, + "step": 33852 + }, + { + "epoch": 0.33853, + "grad_norm": 0.9738942269160589, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 33853 + }, + { + "epoch": 0.33854, + "grad_norm": 1.0194273700353533, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 33854 + }, + { + "epoch": 0.33855, + "grad_norm": 0.8991948194746865, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 33855 + }, + { + "epoch": 0.33856, + "grad_norm": 0.9951685827787403, + "learning_rate": 0.003, + "loss": 4.026, + "step": 33856 + }, + { + "epoch": 0.33857, + "grad_norm": 1.0705511544722706, + "learning_rate": 0.003, + "loss": 4.04, + "step": 33857 + }, + { + "epoch": 0.33858, + "grad_norm": 0.9591458755190022, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33858 + }, + { + "epoch": 0.33859, + "grad_norm": 0.956784518143416, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 33859 + }, + { + "epoch": 0.3386, + "grad_norm": 0.9762666824630115, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 33860 + }, + { + "epoch": 0.33861, + "grad_norm": 0.927142664764423, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 33861 + }, + { + "epoch": 0.33862, + "grad_norm": 1.0025601305493832, + "learning_rate": 0.003, + "loss": 4.0904, + "step": 33862 + }, + { + "epoch": 0.33863, + "grad_norm": 0.8256886273732722, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 33863 + }, + { + "epoch": 0.33864, + "grad_norm": 0.6667967093666312, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 33864 + }, + { + "epoch": 0.33865, + "grad_norm": 0.727284170492555, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 33865 + }, + { + "epoch": 0.33866, + "grad_norm": 0.6716632324289056, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 33866 + }, + { + "epoch": 0.33867, + "grad_norm": 0.5907775819845212, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 33867 + }, + { + "epoch": 0.33868, + "grad_norm": 0.6200656805926703, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 33868 + }, + { + "epoch": 0.33869, + "grad_norm": 0.5988276375441735, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 33869 + }, + { + "epoch": 0.3387, + "grad_norm": 0.5969392478144382, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 33870 + }, + { + "epoch": 0.33871, + "grad_norm": 0.6364715901463579, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 33871 + }, + { + "epoch": 0.33872, + "grad_norm": 0.8102056790940111, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 33872 + }, + { + "epoch": 0.33873, + "grad_norm": 0.9610363744479001, + "learning_rate": 0.003, + "loss": 4.059, + "step": 33873 + }, + { + "epoch": 0.33874, + "grad_norm": 0.9779386168157106, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 33874 + }, + { + "epoch": 0.33875, + "grad_norm": 0.9898314085580503, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 33875 + }, + { + "epoch": 0.33876, + "grad_norm": 0.9704065301153844, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 33876 + }, + { + "epoch": 0.33877, + "grad_norm": 0.7093498998371275, + "learning_rate": 0.003, + "loss": 4.024, + "step": 33877 + }, + { + "epoch": 0.33878, + "grad_norm": 0.6650131618031551, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 33878 + }, + { + "epoch": 0.33879, + "grad_norm": 0.7183072582243293, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 33879 + }, + { + "epoch": 0.3388, + "grad_norm": 0.6754542598339904, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 33880 + }, + { + "epoch": 0.33881, + "grad_norm": 0.6847795594648908, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 33881 + }, + { + "epoch": 0.33882, + "grad_norm": 0.7400395087204644, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33882 + }, + { + "epoch": 0.33883, + "grad_norm": 0.8589590828997828, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 33883 + }, + { + "epoch": 0.33884, + "grad_norm": 0.9289306099553821, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33884 + }, + { + "epoch": 0.33885, + "grad_norm": 0.800640792990739, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 33885 + }, + { + "epoch": 0.33886, + "grad_norm": 0.834523650076088, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 33886 + }, + { + "epoch": 0.33887, + "grad_norm": 0.9679795628922451, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 33887 + }, + { + "epoch": 0.33888, + "grad_norm": 0.9319319086745497, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 33888 + }, + { + "epoch": 0.33889, + "grad_norm": 0.8257557339882334, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 33889 + }, + { + "epoch": 0.3389, + "grad_norm": 0.9070968517682227, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 33890 + }, + { + "epoch": 0.33891, + "grad_norm": 0.9442435272323383, + "learning_rate": 0.003, + "loss": 4.059, + "step": 33891 + }, + { + "epoch": 0.33892, + "grad_norm": 0.9284791352487384, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 33892 + }, + { + "epoch": 0.33893, + "grad_norm": 0.9888968675124461, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 33893 + }, + { + "epoch": 0.33894, + "grad_norm": 0.9292441513275207, + "learning_rate": 0.003, + "loss": 4.056, + "step": 33894 + }, + { + "epoch": 0.33895, + "grad_norm": 0.8184736492840081, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 33895 + }, + { + "epoch": 0.33896, + "grad_norm": 0.6972622789561078, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 33896 + }, + { + "epoch": 0.33897, + "grad_norm": 0.7391064175844099, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 33897 + }, + { + "epoch": 0.33898, + "grad_norm": 0.7000797647075282, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 33898 + }, + { + "epoch": 0.33899, + "grad_norm": 0.6806905625410583, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 33899 + }, + { + "epoch": 0.339, + "grad_norm": 0.8690201398200897, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 33900 + }, + { + "epoch": 0.33901, + "grad_norm": 1.0287014063106046, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 33901 + }, + { + "epoch": 0.33902, + "grad_norm": 1.0950433315026802, + "learning_rate": 0.003, + "loss": 4.035, + "step": 33902 + }, + { + "epoch": 0.33903, + "grad_norm": 1.0291223986569278, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 33903 + }, + { + "epoch": 0.33904, + "grad_norm": 1.0881742538989359, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 33904 + }, + { + "epoch": 0.33905, + "grad_norm": 0.9089145461989396, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 33905 + }, + { + "epoch": 0.33906, + "grad_norm": 0.907672643636694, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 33906 + }, + { + "epoch": 0.33907, + "grad_norm": 0.9193154513659452, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 33907 + }, + { + "epoch": 0.33908, + "grad_norm": 0.8434930238866593, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 33908 + }, + { + "epoch": 0.33909, + "grad_norm": 0.7817617634477758, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 33909 + }, + { + "epoch": 0.3391, + "grad_norm": 0.8184585773660709, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 33910 + }, + { + "epoch": 0.33911, + "grad_norm": 0.9509032226951651, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 33911 + }, + { + "epoch": 0.33912, + "grad_norm": 1.0450757195679297, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 33912 + }, + { + "epoch": 0.33913, + "grad_norm": 0.9549056090915814, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 33913 + }, + { + "epoch": 0.33914, + "grad_norm": 1.0151572512788025, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33914 + }, + { + "epoch": 0.33915, + "grad_norm": 0.9899429975718765, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 33915 + }, + { + "epoch": 0.33916, + "grad_norm": 1.0165365246587603, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 33916 + }, + { + "epoch": 0.33917, + "grad_norm": 0.8557490629076506, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 33917 + }, + { + "epoch": 0.33918, + "grad_norm": 0.7005441171152348, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 33918 + }, + { + "epoch": 0.33919, + "grad_norm": 0.6688066978284886, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 33919 + }, + { + "epoch": 0.3392, + "grad_norm": 0.579532699428292, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 33920 + }, + { + "epoch": 0.33921, + "grad_norm": 0.608005173501639, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 33921 + }, + { + "epoch": 0.33922, + "grad_norm": 0.6343890050967571, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 33922 + }, + { + "epoch": 0.33923, + "grad_norm": 0.6242461943212776, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 33923 + }, + { + "epoch": 0.33924, + "grad_norm": 0.5527384875407005, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 33924 + }, + { + "epoch": 0.33925, + "grad_norm": 0.4839709214601328, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 33925 + }, + { + "epoch": 0.33926, + "grad_norm": 0.6095916516369055, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 33926 + }, + { + "epoch": 0.33927, + "grad_norm": 0.6524598232060316, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 33927 + }, + { + "epoch": 0.33928, + "grad_norm": 0.6409222711416348, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 33928 + }, + { + "epoch": 0.33929, + "grad_norm": 0.6054119299147679, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 33929 + }, + { + "epoch": 0.3393, + "grad_norm": 0.6718707521160994, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 33930 + }, + { + "epoch": 0.33931, + "grad_norm": 0.8492154360351177, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 33931 + }, + { + "epoch": 0.33932, + "grad_norm": 1.1363820900629902, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 33932 + }, + { + "epoch": 0.33933, + "grad_norm": 1.18482528867355, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 33933 + }, + { + "epoch": 0.33934, + "grad_norm": 0.7876477308776886, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 33934 + }, + { + "epoch": 0.33935, + "grad_norm": 0.6987729511030895, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 33935 + }, + { + "epoch": 0.33936, + "grad_norm": 0.6446699575264577, + "learning_rate": 0.003, + "loss": 4.017, + "step": 33936 + }, + { + "epoch": 0.33937, + "grad_norm": 0.7463389124676082, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 33937 + }, + { + "epoch": 0.33938, + "grad_norm": 0.8568218017450804, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 33938 + }, + { + "epoch": 0.33939, + "grad_norm": 0.8028665628090043, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 33939 + }, + { + "epoch": 0.3394, + "grad_norm": 0.8306171453670893, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 33940 + }, + { + "epoch": 0.33941, + "grad_norm": 0.9429347869024594, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 33941 + }, + { + "epoch": 0.33942, + "grad_norm": 1.003440560699561, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 33942 + }, + { + "epoch": 0.33943, + "grad_norm": 1.018685539938234, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 33943 + }, + { + "epoch": 0.33944, + "grad_norm": 0.999758026657778, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 33944 + }, + { + "epoch": 0.33945, + "grad_norm": 0.9001894819472159, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 33945 + }, + { + "epoch": 0.33946, + "grad_norm": 0.7491383780742519, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 33946 + }, + { + "epoch": 0.33947, + "grad_norm": 0.794097073614366, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 33947 + }, + { + "epoch": 0.33948, + "grad_norm": 0.8145666147577381, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 33948 + }, + { + "epoch": 0.33949, + "grad_norm": 0.8418176852530579, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 33949 + }, + { + "epoch": 0.3395, + "grad_norm": 0.8031660879662917, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 33950 + }, + { + "epoch": 0.33951, + "grad_norm": 0.7998934076220618, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 33951 + }, + { + "epoch": 0.33952, + "grad_norm": 0.9087819939081989, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 33952 + }, + { + "epoch": 0.33953, + "grad_norm": 0.9322337828327047, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 33953 + }, + { + "epoch": 0.33954, + "grad_norm": 0.8859636258308221, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 33954 + }, + { + "epoch": 0.33955, + "grad_norm": 0.778486998371977, + "learning_rate": 0.003, + "loss": 4.065, + "step": 33955 + }, + { + "epoch": 0.33956, + "grad_norm": 0.8016595452512968, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 33956 + }, + { + "epoch": 0.33957, + "grad_norm": 0.9232747762393058, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 33957 + }, + { + "epoch": 0.33958, + "grad_norm": 1.0837467413276234, + "learning_rate": 0.003, + "loss": 4.045, + "step": 33958 + }, + { + "epoch": 0.33959, + "grad_norm": 0.8708310193033623, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 33959 + }, + { + "epoch": 0.3396, + "grad_norm": 0.7797993622130089, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 33960 + }, + { + "epoch": 0.33961, + "grad_norm": 0.6730679547674745, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 33961 + }, + { + "epoch": 0.33962, + "grad_norm": 0.7344531463258496, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 33962 + }, + { + "epoch": 0.33963, + "grad_norm": 0.805683984962356, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 33963 + }, + { + "epoch": 0.33964, + "grad_norm": 0.8573894402592114, + "learning_rate": 0.003, + "loss": 3.9925, + "step": 33964 + }, + { + "epoch": 0.33965, + "grad_norm": 0.8397193555461177, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 33965 + }, + { + "epoch": 0.33966, + "grad_norm": 0.8354234908248966, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 33966 + }, + { + "epoch": 0.33967, + "grad_norm": 0.832628518718302, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 33967 + }, + { + "epoch": 0.33968, + "grad_norm": 0.8827716141383298, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 33968 + }, + { + "epoch": 0.33969, + "grad_norm": 1.0843105209490447, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 33969 + }, + { + "epoch": 0.3397, + "grad_norm": 1.118876305433769, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 33970 + }, + { + "epoch": 0.33971, + "grad_norm": 0.8240167196639178, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 33971 + }, + { + "epoch": 0.33972, + "grad_norm": 0.7091845877462508, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 33972 + }, + { + "epoch": 0.33973, + "grad_norm": 0.6037823851005784, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 33973 + }, + { + "epoch": 0.33974, + "grad_norm": 0.6305169759226481, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 33974 + }, + { + "epoch": 0.33975, + "grad_norm": 0.7030336458663832, + "learning_rate": 0.003, + "loss": 3.9923, + "step": 33975 + }, + { + "epoch": 0.33976, + "grad_norm": 0.7334082368041822, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 33976 + }, + { + "epoch": 0.33977, + "grad_norm": 0.821646163544227, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 33977 + }, + { + "epoch": 0.33978, + "grad_norm": 0.8156406817289322, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 33978 + }, + { + "epoch": 0.33979, + "grad_norm": 0.766630193106133, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 33979 + }, + { + "epoch": 0.3398, + "grad_norm": 0.7663197666229395, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 33980 + }, + { + "epoch": 0.33981, + "grad_norm": 0.8472479667847751, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 33981 + }, + { + "epoch": 0.33982, + "grad_norm": 0.9859562433248386, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 33982 + }, + { + "epoch": 0.33983, + "grad_norm": 1.1190536882855842, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 33983 + }, + { + "epoch": 0.33984, + "grad_norm": 0.908836173729518, + "learning_rate": 0.003, + "loss": 4.031, + "step": 33984 + }, + { + "epoch": 0.33985, + "grad_norm": 0.94922326366902, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 33985 + }, + { + "epoch": 0.33986, + "grad_norm": 0.8023988802160777, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 33986 + }, + { + "epoch": 0.33987, + "grad_norm": 0.8540913040229845, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 33987 + }, + { + "epoch": 0.33988, + "grad_norm": 0.7839512907011877, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 33988 + }, + { + "epoch": 0.33989, + "grad_norm": 0.7892693763947001, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 33989 + }, + { + "epoch": 0.3399, + "grad_norm": 0.9671844311715436, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 33990 + }, + { + "epoch": 0.33991, + "grad_norm": 1.2058800017127689, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 33991 + }, + { + "epoch": 0.33992, + "grad_norm": 1.0469266471284238, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 33992 + }, + { + "epoch": 0.33993, + "grad_norm": 0.9374829411302804, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 33993 + }, + { + "epoch": 0.33994, + "grad_norm": 0.7611668201801775, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 33994 + }, + { + "epoch": 0.33995, + "grad_norm": 0.7201002905573799, + "learning_rate": 0.003, + "loss": 4.027, + "step": 33995 + }, + { + "epoch": 0.33996, + "grad_norm": 0.7542273113510776, + "learning_rate": 0.003, + "loss": 4.04, + "step": 33996 + }, + { + "epoch": 0.33997, + "grad_norm": 0.8414992382540966, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 33997 + }, + { + "epoch": 0.33998, + "grad_norm": 1.0506033711003715, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 33998 + }, + { + "epoch": 0.33999, + "grad_norm": 1.0494184252834238, + "learning_rate": 0.003, + "loss": 4.062, + "step": 33999 + }, + { + "epoch": 0.34, + "grad_norm": 0.808229401525309, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 34000 + }, + { + "epoch": 0.34001, + "grad_norm": 0.6813712460308294, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 34001 + }, + { + "epoch": 0.34002, + "grad_norm": 0.7061821576133944, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 34002 + }, + { + "epoch": 0.34003, + "grad_norm": 0.7761118187110538, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 34003 + }, + { + "epoch": 0.34004, + "grad_norm": 0.8928893587257872, + "learning_rate": 0.003, + "loss": 4.061, + "step": 34004 + }, + { + "epoch": 0.34005, + "grad_norm": 1.0744947409562577, + "learning_rate": 0.003, + "loss": 4.038, + "step": 34005 + }, + { + "epoch": 0.34006, + "grad_norm": 0.768060507798, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 34006 + }, + { + "epoch": 0.34007, + "grad_norm": 0.5726100003169947, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 34007 + }, + { + "epoch": 0.34008, + "grad_norm": 0.7205296492804241, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 34008 + }, + { + "epoch": 0.34009, + "grad_norm": 0.8126641289915525, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 34009 + }, + { + "epoch": 0.3401, + "grad_norm": 0.8405270132090459, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 34010 + }, + { + "epoch": 0.34011, + "grad_norm": 0.8016598353678006, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 34011 + }, + { + "epoch": 0.34012, + "grad_norm": 0.7236283488909129, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 34012 + }, + { + "epoch": 0.34013, + "grad_norm": 0.6918793412724296, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34013 + }, + { + "epoch": 0.34014, + "grad_norm": 0.6888009211995693, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 34014 + }, + { + "epoch": 0.34015, + "grad_norm": 0.6414926492987785, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 34015 + }, + { + "epoch": 0.34016, + "grad_norm": 0.6788533880304966, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34016 + }, + { + "epoch": 0.34017, + "grad_norm": 0.8881637199886941, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 34017 + }, + { + "epoch": 0.34018, + "grad_norm": 1.1383292408221541, + "learning_rate": 0.003, + "loss": 4.029, + "step": 34018 + }, + { + "epoch": 0.34019, + "grad_norm": 0.862798995041277, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 34019 + }, + { + "epoch": 0.3402, + "grad_norm": 0.8807963523114474, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 34020 + }, + { + "epoch": 0.34021, + "grad_norm": 0.8937778301909161, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 34021 + }, + { + "epoch": 0.34022, + "grad_norm": 0.9040918660881367, + "learning_rate": 0.003, + "loss": 4.0814, + "step": 34022 + }, + { + "epoch": 0.34023, + "grad_norm": 0.8931219188168983, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 34023 + }, + { + "epoch": 0.34024, + "grad_norm": 0.8368951453772987, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 34024 + }, + { + "epoch": 0.34025, + "grad_norm": 0.820385441185719, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34025 + }, + { + "epoch": 0.34026, + "grad_norm": 0.9104867951152543, + "learning_rate": 0.003, + "loss": 4.008, + "step": 34026 + }, + { + "epoch": 0.34027, + "grad_norm": 1.0213425725725176, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34027 + }, + { + "epoch": 0.34028, + "grad_norm": 1.0102829346173594, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 34028 + }, + { + "epoch": 0.34029, + "grad_norm": 0.7919918083080788, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 34029 + }, + { + "epoch": 0.3403, + "grad_norm": 0.6635644161056327, + "learning_rate": 0.003, + "loss": 4.062, + "step": 34030 + }, + { + "epoch": 0.34031, + "grad_norm": 0.7348682684672848, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 34031 + }, + { + "epoch": 0.34032, + "grad_norm": 0.8345681208634823, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 34032 + }, + { + "epoch": 0.34033, + "grad_norm": 1.0080460432351017, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 34033 + }, + { + "epoch": 0.34034, + "grad_norm": 1.0546084720353353, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34034 + }, + { + "epoch": 0.34035, + "grad_norm": 0.9627829127313069, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 34035 + }, + { + "epoch": 0.34036, + "grad_norm": 0.9231483428389341, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 34036 + }, + { + "epoch": 0.34037, + "grad_norm": 0.9560914068311439, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 34037 + }, + { + "epoch": 0.34038, + "grad_norm": 0.8899091765631013, + "learning_rate": 0.003, + "loss": 4.028, + "step": 34038 + }, + { + "epoch": 0.34039, + "grad_norm": 0.7990126485273582, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 34039 + }, + { + "epoch": 0.3404, + "grad_norm": 0.8140745802549377, + "learning_rate": 0.003, + "loss": 4.034, + "step": 34040 + }, + { + "epoch": 0.34041, + "grad_norm": 0.8571463368294632, + "learning_rate": 0.003, + "loss": 3.9717, + "step": 34041 + }, + { + "epoch": 0.34042, + "grad_norm": 0.9846784498839475, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 34042 + }, + { + "epoch": 0.34043, + "grad_norm": 1.0988393678946655, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 34043 + }, + { + "epoch": 0.34044, + "grad_norm": 1.141264938068107, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 34044 + }, + { + "epoch": 0.34045, + "grad_norm": 0.9291993666282642, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34045 + }, + { + "epoch": 0.34046, + "grad_norm": 0.8216586907874481, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 34046 + }, + { + "epoch": 0.34047, + "grad_norm": 0.8771715725689065, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 34047 + }, + { + "epoch": 0.34048, + "grad_norm": 0.9754138697928109, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 34048 + }, + { + "epoch": 0.34049, + "grad_norm": 1.0165560969133782, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 34049 + }, + { + "epoch": 0.3405, + "grad_norm": 0.9512061188390406, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 34050 + }, + { + "epoch": 0.34051, + "grad_norm": 0.9573857064088921, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34051 + }, + { + "epoch": 0.34052, + "grad_norm": 1.0567655719933524, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 34052 + }, + { + "epoch": 0.34053, + "grad_norm": 0.9456061501177918, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 34053 + }, + { + "epoch": 0.34054, + "grad_norm": 0.891822733391886, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 34054 + }, + { + "epoch": 0.34055, + "grad_norm": 1.066353289188147, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 34055 + }, + { + "epoch": 0.34056, + "grad_norm": 0.9921176709983363, + "learning_rate": 0.003, + "loss": 4.064, + "step": 34056 + }, + { + "epoch": 0.34057, + "grad_norm": 0.8443572215065228, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 34057 + }, + { + "epoch": 0.34058, + "grad_norm": 0.6659394578129543, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 34058 + }, + { + "epoch": 0.34059, + "grad_norm": 0.642632778360411, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 34059 + }, + { + "epoch": 0.3406, + "grad_norm": 0.6846821669390213, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 34060 + }, + { + "epoch": 0.34061, + "grad_norm": 0.6157083788952982, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 34061 + }, + { + "epoch": 0.34062, + "grad_norm": 0.5995327449973084, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 34062 + }, + { + "epoch": 0.34063, + "grad_norm": 0.6390337631402602, + "learning_rate": 0.003, + "loss": 4.051, + "step": 34063 + }, + { + "epoch": 0.34064, + "grad_norm": 0.6862217919974073, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 34064 + }, + { + "epoch": 0.34065, + "grad_norm": 0.7079321973795291, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34065 + }, + { + "epoch": 0.34066, + "grad_norm": 0.7422396070746735, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 34066 + }, + { + "epoch": 0.34067, + "grad_norm": 0.7694410431965949, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 34067 + }, + { + "epoch": 0.34068, + "grad_norm": 0.7298912418147402, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 34068 + }, + { + "epoch": 0.34069, + "grad_norm": 0.696396097562659, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 34069 + }, + { + "epoch": 0.3407, + "grad_norm": 0.8039333358037213, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 34070 + }, + { + "epoch": 0.34071, + "grad_norm": 0.9188471858299033, + "learning_rate": 0.003, + "loss": 4.043, + "step": 34071 + }, + { + "epoch": 0.34072, + "grad_norm": 0.9335020564533633, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 34072 + }, + { + "epoch": 0.34073, + "grad_norm": 0.7619691598775563, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34073 + }, + { + "epoch": 0.34074, + "grad_norm": 0.5664050427702848, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 34074 + }, + { + "epoch": 0.34075, + "grad_norm": 0.5991461087743574, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34075 + }, + { + "epoch": 0.34076, + "grad_norm": 0.7252386777453789, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 34076 + }, + { + "epoch": 0.34077, + "grad_norm": 0.7563183933703501, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 34077 + }, + { + "epoch": 0.34078, + "grad_norm": 0.8165078555796453, + "learning_rate": 0.003, + "loss": 4.034, + "step": 34078 + }, + { + "epoch": 0.34079, + "grad_norm": 0.8131740459003255, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 34079 + }, + { + "epoch": 0.3408, + "grad_norm": 0.7191276769288464, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 34080 + }, + { + "epoch": 0.34081, + "grad_norm": 0.6501357944924261, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 34081 + }, + { + "epoch": 0.34082, + "grad_norm": 0.5499518574261527, + "learning_rate": 0.003, + "loss": 4.032, + "step": 34082 + }, + { + "epoch": 0.34083, + "grad_norm": 0.5727021702499225, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 34083 + }, + { + "epoch": 0.34084, + "grad_norm": 0.6434500357256907, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34084 + }, + { + "epoch": 0.34085, + "grad_norm": 0.8472931670590771, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 34085 + }, + { + "epoch": 0.34086, + "grad_norm": 1.1837189357808138, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 34086 + }, + { + "epoch": 0.34087, + "grad_norm": 0.9735768700501405, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 34087 + }, + { + "epoch": 0.34088, + "grad_norm": 0.960727559172683, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 34088 + }, + { + "epoch": 0.34089, + "grad_norm": 1.0232946868386503, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 34089 + }, + { + "epoch": 0.3409, + "grad_norm": 0.8598195578259243, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 34090 + }, + { + "epoch": 0.34091, + "grad_norm": 0.8561350460659448, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 34091 + }, + { + "epoch": 0.34092, + "grad_norm": 0.9100934333074193, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34092 + }, + { + "epoch": 0.34093, + "grad_norm": 0.8624772276279844, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 34093 + }, + { + "epoch": 0.34094, + "grad_norm": 0.8473210067197896, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 34094 + }, + { + "epoch": 0.34095, + "grad_norm": 0.8804047951294056, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 34095 + }, + { + "epoch": 0.34096, + "grad_norm": 0.9525547616152052, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 34096 + }, + { + "epoch": 0.34097, + "grad_norm": 0.9265206179146316, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 34097 + }, + { + "epoch": 0.34098, + "grad_norm": 0.8123424376342245, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 34098 + }, + { + "epoch": 0.34099, + "grad_norm": 0.7773731154550838, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 34099 + }, + { + "epoch": 0.341, + "grad_norm": 0.7897351015906173, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 34100 + }, + { + "epoch": 0.34101, + "grad_norm": 0.7565509952428561, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 34101 + }, + { + "epoch": 0.34102, + "grad_norm": 0.8756526847494094, + "learning_rate": 0.003, + "loss": 4.052, + "step": 34102 + }, + { + "epoch": 0.34103, + "grad_norm": 1.1298907259418725, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34103 + }, + { + "epoch": 0.34104, + "grad_norm": 0.8588107269951857, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 34104 + }, + { + "epoch": 0.34105, + "grad_norm": 0.7267037468687892, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 34105 + }, + { + "epoch": 0.34106, + "grad_norm": 0.6482819078968429, + "learning_rate": 0.003, + "loss": 4.024, + "step": 34106 + }, + { + "epoch": 0.34107, + "grad_norm": 0.5770128843700393, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 34107 + }, + { + "epoch": 0.34108, + "grad_norm": 0.5558906508416201, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34108 + }, + { + "epoch": 0.34109, + "grad_norm": 0.5618263887178572, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 34109 + }, + { + "epoch": 0.3411, + "grad_norm": 0.6417931560600224, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34110 + }, + { + "epoch": 0.34111, + "grad_norm": 0.6653043732172801, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 34111 + }, + { + "epoch": 0.34112, + "grad_norm": 0.8177576843996563, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 34112 + }, + { + "epoch": 0.34113, + "grad_norm": 1.009209524088005, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 34113 + }, + { + "epoch": 0.34114, + "grad_norm": 1.2105239173830795, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 34114 + }, + { + "epoch": 0.34115, + "grad_norm": 0.8670529695453328, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 34115 + }, + { + "epoch": 0.34116, + "grad_norm": 0.8026099400566107, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 34116 + }, + { + "epoch": 0.34117, + "grad_norm": 0.7474280072851388, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34117 + }, + { + "epoch": 0.34118, + "grad_norm": 0.7166166608726242, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 34118 + }, + { + "epoch": 0.34119, + "grad_norm": 0.6939769902389634, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 34119 + }, + { + "epoch": 0.3412, + "grad_norm": 0.7100201665571866, + "learning_rate": 0.003, + "loss": 3.9947, + "step": 34120 + }, + { + "epoch": 0.34121, + "grad_norm": 0.7510456913023875, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 34121 + }, + { + "epoch": 0.34122, + "grad_norm": 0.826391508852153, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 34122 + }, + { + "epoch": 0.34123, + "grad_norm": 0.8238184220538075, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34123 + }, + { + "epoch": 0.34124, + "grad_norm": 0.7036154175466772, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 34124 + }, + { + "epoch": 0.34125, + "grad_norm": 0.70798884496288, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 34125 + }, + { + "epoch": 0.34126, + "grad_norm": 0.7434099770722953, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 34126 + }, + { + "epoch": 0.34127, + "grad_norm": 0.8771941891752033, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 34127 + }, + { + "epoch": 0.34128, + "grad_norm": 1.1911679237448496, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34128 + }, + { + "epoch": 0.34129, + "grad_norm": 0.8597111924644183, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34129 + }, + { + "epoch": 0.3413, + "grad_norm": 0.8009334956746131, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 34130 + }, + { + "epoch": 0.34131, + "grad_norm": 0.8834728711698868, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 34131 + }, + { + "epoch": 0.34132, + "grad_norm": 0.875896101013776, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 34132 + }, + { + "epoch": 0.34133, + "grad_norm": 0.8493992695546244, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 34133 + }, + { + "epoch": 0.34134, + "grad_norm": 0.9216079654849639, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34134 + }, + { + "epoch": 0.34135, + "grad_norm": 0.9320815039231054, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 34135 + }, + { + "epoch": 0.34136, + "grad_norm": 1.0271660256908617, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 34136 + }, + { + "epoch": 0.34137, + "grad_norm": 1.0819310354396876, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34137 + }, + { + "epoch": 0.34138, + "grad_norm": 0.8406774328826657, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 34138 + }, + { + "epoch": 0.34139, + "grad_norm": 0.930939794708705, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 34139 + }, + { + "epoch": 0.3414, + "grad_norm": 0.9348035174712852, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34140 + }, + { + "epoch": 0.34141, + "grad_norm": 1.0572559013801959, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 34141 + }, + { + "epoch": 0.34142, + "grad_norm": 0.8560503341282029, + "learning_rate": 0.003, + "loss": 4.033, + "step": 34142 + }, + { + "epoch": 0.34143, + "grad_norm": 0.7722441740829369, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 34143 + }, + { + "epoch": 0.34144, + "grad_norm": 0.8219747528847562, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 34144 + }, + { + "epoch": 0.34145, + "grad_norm": 0.8958487417277264, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 34145 + }, + { + "epoch": 0.34146, + "grad_norm": 1.0690156125225527, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 34146 + }, + { + "epoch": 0.34147, + "grad_norm": 1.3689574985764732, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 34147 + }, + { + "epoch": 0.34148, + "grad_norm": 0.5994807302880562, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 34148 + }, + { + "epoch": 0.34149, + "grad_norm": 0.7049412081228258, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 34149 + }, + { + "epoch": 0.3415, + "grad_norm": 0.8585650402085708, + "learning_rate": 0.003, + "loss": 4.0815, + "step": 34150 + }, + { + "epoch": 0.34151, + "grad_norm": 1.0825769961444125, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 34151 + }, + { + "epoch": 0.34152, + "grad_norm": 0.9241934280552466, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 34152 + }, + { + "epoch": 0.34153, + "grad_norm": 0.8366249375605892, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34153 + }, + { + "epoch": 0.34154, + "grad_norm": 0.7013071183313523, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 34154 + }, + { + "epoch": 0.34155, + "grad_norm": 0.6041906261293323, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 34155 + }, + { + "epoch": 0.34156, + "grad_norm": 0.7402391278605512, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 34156 + }, + { + "epoch": 0.34157, + "grad_norm": 0.7990377051906979, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 34157 + }, + { + "epoch": 0.34158, + "grad_norm": 0.9276508140199522, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 34158 + }, + { + "epoch": 0.34159, + "grad_norm": 1.032593761516333, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 34159 + }, + { + "epoch": 0.3416, + "grad_norm": 0.8636707114485934, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34160 + }, + { + "epoch": 0.34161, + "grad_norm": 0.788374302409921, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 34161 + }, + { + "epoch": 0.34162, + "grad_norm": 0.7413028408943177, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 34162 + }, + { + "epoch": 0.34163, + "grad_norm": 0.6995780315432757, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 34163 + }, + { + "epoch": 0.34164, + "grad_norm": 0.6679726440798593, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 34164 + }, + { + "epoch": 0.34165, + "grad_norm": 0.749514724162423, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 34165 + }, + { + "epoch": 0.34166, + "grad_norm": 1.0415305841149887, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 34166 + }, + { + "epoch": 0.34167, + "grad_norm": 1.2616302331557625, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 34167 + }, + { + "epoch": 0.34168, + "grad_norm": 0.8446072718374869, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 34168 + }, + { + "epoch": 0.34169, + "grad_norm": 0.7913534884000306, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 34169 + }, + { + "epoch": 0.3417, + "grad_norm": 0.7455098656572278, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 34170 + }, + { + "epoch": 0.34171, + "grad_norm": 0.7576731268537117, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 34171 + }, + { + "epoch": 0.34172, + "grad_norm": 0.8154926551929459, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 34172 + }, + { + "epoch": 0.34173, + "grad_norm": 0.9365266264737857, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 34173 + }, + { + "epoch": 0.34174, + "grad_norm": 0.9724254325166384, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 34174 + }, + { + "epoch": 0.34175, + "grad_norm": 0.9108502400625214, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34175 + }, + { + "epoch": 0.34176, + "grad_norm": 0.9737213543426644, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 34176 + }, + { + "epoch": 0.34177, + "grad_norm": 1.1966532269912842, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 34177 + }, + { + "epoch": 0.34178, + "grad_norm": 0.888633356587651, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34178 + }, + { + "epoch": 0.34179, + "grad_norm": 0.8346376101778342, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 34179 + }, + { + "epoch": 0.3418, + "grad_norm": 0.8115699936073141, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 34180 + }, + { + "epoch": 0.34181, + "grad_norm": 0.8815818334363897, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 34181 + }, + { + "epoch": 0.34182, + "grad_norm": 0.9259141402651386, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 34182 + }, + { + "epoch": 0.34183, + "grad_norm": 0.8537826835310705, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 34183 + }, + { + "epoch": 0.34184, + "grad_norm": 0.9241610101979926, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 34184 + }, + { + "epoch": 0.34185, + "grad_norm": 0.9901197968333971, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 34185 + }, + { + "epoch": 0.34186, + "grad_norm": 0.9654579870254195, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 34186 + }, + { + "epoch": 0.34187, + "grad_norm": 0.8881702859059837, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 34187 + }, + { + "epoch": 0.34188, + "grad_norm": 0.7987541930638775, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 34188 + }, + { + "epoch": 0.34189, + "grad_norm": 0.7104788724639892, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 34189 + }, + { + "epoch": 0.3419, + "grad_norm": 0.772970697093058, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34190 + }, + { + "epoch": 0.34191, + "grad_norm": 0.8264135686570176, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 34191 + }, + { + "epoch": 0.34192, + "grad_norm": 0.7285398936370029, + "learning_rate": 0.003, + "loss": 4.039, + "step": 34192 + }, + { + "epoch": 0.34193, + "grad_norm": 0.7653744786091489, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 34193 + }, + { + "epoch": 0.34194, + "grad_norm": 0.7511782282319743, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34194 + }, + { + "epoch": 0.34195, + "grad_norm": 0.830378425248287, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 34195 + }, + { + "epoch": 0.34196, + "grad_norm": 0.9757630249615244, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34196 + }, + { + "epoch": 0.34197, + "grad_norm": 1.0666845351141039, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 34197 + }, + { + "epoch": 0.34198, + "grad_norm": 0.8178727455250759, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 34198 + }, + { + "epoch": 0.34199, + "grad_norm": 0.5681718769843592, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 34199 + }, + { + "epoch": 0.342, + "grad_norm": 0.6054103724911871, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 34200 + }, + { + "epoch": 0.34201, + "grad_norm": 0.583388878685876, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 34201 + }, + { + "epoch": 0.34202, + "grad_norm": 0.6253589025203606, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34202 + }, + { + "epoch": 0.34203, + "grad_norm": 0.6493058510088409, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 34203 + }, + { + "epoch": 0.34204, + "grad_norm": 0.63778945452355, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 34204 + }, + { + "epoch": 0.34205, + "grad_norm": 0.769849271470049, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 34205 + }, + { + "epoch": 0.34206, + "grad_norm": 0.9723975282050301, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 34206 + }, + { + "epoch": 0.34207, + "grad_norm": 1.082945186616203, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 34207 + }, + { + "epoch": 0.34208, + "grad_norm": 0.98318130861273, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 34208 + }, + { + "epoch": 0.34209, + "grad_norm": 1.2282167641912864, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 34209 + }, + { + "epoch": 0.3421, + "grad_norm": 0.8958423677019361, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 34210 + }, + { + "epoch": 0.34211, + "grad_norm": 0.8478548503732254, + "learning_rate": 0.003, + "loss": 4.063, + "step": 34211 + }, + { + "epoch": 0.34212, + "grad_norm": 0.6949704387137674, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 34212 + }, + { + "epoch": 0.34213, + "grad_norm": 0.7417330302214048, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 34213 + }, + { + "epoch": 0.34214, + "grad_norm": 0.7420227201950665, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 34214 + }, + { + "epoch": 0.34215, + "grad_norm": 0.6378521754983429, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34215 + }, + { + "epoch": 0.34216, + "grad_norm": 0.689544028938562, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 34216 + }, + { + "epoch": 0.34217, + "grad_norm": 0.7086479157903997, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 34217 + }, + { + "epoch": 0.34218, + "grad_norm": 0.7673848374100676, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 34218 + }, + { + "epoch": 0.34219, + "grad_norm": 0.816789259412931, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 34219 + }, + { + "epoch": 0.3422, + "grad_norm": 0.8877423433284022, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34220 + }, + { + "epoch": 0.34221, + "grad_norm": 1.0107137831482358, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 34221 + }, + { + "epoch": 0.34222, + "grad_norm": 1.0337129380165264, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 34222 + }, + { + "epoch": 0.34223, + "grad_norm": 0.8624236572021852, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 34223 + }, + { + "epoch": 0.34224, + "grad_norm": 0.6574268306301482, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 34224 + }, + { + "epoch": 0.34225, + "grad_norm": 0.5699253974519912, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34225 + }, + { + "epoch": 0.34226, + "grad_norm": 0.5742641829064818, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 34226 + }, + { + "epoch": 0.34227, + "grad_norm": 0.6445195732943844, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34227 + }, + { + "epoch": 0.34228, + "grad_norm": 0.7741742423700957, + "learning_rate": 0.003, + "loss": 4.017, + "step": 34228 + }, + { + "epoch": 0.34229, + "grad_norm": 1.0830562373713817, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 34229 + }, + { + "epoch": 0.3423, + "grad_norm": 1.1508411160115481, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 34230 + }, + { + "epoch": 0.34231, + "grad_norm": 0.7479905326894947, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 34231 + }, + { + "epoch": 0.34232, + "grad_norm": 0.7006662892860149, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 34232 + }, + { + "epoch": 0.34233, + "grad_norm": 0.7284964864633967, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 34233 + }, + { + "epoch": 0.34234, + "grad_norm": 0.8211042755979867, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 34234 + }, + { + "epoch": 0.34235, + "grad_norm": 0.8996660822931208, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 34235 + }, + { + "epoch": 0.34236, + "grad_norm": 0.9307934822601484, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 34236 + }, + { + "epoch": 0.34237, + "grad_norm": 0.9908394542096045, + "learning_rate": 0.003, + "loss": 4.025, + "step": 34237 + }, + { + "epoch": 0.34238, + "grad_norm": 0.8665372974822444, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 34238 + }, + { + "epoch": 0.34239, + "grad_norm": 0.8144994677925155, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 34239 + }, + { + "epoch": 0.3424, + "grad_norm": 0.8615312584944728, + "learning_rate": 0.003, + "loss": 4.025, + "step": 34240 + }, + { + "epoch": 0.34241, + "grad_norm": 0.8268039419964168, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 34241 + }, + { + "epoch": 0.34242, + "grad_norm": 0.7131123957611836, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 34242 + }, + { + "epoch": 0.34243, + "grad_norm": 0.8036957144140716, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 34243 + }, + { + "epoch": 0.34244, + "grad_norm": 0.8958642843159621, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 34244 + }, + { + "epoch": 0.34245, + "grad_norm": 0.9980564249085984, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 34245 + }, + { + "epoch": 0.34246, + "grad_norm": 1.012051849020208, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 34246 + }, + { + "epoch": 0.34247, + "grad_norm": 1.0778167626269333, + "learning_rate": 0.003, + "loss": 4.0793, + "step": 34247 + }, + { + "epoch": 0.34248, + "grad_norm": 0.9765579892746566, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 34248 + }, + { + "epoch": 0.34249, + "grad_norm": 0.8649853649720828, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 34249 + }, + { + "epoch": 0.3425, + "grad_norm": 0.7975365810670239, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34250 + }, + { + "epoch": 0.34251, + "grad_norm": 0.7151677310016729, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 34251 + }, + { + "epoch": 0.34252, + "grad_norm": 0.7078988911251008, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 34252 + }, + { + "epoch": 0.34253, + "grad_norm": 0.7918740097700563, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 34253 + }, + { + "epoch": 0.34254, + "grad_norm": 0.9367906909600723, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34254 + }, + { + "epoch": 0.34255, + "grad_norm": 1.0754901178345573, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 34255 + }, + { + "epoch": 0.34256, + "grad_norm": 1.0511790133931227, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34256 + }, + { + "epoch": 0.34257, + "grad_norm": 0.9128938945746193, + "learning_rate": 0.003, + "loss": 4.038, + "step": 34257 + }, + { + "epoch": 0.34258, + "grad_norm": 0.7837822125276601, + "learning_rate": 0.003, + "loss": 4.053, + "step": 34258 + }, + { + "epoch": 0.34259, + "grad_norm": 0.7319731593079192, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 34259 + }, + { + "epoch": 0.3426, + "grad_norm": 0.8052981107971173, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 34260 + }, + { + "epoch": 0.34261, + "grad_norm": 0.7960667158634984, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 34261 + }, + { + "epoch": 0.34262, + "grad_norm": 0.8283638397118233, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 34262 + }, + { + "epoch": 0.34263, + "grad_norm": 1.049207409215749, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 34263 + }, + { + "epoch": 0.34264, + "grad_norm": 1.087561548757546, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 34264 + }, + { + "epoch": 0.34265, + "grad_norm": 0.8660800275215281, + "learning_rate": 0.003, + "loss": 4.0778, + "step": 34265 + }, + { + "epoch": 0.34266, + "grad_norm": 0.7108522858875274, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 34266 + }, + { + "epoch": 0.34267, + "grad_norm": 0.6790127019678809, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 34267 + }, + { + "epoch": 0.34268, + "grad_norm": 0.5172674225151003, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 34268 + }, + { + "epoch": 0.34269, + "grad_norm": 0.5516572579737573, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 34269 + }, + { + "epoch": 0.3427, + "grad_norm": 0.4875027218016631, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34270 + }, + { + "epoch": 0.34271, + "grad_norm": 0.49162549557791874, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 34271 + }, + { + "epoch": 0.34272, + "grad_norm": 0.5669864914452368, + "learning_rate": 0.003, + "loss": 4.007, + "step": 34272 + }, + { + "epoch": 0.34273, + "grad_norm": 0.6400140131502341, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 34273 + }, + { + "epoch": 0.34274, + "grad_norm": 0.7492609919725377, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 34274 + }, + { + "epoch": 0.34275, + "grad_norm": 0.6632336496903914, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 34275 + }, + { + "epoch": 0.34276, + "grad_norm": 0.6201661330325717, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34276 + }, + { + "epoch": 0.34277, + "grad_norm": 0.7305374742525537, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 34277 + }, + { + "epoch": 0.34278, + "grad_norm": 0.9853924506892965, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 34278 + }, + { + "epoch": 0.34279, + "grad_norm": 1.3939815351793787, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 34279 + }, + { + "epoch": 0.3428, + "grad_norm": 0.5708547303721172, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34280 + }, + { + "epoch": 0.34281, + "grad_norm": 0.8135471227141484, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 34281 + }, + { + "epoch": 0.34282, + "grad_norm": 0.959858579324949, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 34282 + }, + { + "epoch": 0.34283, + "grad_norm": 0.8405824977296205, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 34283 + }, + { + "epoch": 0.34284, + "grad_norm": 0.7954336512035353, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 34284 + }, + { + "epoch": 0.34285, + "grad_norm": 0.9188836007621363, + "learning_rate": 0.003, + "loss": 4.022, + "step": 34285 + }, + { + "epoch": 0.34286, + "grad_norm": 0.8535824582432092, + "learning_rate": 0.003, + "loss": 4.074, + "step": 34286 + }, + { + "epoch": 0.34287, + "grad_norm": 0.799695422441215, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 34287 + }, + { + "epoch": 0.34288, + "grad_norm": 0.9174510006166071, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 34288 + }, + { + "epoch": 0.34289, + "grad_norm": 0.9391807684809665, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 34289 + }, + { + "epoch": 0.3429, + "grad_norm": 0.8578236790706937, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 34290 + }, + { + "epoch": 0.34291, + "grad_norm": 0.8729531764495353, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 34291 + }, + { + "epoch": 0.34292, + "grad_norm": 0.9433296627524341, + "learning_rate": 0.003, + "loss": 4.048, + "step": 34292 + }, + { + "epoch": 0.34293, + "grad_norm": 1.0635091446047824, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 34293 + }, + { + "epoch": 0.34294, + "grad_norm": 0.9978057822497826, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 34294 + }, + { + "epoch": 0.34295, + "grad_norm": 1.07225375221665, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 34295 + }, + { + "epoch": 0.34296, + "grad_norm": 0.9347544182003088, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 34296 + }, + { + "epoch": 0.34297, + "grad_norm": 0.934710529614385, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34297 + }, + { + "epoch": 0.34298, + "grad_norm": 1.0179271556328928, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 34298 + }, + { + "epoch": 0.34299, + "grad_norm": 1.0049584509086138, + "learning_rate": 0.003, + "loss": 4.06, + "step": 34299 + }, + { + "epoch": 0.343, + "grad_norm": 0.9525167737102912, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 34300 + }, + { + "epoch": 0.34301, + "grad_norm": 1.1083105118296863, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 34301 + }, + { + "epoch": 0.34302, + "grad_norm": 0.9471370460009991, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 34302 + }, + { + "epoch": 0.34303, + "grad_norm": 0.8181247966331232, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 34303 + }, + { + "epoch": 0.34304, + "grad_norm": 0.7220401322600519, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 34304 + }, + { + "epoch": 0.34305, + "grad_norm": 0.6766740733035059, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 34305 + }, + { + "epoch": 0.34306, + "grad_norm": 0.62972985859045, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 34306 + }, + { + "epoch": 0.34307, + "grad_norm": 0.5935099991824007, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 34307 + }, + { + "epoch": 0.34308, + "grad_norm": 0.6586530179350258, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 34308 + }, + { + "epoch": 0.34309, + "grad_norm": 0.7947822828267335, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 34309 + }, + { + "epoch": 0.3431, + "grad_norm": 0.8755811774071334, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 34310 + }, + { + "epoch": 0.34311, + "grad_norm": 0.8978993923874248, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34311 + }, + { + "epoch": 0.34312, + "grad_norm": 0.8065217673946592, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 34312 + }, + { + "epoch": 0.34313, + "grad_norm": 0.7013078414032896, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34313 + }, + { + "epoch": 0.34314, + "grad_norm": 0.5714876054591733, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 34314 + }, + { + "epoch": 0.34315, + "grad_norm": 0.5551558743592503, + "learning_rate": 0.003, + "loss": 4.013, + "step": 34315 + }, + { + "epoch": 0.34316, + "grad_norm": 0.6775217475244516, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 34316 + }, + { + "epoch": 0.34317, + "grad_norm": 0.8237157751720057, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34317 + }, + { + "epoch": 0.34318, + "grad_norm": 0.8923689250664276, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 34318 + }, + { + "epoch": 0.34319, + "grad_norm": 0.8845253668501907, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 34319 + }, + { + "epoch": 0.3432, + "grad_norm": 0.9120541948215466, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 34320 + }, + { + "epoch": 0.34321, + "grad_norm": 1.0184850553867384, + "learning_rate": 0.003, + "loss": 4.052, + "step": 34321 + }, + { + "epoch": 0.34322, + "grad_norm": 1.0133093429008453, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 34322 + }, + { + "epoch": 0.34323, + "grad_norm": 0.9505683264218208, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 34323 + }, + { + "epoch": 0.34324, + "grad_norm": 1.0555282740390874, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 34324 + }, + { + "epoch": 0.34325, + "grad_norm": 1.1125853493003732, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 34325 + }, + { + "epoch": 0.34326, + "grad_norm": 0.9741249648896545, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 34326 + }, + { + "epoch": 0.34327, + "grad_norm": 0.9218446144112232, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 34327 + }, + { + "epoch": 0.34328, + "grad_norm": 0.908394672496767, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34328 + }, + { + "epoch": 0.34329, + "grad_norm": 0.9215306116820904, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 34329 + }, + { + "epoch": 0.3433, + "grad_norm": 0.8035282425982846, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 34330 + }, + { + "epoch": 0.34331, + "grad_norm": 0.6781373725729692, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 34331 + }, + { + "epoch": 0.34332, + "grad_norm": 0.853842159110138, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 34332 + }, + { + "epoch": 0.34333, + "grad_norm": 0.8921161225298422, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 34333 + }, + { + "epoch": 0.34334, + "grad_norm": 0.8491323110414845, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 34334 + }, + { + "epoch": 0.34335, + "grad_norm": 0.8131831120473866, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 34335 + }, + { + "epoch": 0.34336, + "grad_norm": 0.7265825088326356, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 34336 + }, + { + "epoch": 0.34337, + "grad_norm": 0.7158437676511544, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 34337 + }, + { + "epoch": 0.34338, + "grad_norm": 0.8460344114523077, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 34338 + }, + { + "epoch": 0.34339, + "grad_norm": 0.9168844906636844, + "learning_rate": 0.003, + "loss": 4.051, + "step": 34339 + }, + { + "epoch": 0.3434, + "grad_norm": 0.9224397538726977, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34340 + }, + { + "epoch": 0.34341, + "grad_norm": 0.7803160686381381, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34341 + }, + { + "epoch": 0.34342, + "grad_norm": 0.7390765807257162, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 34342 + }, + { + "epoch": 0.34343, + "grad_norm": 0.7302430056878702, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 34343 + }, + { + "epoch": 0.34344, + "grad_norm": 0.715826148209429, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 34344 + }, + { + "epoch": 0.34345, + "grad_norm": 0.6932553081036588, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 34345 + }, + { + "epoch": 0.34346, + "grad_norm": 0.7673599504695867, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 34346 + }, + { + "epoch": 0.34347, + "grad_norm": 0.7865430439887382, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34347 + }, + { + "epoch": 0.34348, + "grad_norm": 0.8232580619952031, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 34348 + }, + { + "epoch": 0.34349, + "grad_norm": 1.0077988383261338, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 34349 + }, + { + "epoch": 0.3435, + "grad_norm": 1.016309087892976, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 34350 + }, + { + "epoch": 0.34351, + "grad_norm": 0.8970888059866468, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 34351 + }, + { + "epoch": 0.34352, + "grad_norm": 0.8809211631931301, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 34352 + }, + { + "epoch": 0.34353, + "grad_norm": 1.1212846917344534, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34353 + }, + { + "epoch": 0.34354, + "grad_norm": 1.030031316866575, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 34354 + }, + { + "epoch": 0.34355, + "grad_norm": 0.9733970380447158, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34355 + }, + { + "epoch": 0.34356, + "grad_norm": 0.9853300531237188, + "learning_rate": 0.003, + "loss": 4.045, + "step": 34356 + }, + { + "epoch": 0.34357, + "grad_norm": 0.9736043010167277, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 34357 + }, + { + "epoch": 0.34358, + "grad_norm": 1.0126975615045644, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 34358 + }, + { + "epoch": 0.34359, + "grad_norm": 1.0113224916306265, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 34359 + }, + { + "epoch": 0.3436, + "grad_norm": 0.8717947372457454, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 34360 + }, + { + "epoch": 0.34361, + "grad_norm": 0.7836203436739804, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 34361 + }, + { + "epoch": 0.34362, + "grad_norm": 0.7515575314005516, + "learning_rate": 0.003, + "loss": 4.048, + "step": 34362 + }, + { + "epoch": 0.34363, + "grad_norm": 0.8304115963877164, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 34363 + }, + { + "epoch": 0.34364, + "grad_norm": 0.9506349334457698, + "learning_rate": 0.003, + "loss": 4.1048, + "step": 34364 + }, + { + "epoch": 0.34365, + "grad_norm": 0.9777236455328896, + "learning_rate": 0.003, + "loss": 4.042, + "step": 34365 + }, + { + "epoch": 0.34366, + "grad_norm": 1.1314325156774205, + "learning_rate": 0.003, + "loss": 4.036, + "step": 34366 + }, + { + "epoch": 0.34367, + "grad_norm": 1.0220916338758927, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34367 + }, + { + "epoch": 0.34368, + "grad_norm": 0.9200834326138174, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 34368 + }, + { + "epoch": 0.34369, + "grad_norm": 0.8014450665199313, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 34369 + }, + { + "epoch": 0.3437, + "grad_norm": 0.8291264520590482, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 34370 + }, + { + "epoch": 0.34371, + "grad_norm": 0.7457737325038994, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 34371 + }, + { + "epoch": 0.34372, + "grad_norm": 0.7643534957043733, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 34372 + }, + { + "epoch": 0.34373, + "grad_norm": 0.6994266266054555, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 34373 + }, + { + "epoch": 0.34374, + "grad_norm": 0.6606628565346713, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 34374 + }, + { + "epoch": 0.34375, + "grad_norm": 0.718675995581735, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 34375 + }, + { + "epoch": 0.34376, + "grad_norm": 0.689593047774557, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 34376 + }, + { + "epoch": 0.34377, + "grad_norm": 0.5739551326274103, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 34377 + }, + { + "epoch": 0.34378, + "grad_norm": 0.4793925227047839, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34378 + }, + { + "epoch": 0.34379, + "grad_norm": 0.4633989402473253, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 34379 + }, + { + "epoch": 0.3438, + "grad_norm": 0.4239188184978804, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 34380 + }, + { + "epoch": 0.34381, + "grad_norm": 0.4262550684731709, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 34381 + }, + { + "epoch": 0.34382, + "grad_norm": 0.45615950335474575, + "learning_rate": 0.003, + "loss": 3.9908, + "step": 34382 + }, + { + "epoch": 0.34383, + "grad_norm": 0.472199562235203, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 34383 + }, + { + "epoch": 0.34384, + "grad_norm": 0.6236999368583548, + "learning_rate": 0.003, + "loss": 4.045, + "step": 34384 + }, + { + "epoch": 0.34385, + "grad_norm": 0.772775769192427, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 34385 + }, + { + "epoch": 0.34386, + "grad_norm": 0.8766508638928393, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 34386 + }, + { + "epoch": 0.34387, + "grad_norm": 1.0612280814739388, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 34387 + }, + { + "epoch": 0.34388, + "grad_norm": 1.2906864433168301, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 34388 + }, + { + "epoch": 0.34389, + "grad_norm": 0.7721513568124236, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 34389 + }, + { + "epoch": 0.3439, + "grad_norm": 0.7648390138572144, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 34390 + }, + { + "epoch": 0.34391, + "grad_norm": 0.7821557713956991, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34391 + }, + { + "epoch": 0.34392, + "grad_norm": 0.9552782014054575, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 34392 + }, + { + "epoch": 0.34393, + "grad_norm": 0.9909735813373586, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 34393 + }, + { + "epoch": 0.34394, + "grad_norm": 0.8989240648359856, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 34394 + }, + { + "epoch": 0.34395, + "grad_norm": 0.8570167605812792, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 34395 + }, + { + "epoch": 0.34396, + "grad_norm": 0.9154828822235801, + "learning_rate": 0.003, + "loss": 4.071, + "step": 34396 + }, + { + "epoch": 0.34397, + "grad_norm": 0.9029045413868424, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 34397 + }, + { + "epoch": 0.34398, + "grad_norm": 0.941098220920906, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 34398 + }, + { + "epoch": 0.34399, + "grad_norm": 1.0314394226835948, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 34399 + }, + { + "epoch": 0.344, + "grad_norm": 1.0509736862048664, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 34400 + }, + { + "epoch": 0.34401, + "grad_norm": 0.9011762535245328, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34401 + }, + { + "epoch": 0.34402, + "grad_norm": 0.9144210967017222, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 34402 + }, + { + "epoch": 0.34403, + "grad_norm": 0.8578916494018558, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 34403 + }, + { + "epoch": 0.34404, + "grad_norm": 0.8447016674093634, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 34404 + }, + { + "epoch": 0.34405, + "grad_norm": 0.9658522291091811, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34405 + }, + { + "epoch": 0.34406, + "grad_norm": 1.0656648709692087, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 34406 + }, + { + "epoch": 0.34407, + "grad_norm": 1.059295109034186, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 34407 + }, + { + "epoch": 0.34408, + "grad_norm": 1.1293351786126067, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 34408 + }, + { + "epoch": 0.34409, + "grad_norm": 0.8045462094366068, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 34409 + }, + { + "epoch": 0.3441, + "grad_norm": 0.6889670566829941, + "learning_rate": 0.003, + "loss": 4.053, + "step": 34410 + }, + { + "epoch": 0.34411, + "grad_norm": 0.6353553276801132, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 34411 + }, + { + "epoch": 0.34412, + "grad_norm": 0.6480420536539415, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 34412 + }, + { + "epoch": 0.34413, + "grad_norm": 0.6445141408900813, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34413 + }, + { + "epoch": 0.34414, + "grad_norm": 0.638126700908288, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 34414 + }, + { + "epoch": 0.34415, + "grad_norm": 0.7116979656648418, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 34415 + }, + { + "epoch": 0.34416, + "grad_norm": 0.8434477417143457, + "learning_rate": 0.003, + "loss": 4.032, + "step": 34416 + }, + { + "epoch": 0.34417, + "grad_norm": 0.8316456338766718, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 34417 + }, + { + "epoch": 0.34418, + "grad_norm": 0.7071585703980245, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 34418 + }, + { + "epoch": 0.34419, + "grad_norm": 0.6647223070510185, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 34419 + }, + { + "epoch": 0.3442, + "grad_norm": 0.7937624249375839, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 34420 + }, + { + "epoch": 0.34421, + "grad_norm": 0.8363459274095433, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 34421 + }, + { + "epoch": 0.34422, + "grad_norm": 0.8181373119953027, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 34422 + }, + { + "epoch": 0.34423, + "grad_norm": 0.8268951977055664, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 34423 + }, + { + "epoch": 0.34424, + "grad_norm": 0.8390735356167486, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 34424 + }, + { + "epoch": 0.34425, + "grad_norm": 0.8692473541297471, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 34425 + }, + { + "epoch": 0.34426, + "grad_norm": 1.0376042999418795, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 34426 + }, + { + "epoch": 0.34427, + "grad_norm": 1.2091898923807605, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 34427 + }, + { + "epoch": 0.34428, + "grad_norm": 0.7933274911408704, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 34428 + }, + { + "epoch": 0.34429, + "grad_norm": 0.7648897799768679, + "learning_rate": 0.003, + "loss": 4.054, + "step": 34429 + }, + { + "epoch": 0.3443, + "grad_norm": 0.8190916993448014, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 34430 + }, + { + "epoch": 0.34431, + "grad_norm": 0.8945365712701867, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 34431 + }, + { + "epoch": 0.34432, + "grad_norm": 0.9012615478087793, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 34432 + }, + { + "epoch": 0.34433, + "grad_norm": 0.9055400214296948, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 34433 + }, + { + "epoch": 0.34434, + "grad_norm": 0.8531961035640648, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 34434 + }, + { + "epoch": 0.34435, + "grad_norm": 0.8380405087179986, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34435 + }, + { + "epoch": 0.34436, + "grad_norm": 0.9156856253885731, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 34436 + }, + { + "epoch": 0.34437, + "grad_norm": 0.9008425180649923, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 34437 + }, + { + "epoch": 0.34438, + "grad_norm": 0.9519058120000256, + "learning_rate": 0.003, + "loss": 4.022, + "step": 34438 + }, + { + "epoch": 0.34439, + "grad_norm": 1.0646092287705244, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 34439 + }, + { + "epoch": 0.3444, + "grad_norm": 0.8378762792749237, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 34440 + }, + { + "epoch": 0.34441, + "grad_norm": 0.6485099655237964, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 34441 + }, + { + "epoch": 0.34442, + "grad_norm": 0.6459689250686206, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 34442 + }, + { + "epoch": 0.34443, + "grad_norm": 0.6827979312615772, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34443 + }, + { + "epoch": 0.34444, + "grad_norm": 0.6395002213577309, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 34444 + }, + { + "epoch": 0.34445, + "grad_norm": 0.6263494258134135, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 34445 + }, + { + "epoch": 0.34446, + "grad_norm": 0.7846406183637684, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 34446 + }, + { + "epoch": 0.34447, + "grad_norm": 0.9496997390306008, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 34447 + }, + { + "epoch": 0.34448, + "grad_norm": 1.066632209520968, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 34448 + }, + { + "epoch": 0.34449, + "grad_norm": 0.8334939772611274, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 34449 + }, + { + "epoch": 0.3445, + "grad_norm": 0.7309908659528728, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 34450 + }, + { + "epoch": 0.34451, + "grad_norm": 0.8099299652602019, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 34451 + }, + { + "epoch": 0.34452, + "grad_norm": 0.9285309097363291, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 34452 + }, + { + "epoch": 0.34453, + "grad_norm": 0.949055266865184, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 34453 + }, + { + "epoch": 0.34454, + "grad_norm": 0.8906996516390296, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 34454 + }, + { + "epoch": 0.34455, + "grad_norm": 0.7579628772437226, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 34455 + }, + { + "epoch": 0.34456, + "grad_norm": 0.7199277560393076, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 34456 + }, + { + "epoch": 0.34457, + "grad_norm": 0.7405526006916023, + "learning_rate": 0.003, + "loss": 4.052, + "step": 34457 + }, + { + "epoch": 0.34458, + "grad_norm": 0.847058536175887, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 34458 + }, + { + "epoch": 0.34459, + "grad_norm": 0.7891104802743765, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 34459 + }, + { + "epoch": 0.3446, + "grad_norm": 0.7403359572905774, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 34460 + }, + { + "epoch": 0.34461, + "grad_norm": 0.7777923144426475, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34461 + }, + { + "epoch": 0.34462, + "grad_norm": 0.7270321301791535, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 34462 + }, + { + "epoch": 0.34463, + "grad_norm": 0.8351091971797917, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 34463 + }, + { + "epoch": 0.34464, + "grad_norm": 0.8329056104282853, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 34464 + }, + { + "epoch": 0.34465, + "grad_norm": 0.6945163884811114, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 34465 + }, + { + "epoch": 0.34466, + "grad_norm": 0.7823688049564557, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 34466 + }, + { + "epoch": 0.34467, + "grad_norm": 0.8211512364531924, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 34467 + }, + { + "epoch": 0.34468, + "grad_norm": 0.9767848317581035, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 34468 + }, + { + "epoch": 0.34469, + "grad_norm": 1.113259243299537, + "learning_rate": 0.003, + "loss": 4.0767, + "step": 34469 + }, + { + "epoch": 0.3447, + "grad_norm": 0.8945362407203205, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 34470 + }, + { + "epoch": 0.34471, + "grad_norm": 1.021054546601807, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 34471 + }, + { + "epoch": 0.34472, + "grad_norm": 1.028389745168708, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 34472 + }, + { + "epoch": 0.34473, + "grad_norm": 1.0665819796141116, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34473 + }, + { + "epoch": 0.34474, + "grad_norm": 0.9430507186661142, + "learning_rate": 0.003, + "loss": 4.058, + "step": 34474 + }, + { + "epoch": 0.34475, + "grad_norm": 0.9744671695296545, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34475 + }, + { + "epoch": 0.34476, + "grad_norm": 0.9440649527200788, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 34476 + }, + { + "epoch": 0.34477, + "grad_norm": 0.9228166552876553, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 34477 + }, + { + "epoch": 0.34478, + "grad_norm": 0.8787752207843391, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 34478 + }, + { + "epoch": 0.34479, + "grad_norm": 0.9585833270810419, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 34479 + }, + { + "epoch": 0.3448, + "grad_norm": 0.913230746739235, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34480 + }, + { + "epoch": 0.34481, + "grad_norm": 0.8443848023179719, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 34481 + }, + { + "epoch": 0.34482, + "grad_norm": 0.9425020103806578, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 34482 + }, + { + "epoch": 0.34483, + "grad_norm": 1.1006441880868523, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34483 + }, + { + "epoch": 0.34484, + "grad_norm": 0.823136366930898, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 34484 + }, + { + "epoch": 0.34485, + "grad_norm": 0.6960911088675873, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 34485 + }, + { + "epoch": 0.34486, + "grad_norm": 0.5919128082862295, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 34486 + }, + { + "epoch": 0.34487, + "grad_norm": 0.6504391539004777, + "learning_rate": 0.003, + "loss": 4.026, + "step": 34487 + }, + { + "epoch": 0.34488, + "grad_norm": 0.6791437979013697, + "learning_rate": 0.003, + "loss": 3.9875, + "step": 34488 + }, + { + "epoch": 0.34489, + "grad_norm": 0.75498878784684, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 34489 + }, + { + "epoch": 0.3449, + "grad_norm": 0.8743215113414976, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 34490 + }, + { + "epoch": 0.34491, + "grad_norm": 0.8667212292107455, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 34491 + }, + { + "epoch": 0.34492, + "grad_norm": 0.7704183451240254, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34492 + }, + { + "epoch": 0.34493, + "grad_norm": 0.630302722729315, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 34493 + }, + { + "epoch": 0.34494, + "grad_norm": 0.5820349432020077, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34494 + }, + { + "epoch": 0.34495, + "grad_norm": 0.5924345172755775, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 34495 + }, + { + "epoch": 0.34496, + "grad_norm": 0.6162099949796576, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34496 + }, + { + "epoch": 0.34497, + "grad_norm": 0.7081988987761293, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 34497 + }, + { + "epoch": 0.34498, + "grad_norm": 0.7845977635084048, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34498 + }, + { + "epoch": 0.34499, + "grad_norm": 0.8925926864490982, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 34499 + }, + { + "epoch": 0.345, + "grad_norm": 1.1076749896547693, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34500 + }, + { + "epoch": 0.34501, + "grad_norm": 0.9281581851219178, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 34501 + }, + { + "epoch": 0.34502, + "grad_norm": 0.8642861625773332, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 34502 + }, + { + "epoch": 0.34503, + "grad_norm": 1.0046072231737118, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 34503 + }, + { + "epoch": 0.34504, + "grad_norm": 0.9838616773481328, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 34504 + }, + { + "epoch": 0.34505, + "grad_norm": 0.953057029619005, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 34505 + }, + { + "epoch": 0.34506, + "grad_norm": 0.9297301374460281, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 34506 + }, + { + "epoch": 0.34507, + "grad_norm": 0.7620046855883965, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 34507 + }, + { + "epoch": 0.34508, + "grad_norm": 0.6974924947418466, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 34508 + }, + { + "epoch": 0.34509, + "grad_norm": 0.646488825618821, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 34509 + }, + { + "epoch": 0.3451, + "grad_norm": 0.6750048949469442, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 34510 + }, + { + "epoch": 0.34511, + "grad_norm": 0.7280224074959399, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 34511 + }, + { + "epoch": 0.34512, + "grad_norm": 0.7914277567634888, + "learning_rate": 0.003, + "loss": 4.03, + "step": 34512 + }, + { + "epoch": 0.34513, + "grad_norm": 1.0964289554153872, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 34513 + }, + { + "epoch": 0.34514, + "grad_norm": 1.2791377206822097, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 34514 + }, + { + "epoch": 0.34515, + "grad_norm": 0.7700551973076047, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34515 + }, + { + "epoch": 0.34516, + "grad_norm": 0.8039798203904456, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 34516 + }, + { + "epoch": 0.34517, + "grad_norm": 0.8049864972043989, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 34517 + }, + { + "epoch": 0.34518, + "grad_norm": 0.8346477015817062, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 34518 + }, + { + "epoch": 0.34519, + "grad_norm": 0.8084706744463853, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 34519 + }, + { + "epoch": 0.3452, + "grad_norm": 0.7862310731894386, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34520 + }, + { + "epoch": 0.34521, + "grad_norm": 0.7845774376710308, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 34521 + }, + { + "epoch": 0.34522, + "grad_norm": 0.8661238083116782, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 34522 + }, + { + "epoch": 0.34523, + "grad_norm": 0.8291039374317084, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 34523 + }, + { + "epoch": 0.34524, + "grad_norm": 0.8582608514630463, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 34524 + }, + { + "epoch": 0.34525, + "grad_norm": 0.9565947836252808, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 34525 + }, + { + "epoch": 0.34526, + "grad_norm": 1.011379152526198, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 34526 + }, + { + "epoch": 0.34527, + "grad_norm": 1.0536115484476516, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 34527 + }, + { + "epoch": 0.34528, + "grad_norm": 0.9223553392970494, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 34528 + }, + { + "epoch": 0.34529, + "grad_norm": 0.8834702568551616, + "learning_rate": 0.003, + "loss": 4.029, + "step": 34529 + }, + { + "epoch": 0.3453, + "grad_norm": 0.9643716282096577, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34530 + }, + { + "epoch": 0.34531, + "grad_norm": 1.0549261361948874, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 34531 + }, + { + "epoch": 0.34532, + "grad_norm": 0.8681925652959624, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 34532 + }, + { + "epoch": 0.34533, + "grad_norm": 0.8081112366219122, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 34533 + }, + { + "epoch": 0.34534, + "grad_norm": 0.8016640399609277, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 34534 + }, + { + "epoch": 0.34535, + "grad_norm": 0.7256033309801178, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 34535 + }, + { + "epoch": 0.34536, + "grad_norm": 0.7213868952615656, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 34536 + }, + { + "epoch": 0.34537, + "grad_norm": 0.6930666920656228, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 34537 + }, + { + "epoch": 0.34538, + "grad_norm": 0.7418710892844897, + "learning_rate": 0.003, + "loss": 4.033, + "step": 34538 + }, + { + "epoch": 0.34539, + "grad_norm": 0.906730519742983, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 34539 + }, + { + "epoch": 0.3454, + "grad_norm": 1.1408856132068281, + "learning_rate": 0.003, + "loss": 4.048, + "step": 34540 + }, + { + "epoch": 0.34541, + "grad_norm": 0.822724275633429, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 34541 + }, + { + "epoch": 0.34542, + "grad_norm": 0.7215945830847605, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 34542 + }, + { + "epoch": 0.34543, + "grad_norm": 0.7712502921885293, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 34543 + }, + { + "epoch": 0.34544, + "grad_norm": 0.8554056663840541, + "learning_rate": 0.003, + "loss": 4.027, + "step": 34544 + }, + { + "epoch": 0.34545, + "grad_norm": 0.822493298851573, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 34545 + }, + { + "epoch": 0.34546, + "grad_norm": 0.6997806874733459, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 34546 + }, + { + "epoch": 0.34547, + "grad_norm": 0.6465514566310105, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 34547 + }, + { + "epoch": 0.34548, + "grad_norm": 0.7415188196452973, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 34548 + }, + { + "epoch": 0.34549, + "grad_norm": 0.8149508153396584, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 34549 + }, + { + "epoch": 0.3455, + "grad_norm": 0.9039037170837417, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34550 + }, + { + "epoch": 0.34551, + "grad_norm": 1.09703868368854, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 34551 + }, + { + "epoch": 0.34552, + "grad_norm": 1.112548429801207, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 34552 + }, + { + "epoch": 0.34553, + "grad_norm": 0.8596606829898167, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34553 + }, + { + "epoch": 0.34554, + "grad_norm": 0.703125732760317, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 34554 + }, + { + "epoch": 0.34555, + "grad_norm": 0.665232441408588, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 34555 + }, + { + "epoch": 0.34556, + "grad_norm": 0.6759826399629765, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 34556 + }, + { + "epoch": 0.34557, + "grad_norm": 0.7933773221043016, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 34557 + }, + { + "epoch": 0.34558, + "grad_norm": 0.8609410667085539, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 34558 + }, + { + "epoch": 0.34559, + "grad_norm": 0.8195135548338098, + "learning_rate": 0.003, + "loss": 4.047, + "step": 34559 + }, + { + "epoch": 0.3456, + "grad_norm": 0.8250583228442626, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 34560 + }, + { + "epoch": 0.34561, + "grad_norm": 0.9270330730108449, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 34561 + }, + { + "epoch": 0.34562, + "grad_norm": 0.9079094707007858, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 34562 + }, + { + "epoch": 0.34563, + "grad_norm": 0.892176790650534, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 34563 + }, + { + "epoch": 0.34564, + "grad_norm": 0.8183163745853229, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 34564 + }, + { + "epoch": 0.34565, + "grad_norm": 0.9102899867379226, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 34565 + }, + { + "epoch": 0.34566, + "grad_norm": 0.9670867455092108, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 34566 + }, + { + "epoch": 0.34567, + "grad_norm": 1.1761193338658684, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 34567 + }, + { + "epoch": 0.34568, + "grad_norm": 0.8410875220313775, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 34568 + }, + { + "epoch": 0.34569, + "grad_norm": 0.6812215398123858, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 34569 + }, + { + "epoch": 0.3457, + "grad_norm": 0.7412583822425433, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 34570 + }, + { + "epoch": 0.34571, + "grad_norm": 0.7463570027262695, + "learning_rate": 0.003, + "loss": 4.041, + "step": 34571 + }, + { + "epoch": 0.34572, + "grad_norm": 0.7732749595965485, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 34572 + }, + { + "epoch": 0.34573, + "grad_norm": 0.733446532799318, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 34573 + }, + { + "epoch": 0.34574, + "grad_norm": 0.5783139804370725, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 34574 + }, + { + "epoch": 0.34575, + "grad_norm": 0.5553829762633938, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 34575 + }, + { + "epoch": 0.34576, + "grad_norm": 0.519178272044254, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 34576 + }, + { + "epoch": 0.34577, + "grad_norm": 0.5400796769481038, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 34577 + }, + { + "epoch": 0.34578, + "grad_norm": 0.6244169251633643, + "learning_rate": 0.003, + "loss": 4.0019, + "step": 34578 + }, + { + "epoch": 0.34579, + "grad_norm": 0.7710071632116319, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 34579 + }, + { + "epoch": 0.3458, + "grad_norm": 0.9574734369924512, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 34580 + }, + { + "epoch": 0.34581, + "grad_norm": 1.0608778730643844, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 34581 + }, + { + "epoch": 0.34582, + "grad_norm": 0.9327007147221428, + "learning_rate": 0.003, + "loss": 4.057, + "step": 34582 + }, + { + "epoch": 0.34583, + "grad_norm": 0.8412969469583328, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 34583 + }, + { + "epoch": 0.34584, + "grad_norm": 0.7916873323799967, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 34584 + }, + { + "epoch": 0.34585, + "grad_norm": 0.9286630699257109, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 34585 + }, + { + "epoch": 0.34586, + "grad_norm": 0.821698764205388, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 34586 + }, + { + "epoch": 0.34587, + "grad_norm": 0.752623117435259, + "learning_rate": 0.003, + "loss": 4.03, + "step": 34587 + }, + { + "epoch": 0.34588, + "grad_norm": 0.7307158467940512, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34588 + }, + { + "epoch": 0.34589, + "grad_norm": 0.8431457537234694, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 34589 + }, + { + "epoch": 0.3459, + "grad_norm": 0.7161184904473243, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 34590 + }, + { + "epoch": 0.34591, + "grad_norm": 0.7055874432909173, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 34591 + }, + { + "epoch": 0.34592, + "grad_norm": 0.6863780238149824, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 34592 + }, + { + "epoch": 0.34593, + "grad_norm": 0.7950832105897014, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34593 + }, + { + "epoch": 0.34594, + "grad_norm": 1.0942104249739206, + "learning_rate": 0.003, + "loss": 4.056, + "step": 34594 + }, + { + "epoch": 0.34595, + "grad_norm": 1.333949293942147, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 34595 + }, + { + "epoch": 0.34596, + "grad_norm": 0.6479003370107027, + "learning_rate": 0.003, + "loss": 4.067, + "step": 34596 + }, + { + "epoch": 0.34597, + "grad_norm": 0.6703786679202025, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 34597 + }, + { + "epoch": 0.34598, + "grad_norm": 0.6626222824528238, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 34598 + }, + { + "epoch": 0.34599, + "grad_norm": 0.6181534129231927, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 34599 + }, + { + "epoch": 0.346, + "grad_norm": 0.6149757757079107, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 34600 + }, + { + "epoch": 0.34601, + "grad_norm": 0.6210751975947761, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 34601 + }, + { + "epoch": 0.34602, + "grad_norm": 0.6490147228503137, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 34602 + }, + { + "epoch": 0.34603, + "grad_norm": 0.7014481306282548, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 34603 + }, + { + "epoch": 0.34604, + "grad_norm": 0.8595456690331306, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 34604 + }, + { + "epoch": 0.34605, + "grad_norm": 1.0158973415551695, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 34605 + }, + { + "epoch": 0.34606, + "grad_norm": 1.1238570530522944, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 34606 + }, + { + "epoch": 0.34607, + "grad_norm": 0.9983904378473883, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 34607 + }, + { + "epoch": 0.34608, + "grad_norm": 0.9135744003058295, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 34608 + }, + { + "epoch": 0.34609, + "grad_norm": 0.8453986939453443, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 34609 + }, + { + "epoch": 0.3461, + "grad_norm": 0.8334210974995194, + "learning_rate": 0.003, + "loss": 4.031, + "step": 34610 + }, + { + "epoch": 0.34611, + "grad_norm": 0.9457224619920491, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 34611 + }, + { + "epoch": 0.34612, + "grad_norm": 1.0695793809515244, + "learning_rate": 0.003, + "loss": 4.023, + "step": 34612 + }, + { + "epoch": 0.34613, + "grad_norm": 0.962696952853266, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 34613 + }, + { + "epoch": 0.34614, + "grad_norm": 0.9366829546836091, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 34614 + }, + { + "epoch": 0.34615, + "grad_norm": 0.9225906274237546, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 34615 + }, + { + "epoch": 0.34616, + "grad_norm": 0.9356656459165791, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 34616 + }, + { + "epoch": 0.34617, + "grad_norm": 1.0443650388386483, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 34617 + }, + { + "epoch": 0.34618, + "grad_norm": 0.9089256689905104, + "learning_rate": 0.003, + "loss": 4.073, + "step": 34618 + }, + { + "epoch": 0.34619, + "grad_norm": 0.9245679770470324, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 34619 + }, + { + "epoch": 0.3462, + "grad_norm": 0.906816281061801, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 34620 + }, + { + "epoch": 0.34621, + "grad_norm": 0.8900554779205011, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 34621 + }, + { + "epoch": 0.34622, + "grad_norm": 0.9167210755072268, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34622 + }, + { + "epoch": 0.34623, + "grad_norm": 0.9271246485469452, + "learning_rate": 0.003, + "loss": 4.056, + "step": 34623 + }, + { + "epoch": 0.34624, + "grad_norm": 0.9671650664491138, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 34624 + }, + { + "epoch": 0.34625, + "grad_norm": 1.0878599429961624, + "learning_rate": 0.003, + "loss": 4.055, + "step": 34625 + }, + { + "epoch": 0.34626, + "grad_norm": 0.9730720371756786, + "learning_rate": 0.003, + "loss": 4.077, + "step": 34626 + }, + { + "epoch": 0.34627, + "grad_norm": 0.9880396702064486, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 34627 + }, + { + "epoch": 0.34628, + "grad_norm": 0.8894830436738097, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 34628 + }, + { + "epoch": 0.34629, + "grad_norm": 0.8647243078311875, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 34629 + }, + { + "epoch": 0.3463, + "grad_norm": 0.940285445468824, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34630 + }, + { + "epoch": 0.34631, + "grad_norm": 1.054761862061168, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 34631 + }, + { + "epoch": 0.34632, + "grad_norm": 0.9956283918936183, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 34632 + }, + { + "epoch": 0.34633, + "grad_norm": 0.9920727029686527, + "learning_rate": 0.003, + "loss": 4.068, + "step": 34633 + }, + { + "epoch": 0.34634, + "grad_norm": 0.9818429444654575, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34634 + }, + { + "epoch": 0.34635, + "grad_norm": 0.8343387384144243, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 34635 + }, + { + "epoch": 0.34636, + "grad_norm": 0.7037457315731984, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 34636 + }, + { + "epoch": 0.34637, + "grad_norm": 0.6375211942443303, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 34637 + }, + { + "epoch": 0.34638, + "grad_norm": 0.6151064195032665, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 34638 + }, + { + "epoch": 0.34639, + "grad_norm": 0.6225598502319716, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 34639 + }, + { + "epoch": 0.3464, + "grad_norm": 0.5565320308539808, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 34640 + }, + { + "epoch": 0.34641, + "grad_norm": 0.5391187835487762, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34641 + }, + { + "epoch": 0.34642, + "grad_norm": 0.5304891140402174, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 34642 + }, + { + "epoch": 0.34643, + "grad_norm": 0.53725390236833, + "learning_rate": 0.003, + "loss": 4.039, + "step": 34643 + }, + { + "epoch": 0.34644, + "grad_norm": 0.5373666527301979, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 34644 + }, + { + "epoch": 0.34645, + "grad_norm": 0.5363503966411274, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 34645 + }, + { + "epoch": 0.34646, + "grad_norm": 0.6447024696473392, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 34646 + }, + { + "epoch": 0.34647, + "grad_norm": 0.7033485475880766, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 34647 + }, + { + "epoch": 0.34648, + "grad_norm": 0.7326603503677568, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 34648 + }, + { + "epoch": 0.34649, + "grad_norm": 0.9234486521403993, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 34649 + }, + { + "epoch": 0.3465, + "grad_norm": 1.1905030246273414, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 34650 + }, + { + "epoch": 0.34651, + "grad_norm": 0.8167809875810752, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 34651 + }, + { + "epoch": 0.34652, + "grad_norm": 0.6583152294429505, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 34652 + }, + { + "epoch": 0.34653, + "grad_norm": 0.7267714883663945, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 34653 + }, + { + "epoch": 0.34654, + "grad_norm": 0.8434706300935787, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 34654 + }, + { + "epoch": 0.34655, + "grad_norm": 0.9338549873440369, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 34655 + }, + { + "epoch": 0.34656, + "grad_norm": 1.0213280635295177, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 34656 + }, + { + "epoch": 0.34657, + "grad_norm": 1.1361374853118171, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 34657 + }, + { + "epoch": 0.34658, + "grad_norm": 0.8473788764978512, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 34658 + }, + { + "epoch": 0.34659, + "grad_norm": 0.7906467663928731, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 34659 + }, + { + "epoch": 0.3466, + "grad_norm": 0.6680314017972818, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 34660 + }, + { + "epoch": 0.34661, + "grad_norm": 0.6267316251819774, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 34661 + }, + { + "epoch": 0.34662, + "grad_norm": 0.6906038335237771, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 34662 + }, + { + "epoch": 0.34663, + "grad_norm": 0.7703863127917444, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 34663 + }, + { + "epoch": 0.34664, + "grad_norm": 0.8309692348678631, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 34664 + }, + { + "epoch": 0.34665, + "grad_norm": 1.0428412189437197, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 34665 + }, + { + "epoch": 0.34666, + "grad_norm": 1.012451033553391, + "learning_rate": 0.003, + "loss": 4.076, + "step": 34666 + }, + { + "epoch": 0.34667, + "grad_norm": 0.7973078728052969, + "learning_rate": 0.003, + "loss": 4.073, + "step": 34667 + }, + { + "epoch": 0.34668, + "grad_norm": 0.7654093633267353, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 34668 + }, + { + "epoch": 0.34669, + "grad_norm": 0.7487491333825574, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 34669 + }, + { + "epoch": 0.3467, + "grad_norm": 0.6943802948742906, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 34670 + }, + { + "epoch": 0.34671, + "grad_norm": 0.6145934426615317, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 34671 + }, + { + "epoch": 0.34672, + "grad_norm": 0.6353469729646086, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 34672 + }, + { + "epoch": 0.34673, + "grad_norm": 0.9130658519726988, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 34673 + }, + { + "epoch": 0.34674, + "grad_norm": 1.210078100423299, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 34674 + }, + { + "epoch": 0.34675, + "grad_norm": 0.7404195414523346, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 34675 + }, + { + "epoch": 0.34676, + "grad_norm": 0.6580430913109181, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 34676 + }, + { + "epoch": 0.34677, + "grad_norm": 0.7268081665402026, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 34677 + }, + { + "epoch": 0.34678, + "grad_norm": 0.8115557553891181, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 34678 + }, + { + "epoch": 0.34679, + "grad_norm": 0.885773058727303, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 34679 + }, + { + "epoch": 0.3468, + "grad_norm": 0.8319471666526586, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 34680 + }, + { + "epoch": 0.34681, + "grad_norm": 0.8267729688490805, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 34681 + }, + { + "epoch": 0.34682, + "grad_norm": 0.8343788479570813, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 34682 + }, + { + "epoch": 0.34683, + "grad_norm": 0.8142642524117197, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 34683 + }, + { + "epoch": 0.34684, + "grad_norm": 0.7368165380678396, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34684 + }, + { + "epoch": 0.34685, + "grad_norm": 0.6959416429170956, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 34685 + }, + { + "epoch": 0.34686, + "grad_norm": 0.6897600147936405, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 34686 + }, + { + "epoch": 0.34687, + "grad_norm": 0.7394440975365838, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 34687 + }, + { + "epoch": 0.34688, + "grad_norm": 0.82523715530707, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34688 + }, + { + "epoch": 0.34689, + "grad_norm": 0.8574437515414237, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 34689 + }, + { + "epoch": 0.3469, + "grad_norm": 0.943895653666287, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 34690 + }, + { + "epoch": 0.34691, + "grad_norm": 1.0091286049220174, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34691 + }, + { + "epoch": 0.34692, + "grad_norm": 1.1548239807244807, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 34692 + }, + { + "epoch": 0.34693, + "grad_norm": 0.8357080412700646, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34693 + }, + { + "epoch": 0.34694, + "grad_norm": 0.7171877625081226, + "learning_rate": 0.003, + "loss": 4.008, + "step": 34694 + }, + { + "epoch": 0.34695, + "grad_norm": 0.7422227148232964, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 34695 + }, + { + "epoch": 0.34696, + "grad_norm": 0.7896766684999509, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 34696 + }, + { + "epoch": 0.34697, + "grad_norm": 0.7988083693700572, + "learning_rate": 0.003, + "loss": 3.9833, + "step": 34697 + }, + { + "epoch": 0.34698, + "grad_norm": 0.8094935048323625, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 34698 + }, + { + "epoch": 0.34699, + "grad_norm": 0.9552294378053499, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 34699 + }, + { + "epoch": 0.347, + "grad_norm": 1.045563797404123, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34700 + }, + { + "epoch": 0.34701, + "grad_norm": 1.0876783982786937, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 34701 + }, + { + "epoch": 0.34702, + "grad_norm": 0.9982285691774913, + "learning_rate": 0.003, + "loss": 4.0928, + "step": 34702 + }, + { + "epoch": 0.34703, + "grad_norm": 1.0507514051842994, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 34703 + }, + { + "epoch": 0.34704, + "grad_norm": 0.925438513338819, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 34704 + }, + { + "epoch": 0.34705, + "grad_norm": 0.9406906776710916, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 34705 + }, + { + "epoch": 0.34706, + "grad_norm": 0.7965824073342543, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34706 + }, + { + "epoch": 0.34707, + "grad_norm": 0.7313383107360606, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 34707 + }, + { + "epoch": 0.34708, + "grad_norm": 0.5729166377715145, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 34708 + }, + { + "epoch": 0.34709, + "grad_norm": 0.6134522572149388, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 34709 + }, + { + "epoch": 0.3471, + "grad_norm": 0.6931489569884725, + "learning_rate": 0.003, + "loss": 4.033, + "step": 34710 + }, + { + "epoch": 0.34711, + "grad_norm": 0.9295916027675596, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34711 + }, + { + "epoch": 0.34712, + "grad_norm": 1.2382729696312988, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 34712 + }, + { + "epoch": 0.34713, + "grad_norm": 0.8174289835703898, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 34713 + }, + { + "epoch": 0.34714, + "grad_norm": 0.7743258962350672, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 34714 + }, + { + "epoch": 0.34715, + "grad_norm": 0.7144251891931662, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34715 + }, + { + "epoch": 0.34716, + "grad_norm": 0.7155749841536181, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 34716 + }, + { + "epoch": 0.34717, + "grad_norm": 0.561932277594511, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 34717 + }, + { + "epoch": 0.34718, + "grad_norm": 0.5502834180443676, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 34718 + }, + { + "epoch": 0.34719, + "grad_norm": 0.5480694725499755, + "learning_rate": 0.003, + "loss": 4.007, + "step": 34719 + }, + { + "epoch": 0.3472, + "grad_norm": 0.6273215776253827, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 34720 + }, + { + "epoch": 0.34721, + "grad_norm": 0.7584603593059065, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 34721 + }, + { + "epoch": 0.34722, + "grad_norm": 0.81603078100306, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 34722 + }, + { + "epoch": 0.34723, + "grad_norm": 0.8594573437945927, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 34723 + }, + { + "epoch": 0.34724, + "grad_norm": 0.9951991951311653, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 34724 + }, + { + "epoch": 0.34725, + "grad_norm": 1.041162204345738, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 34725 + }, + { + "epoch": 0.34726, + "grad_norm": 0.9239428607476896, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 34726 + }, + { + "epoch": 0.34727, + "grad_norm": 0.9465568236722067, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 34727 + }, + { + "epoch": 0.34728, + "grad_norm": 0.9433310374917266, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 34728 + }, + { + "epoch": 0.34729, + "grad_norm": 1.0748439474844254, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 34729 + }, + { + "epoch": 0.3473, + "grad_norm": 0.9226510694415608, + "learning_rate": 0.003, + "loss": 4.008, + "step": 34730 + }, + { + "epoch": 0.34731, + "grad_norm": 0.7656419682194239, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 34731 + }, + { + "epoch": 0.34732, + "grad_norm": 0.836469575787767, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 34732 + }, + { + "epoch": 0.34733, + "grad_norm": 0.8174535565188165, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 34733 + }, + { + "epoch": 0.34734, + "grad_norm": 0.7917935316776014, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 34734 + }, + { + "epoch": 0.34735, + "grad_norm": 0.9103871409165347, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 34735 + }, + { + "epoch": 0.34736, + "grad_norm": 1.0253272727526552, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 34736 + }, + { + "epoch": 0.34737, + "grad_norm": 1.0005938816867341, + "learning_rate": 0.003, + "loss": 4.077, + "step": 34737 + }, + { + "epoch": 0.34738, + "grad_norm": 0.9042397886112195, + "learning_rate": 0.003, + "loss": 4.077, + "step": 34738 + }, + { + "epoch": 0.34739, + "grad_norm": 0.7530757325860072, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 34739 + }, + { + "epoch": 0.3474, + "grad_norm": 0.840596787096, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 34740 + }, + { + "epoch": 0.34741, + "grad_norm": 0.9256269651664081, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 34741 + }, + { + "epoch": 0.34742, + "grad_norm": 0.9318867064370578, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 34742 + }, + { + "epoch": 0.34743, + "grad_norm": 0.7676936460923819, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 34743 + }, + { + "epoch": 0.34744, + "grad_norm": 0.7296999772557023, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 34744 + }, + { + "epoch": 0.34745, + "grad_norm": 0.712787120356627, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 34745 + }, + { + "epoch": 0.34746, + "grad_norm": 0.835509356460299, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 34746 + }, + { + "epoch": 0.34747, + "grad_norm": 0.9458465800789766, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 34747 + }, + { + "epoch": 0.34748, + "grad_norm": 1.2109992687389635, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 34748 + }, + { + "epoch": 0.34749, + "grad_norm": 0.8745993912754669, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34749 + }, + { + "epoch": 0.3475, + "grad_norm": 0.8908407250534748, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 34750 + }, + { + "epoch": 0.34751, + "grad_norm": 0.8823839356389209, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 34751 + }, + { + "epoch": 0.34752, + "grad_norm": 0.8316918283183974, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34752 + }, + { + "epoch": 0.34753, + "grad_norm": 0.6936438907768921, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 34753 + }, + { + "epoch": 0.34754, + "grad_norm": 0.5529946924955295, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34754 + }, + { + "epoch": 0.34755, + "grad_norm": 0.6047875636151336, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 34755 + }, + { + "epoch": 0.34756, + "grad_norm": 0.6940884590116089, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 34756 + }, + { + "epoch": 0.34757, + "grad_norm": 0.7180263609359335, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34757 + }, + { + "epoch": 0.34758, + "grad_norm": 0.7433459948604162, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34758 + }, + { + "epoch": 0.34759, + "grad_norm": 0.7606738060953074, + "learning_rate": 0.003, + "loss": 4.044, + "step": 34759 + }, + { + "epoch": 0.3476, + "grad_norm": 0.8412939640293552, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34760 + }, + { + "epoch": 0.34761, + "grad_norm": 0.8368877813487658, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 34761 + }, + { + "epoch": 0.34762, + "grad_norm": 0.9483894419874874, + "learning_rate": 0.003, + "loss": 4.077, + "step": 34762 + }, + { + "epoch": 0.34763, + "grad_norm": 1.1263368545780004, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 34763 + }, + { + "epoch": 0.34764, + "grad_norm": 1.0033864793508824, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34764 + }, + { + "epoch": 0.34765, + "grad_norm": 0.8768536146889513, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 34765 + }, + { + "epoch": 0.34766, + "grad_norm": 0.8227487190867583, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 34766 + }, + { + "epoch": 0.34767, + "grad_norm": 0.8243665457106375, + "learning_rate": 0.003, + "loss": 4.01, + "step": 34767 + }, + { + "epoch": 0.34768, + "grad_norm": 0.7966291930657166, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 34768 + }, + { + "epoch": 0.34769, + "grad_norm": 0.8748551111439837, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 34769 + }, + { + "epoch": 0.3477, + "grad_norm": 0.9892927715322323, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 34770 + }, + { + "epoch": 0.34771, + "grad_norm": 0.950218747700461, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 34771 + }, + { + "epoch": 0.34772, + "grad_norm": 0.8584039565498719, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 34772 + }, + { + "epoch": 0.34773, + "grad_norm": 0.8669418712782091, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 34773 + }, + { + "epoch": 0.34774, + "grad_norm": 0.7933989289868888, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 34774 + }, + { + "epoch": 0.34775, + "grad_norm": 0.7796433417534827, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 34775 + }, + { + "epoch": 0.34776, + "grad_norm": 0.9833326748327749, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 34776 + }, + { + "epoch": 0.34777, + "grad_norm": 1.2772744338775783, + "learning_rate": 0.003, + "loss": 4.028, + "step": 34777 + }, + { + "epoch": 0.34778, + "grad_norm": 0.7544013510428906, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 34778 + }, + { + "epoch": 0.34779, + "grad_norm": 0.6275361939374325, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 34779 + }, + { + "epoch": 0.3478, + "grad_norm": 0.6064907532218664, + "learning_rate": 0.003, + "loss": 4.075, + "step": 34780 + }, + { + "epoch": 0.34781, + "grad_norm": 0.5767263730471315, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 34781 + }, + { + "epoch": 0.34782, + "grad_norm": 0.5723237641714202, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 34782 + }, + { + "epoch": 0.34783, + "grad_norm": 0.5662318260534049, + "learning_rate": 0.003, + "loss": 4.034, + "step": 34783 + }, + { + "epoch": 0.34784, + "grad_norm": 0.6685311465520342, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 34784 + }, + { + "epoch": 0.34785, + "grad_norm": 0.7023981363431896, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 34785 + }, + { + "epoch": 0.34786, + "grad_norm": 0.6467955636982758, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 34786 + }, + { + "epoch": 0.34787, + "grad_norm": 0.6838634906520717, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 34787 + }, + { + "epoch": 0.34788, + "grad_norm": 0.7455780827863456, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 34788 + }, + { + "epoch": 0.34789, + "grad_norm": 0.8718816447851796, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 34789 + }, + { + "epoch": 0.3479, + "grad_norm": 1.1541168474226622, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 34790 + }, + { + "epoch": 0.34791, + "grad_norm": 1.0879160390794589, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34791 + }, + { + "epoch": 0.34792, + "grad_norm": 0.8138527950484187, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 34792 + }, + { + "epoch": 0.34793, + "grad_norm": 0.6525770067634966, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 34793 + }, + { + "epoch": 0.34794, + "grad_norm": 0.6547578962947094, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 34794 + }, + { + "epoch": 0.34795, + "grad_norm": 0.7514528002685812, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 34795 + }, + { + "epoch": 0.34796, + "grad_norm": 0.9044656162626779, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 34796 + }, + { + "epoch": 0.34797, + "grad_norm": 1.042847608493205, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 34797 + }, + { + "epoch": 0.34798, + "grad_norm": 0.9049053123567403, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 34798 + }, + { + "epoch": 0.34799, + "grad_norm": 0.8683226658531115, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 34799 + }, + { + "epoch": 0.348, + "grad_norm": 0.7914889806116041, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 34800 + }, + { + "epoch": 0.34801, + "grad_norm": 0.8924653497452334, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 34801 + }, + { + "epoch": 0.34802, + "grad_norm": 1.0624968105762187, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 34802 + }, + { + "epoch": 0.34803, + "grad_norm": 0.8943419126907487, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 34803 + }, + { + "epoch": 0.34804, + "grad_norm": 0.9276623079610434, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 34804 + }, + { + "epoch": 0.34805, + "grad_norm": 1.0710552048905924, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 34805 + }, + { + "epoch": 0.34806, + "grad_norm": 0.916326203789788, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 34806 + }, + { + "epoch": 0.34807, + "grad_norm": 0.8451856335538793, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 34807 + }, + { + "epoch": 0.34808, + "grad_norm": 0.8845098805651169, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 34808 + }, + { + "epoch": 0.34809, + "grad_norm": 0.8298754709042115, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 34809 + }, + { + "epoch": 0.3481, + "grad_norm": 0.7979616576512674, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 34810 + }, + { + "epoch": 0.34811, + "grad_norm": 0.8174193558346711, + "learning_rate": 0.003, + "loss": 4.046, + "step": 34811 + }, + { + "epoch": 0.34812, + "grad_norm": 0.761507673595115, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 34812 + }, + { + "epoch": 0.34813, + "grad_norm": 0.7483840031618263, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 34813 + }, + { + "epoch": 0.34814, + "grad_norm": 0.7162146534626899, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 34814 + }, + { + "epoch": 0.34815, + "grad_norm": 0.7308426874742306, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 34815 + }, + { + "epoch": 0.34816, + "grad_norm": 0.7157343914499259, + "learning_rate": 0.003, + "loss": 4.032, + "step": 34816 + }, + { + "epoch": 0.34817, + "grad_norm": 0.7195677003381832, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 34817 + }, + { + "epoch": 0.34818, + "grad_norm": 0.7783442335972965, + "learning_rate": 0.003, + "loss": 4.047, + "step": 34818 + }, + { + "epoch": 0.34819, + "grad_norm": 0.9598094563927628, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 34819 + }, + { + "epoch": 0.3482, + "grad_norm": 1.145875171675095, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 34820 + }, + { + "epoch": 0.34821, + "grad_norm": 0.9451536437119268, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34821 + }, + { + "epoch": 0.34822, + "grad_norm": 0.942854331384803, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 34822 + }, + { + "epoch": 0.34823, + "grad_norm": 0.9281956947374154, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 34823 + }, + { + "epoch": 0.34824, + "grad_norm": 0.9662005242479036, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 34824 + }, + { + "epoch": 0.34825, + "grad_norm": 0.9761321112506395, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 34825 + }, + { + "epoch": 0.34826, + "grad_norm": 0.9154788958678094, + "learning_rate": 0.003, + "loss": 4.0867, + "step": 34826 + }, + { + "epoch": 0.34827, + "grad_norm": 0.8913679145479757, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 34827 + }, + { + "epoch": 0.34828, + "grad_norm": 1.046474835439531, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 34828 + }, + { + "epoch": 0.34829, + "grad_norm": 1.0788628762490025, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 34829 + }, + { + "epoch": 0.3483, + "grad_norm": 0.8076254421546771, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 34830 + }, + { + "epoch": 0.34831, + "grad_norm": 0.8076094009374168, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 34831 + }, + { + "epoch": 0.34832, + "grad_norm": 0.7520285133875139, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 34832 + }, + { + "epoch": 0.34833, + "grad_norm": 0.6716349828816769, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 34833 + }, + { + "epoch": 0.34834, + "grad_norm": 0.6245404326115679, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34834 + }, + { + "epoch": 0.34835, + "grad_norm": 0.5777800436049227, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 34835 + }, + { + "epoch": 0.34836, + "grad_norm": 0.6483968595268288, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 34836 + }, + { + "epoch": 0.34837, + "grad_norm": 0.7814632467617743, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 34837 + }, + { + "epoch": 0.34838, + "grad_norm": 1.0141548948813364, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 34838 + }, + { + "epoch": 0.34839, + "grad_norm": 1.119245299189435, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 34839 + }, + { + "epoch": 0.3484, + "grad_norm": 0.8715540293485823, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 34840 + }, + { + "epoch": 0.34841, + "grad_norm": 0.9020741018243255, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 34841 + }, + { + "epoch": 0.34842, + "grad_norm": 0.9053648074141472, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 34842 + }, + { + "epoch": 0.34843, + "grad_norm": 0.8644757439425724, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 34843 + }, + { + "epoch": 0.34844, + "grad_norm": 0.7756459808539292, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34844 + }, + { + "epoch": 0.34845, + "grad_norm": 0.70387563374554, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 34845 + }, + { + "epoch": 0.34846, + "grad_norm": 0.6723979992201538, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 34846 + }, + { + "epoch": 0.34847, + "grad_norm": 0.6980881245470795, + "learning_rate": 0.003, + "loss": 4.02, + "step": 34847 + }, + { + "epoch": 0.34848, + "grad_norm": 0.7065682781466058, + "learning_rate": 0.003, + "loss": 4.036, + "step": 34848 + }, + { + "epoch": 0.34849, + "grad_norm": 0.6722477228598817, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 34849 + }, + { + "epoch": 0.3485, + "grad_norm": 0.8120389736326744, + "learning_rate": 0.003, + "loss": 4.022, + "step": 34850 + }, + { + "epoch": 0.34851, + "grad_norm": 0.9922655798716832, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 34851 + }, + { + "epoch": 0.34852, + "grad_norm": 1.0960085176025909, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 34852 + }, + { + "epoch": 0.34853, + "grad_norm": 0.7180626813455017, + "learning_rate": 0.003, + "loss": 4.026, + "step": 34853 + }, + { + "epoch": 0.34854, + "grad_norm": 0.7275656432158131, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 34854 + }, + { + "epoch": 0.34855, + "grad_norm": 0.8587000035685581, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 34855 + }, + { + "epoch": 0.34856, + "grad_norm": 0.7190721139882555, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 34856 + }, + { + "epoch": 0.34857, + "grad_norm": 0.6380061215932387, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 34857 + }, + { + "epoch": 0.34858, + "grad_norm": 0.6221569752572831, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 34858 + }, + { + "epoch": 0.34859, + "grad_norm": 0.5866312562054267, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 34859 + }, + { + "epoch": 0.3486, + "grad_norm": 0.6087183847920858, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 34860 + }, + { + "epoch": 0.34861, + "grad_norm": 0.6510287843312375, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 34861 + }, + { + "epoch": 0.34862, + "grad_norm": 0.7122961797898909, + "learning_rate": 0.003, + "loss": 4.024, + "step": 34862 + }, + { + "epoch": 0.34863, + "grad_norm": 0.8678660886149889, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 34863 + }, + { + "epoch": 0.34864, + "grad_norm": 1.0452655633991765, + "learning_rate": 0.003, + "loss": 4.059, + "step": 34864 + }, + { + "epoch": 0.34865, + "grad_norm": 1.070535273349963, + "learning_rate": 0.003, + "loss": 4.045, + "step": 34865 + }, + { + "epoch": 0.34866, + "grad_norm": 1.0987204134562238, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 34866 + }, + { + "epoch": 0.34867, + "grad_norm": 1.0194123626621454, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 34867 + }, + { + "epoch": 0.34868, + "grad_norm": 1.1450976335033896, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 34868 + }, + { + "epoch": 0.34869, + "grad_norm": 0.9508407289361762, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 34869 + }, + { + "epoch": 0.3487, + "grad_norm": 0.8773498017567068, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 34870 + }, + { + "epoch": 0.34871, + "grad_norm": 0.7749844636857697, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 34871 + }, + { + "epoch": 0.34872, + "grad_norm": 0.7835684393940977, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 34872 + }, + { + "epoch": 0.34873, + "grad_norm": 0.8181360259539251, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 34873 + }, + { + "epoch": 0.34874, + "grad_norm": 0.8339656126350715, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 34874 + }, + { + "epoch": 0.34875, + "grad_norm": 0.9468911205620493, + "learning_rate": 0.003, + "loss": 4.041, + "step": 34875 + }, + { + "epoch": 0.34876, + "grad_norm": 0.9454886558754316, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 34876 + }, + { + "epoch": 0.34877, + "grad_norm": 0.7671692708072375, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 34877 + }, + { + "epoch": 0.34878, + "grad_norm": 0.7498095452104756, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 34878 + }, + { + "epoch": 0.34879, + "grad_norm": 0.76104444175185, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 34879 + }, + { + "epoch": 0.3488, + "grad_norm": 1.0246773668676816, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 34880 + }, + { + "epoch": 0.34881, + "grad_norm": 1.1608660473743995, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 34881 + }, + { + "epoch": 0.34882, + "grad_norm": 0.798586879776697, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 34882 + }, + { + "epoch": 0.34883, + "grad_norm": 0.8279175815924015, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 34883 + }, + { + "epoch": 0.34884, + "grad_norm": 0.8724400606069904, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 34884 + }, + { + "epoch": 0.34885, + "grad_norm": 0.8295665767374297, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 34885 + }, + { + "epoch": 0.34886, + "grad_norm": 0.7401645214132678, + "learning_rate": 0.003, + "loss": 4.04, + "step": 34886 + }, + { + "epoch": 0.34887, + "grad_norm": 0.7089590289190293, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 34887 + }, + { + "epoch": 0.34888, + "grad_norm": 0.7372603527671039, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 34888 + }, + { + "epoch": 0.34889, + "grad_norm": 0.8359724033807238, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 34889 + }, + { + "epoch": 0.3489, + "grad_norm": 1.0874551319701007, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 34890 + }, + { + "epoch": 0.34891, + "grad_norm": 0.9708819271664375, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 34891 + }, + { + "epoch": 0.34892, + "grad_norm": 0.9126952802270163, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 34892 + }, + { + "epoch": 0.34893, + "grad_norm": 0.7477364867712268, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 34893 + }, + { + "epoch": 0.34894, + "grad_norm": 0.6833933850107405, + "learning_rate": 0.003, + "loss": 4.047, + "step": 34894 + }, + { + "epoch": 0.34895, + "grad_norm": 0.7990856563361497, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 34895 + }, + { + "epoch": 0.34896, + "grad_norm": 0.7311515511974204, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 34896 + }, + { + "epoch": 0.34897, + "grad_norm": 0.7903113160565656, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 34897 + }, + { + "epoch": 0.34898, + "grad_norm": 0.8242391461252985, + "learning_rate": 0.003, + "loss": 4.041, + "step": 34898 + }, + { + "epoch": 0.34899, + "grad_norm": 0.6987967886226235, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 34899 + }, + { + "epoch": 0.349, + "grad_norm": 0.5935222691876121, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 34900 + }, + { + "epoch": 0.34901, + "grad_norm": 0.6263117706365163, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 34901 + }, + { + "epoch": 0.34902, + "grad_norm": 0.6088291114340829, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 34902 + }, + { + "epoch": 0.34903, + "grad_norm": 0.6601848700701307, + "learning_rate": 0.003, + "loss": 4.002, + "step": 34903 + }, + { + "epoch": 0.34904, + "grad_norm": 0.7829483864790328, + "learning_rate": 0.003, + "loss": 4.0019, + "step": 34904 + }, + { + "epoch": 0.34905, + "grad_norm": 1.0071179558286854, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 34905 + }, + { + "epoch": 0.34906, + "grad_norm": 1.321081572535562, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 34906 + }, + { + "epoch": 0.34907, + "grad_norm": 0.7691147844505147, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 34907 + }, + { + "epoch": 0.34908, + "grad_norm": 0.7701817677468953, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 34908 + }, + { + "epoch": 0.34909, + "grad_norm": 0.7320215391687613, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 34909 + }, + { + "epoch": 0.3491, + "grad_norm": 0.7363600511788945, + "learning_rate": 0.003, + "loss": 4.058, + "step": 34910 + }, + { + "epoch": 0.34911, + "grad_norm": 0.7328670737274582, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 34911 + }, + { + "epoch": 0.34912, + "grad_norm": 0.6882149201952159, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 34912 + }, + { + "epoch": 0.34913, + "grad_norm": 0.7130593051324376, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 34913 + }, + { + "epoch": 0.34914, + "grad_norm": 0.7283930664198441, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 34914 + }, + { + "epoch": 0.34915, + "grad_norm": 0.8252341597086985, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 34915 + }, + { + "epoch": 0.34916, + "grad_norm": 1.0672052679839612, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34916 + }, + { + "epoch": 0.34917, + "grad_norm": 1.2547054263067807, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 34917 + }, + { + "epoch": 0.34918, + "grad_norm": 0.6214162633391549, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 34918 + }, + { + "epoch": 0.34919, + "grad_norm": 0.7899906182437352, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 34919 + }, + { + "epoch": 0.3492, + "grad_norm": 1.043777560421063, + "learning_rate": 0.003, + "loss": 4.046, + "step": 34920 + }, + { + "epoch": 0.34921, + "grad_norm": 1.0720965230653403, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 34921 + }, + { + "epoch": 0.34922, + "grad_norm": 0.8601826364418788, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 34922 + }, + { + "epoch": 0.34923, + "grad_norm": 0.9235043692964333, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 34923 + }, + { + "epoch": 0.34924, + "grad_norm": 0.9626484774285056, + "learning_rate": 0.003, + "loss": 4.049, + "step": 34924 + }, + { + "epoch": 0.34925, + "grad_norm": 0.9806554946534723, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 34925 + }, + { + "epoch": 0.34926, + "grad_norm": 1.0473652809570817, + "learning_rate": 0.003, + "loss": 4.0002, + "step": 34926 + }, + { + "epoch": 0.34927, + "grad_norm": 0.8903413268018272, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 34927 + }, + { + "epoch": 0.34928, + "grad_norm": 0.9558844563546772, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 34928 + }, + { + "epoch": 0.34929, + "grad_norm": 0.9889250032092969, + "learning_rate": 0.003, + "loss": 4.035, + "step": 34929 + }, + { + "epoch": 0.3493, + "grad_norm": 0.825405417696535, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 34930 + }, + { + "epoch": 0.34931, + "grad_norm": 0.826282040084365, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 34931 + }, + { + "epoch": 0.34932, + "grad_norm": 0.8087967877963714, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 34932 + }, + { + "epoch": 0.34933, + "grad_norm": 0.8591627044173537, + "learning_rate": 0.003, + "loss": 4.014, + "step": 34933 + }, + { + "epoch": 0.34934, + "grad_norm": 0.8110623709338638, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 34934 + }, + { + "epoch": 0.34935, + "grad_norm": 0.856467720450207, + "learning_rate": 0.003, + "loss": 4.049, + "step": 34935 + }, + { + "epoch": 0.34936, + "grad_norm": 0.8106373327325829, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 34936 + }, + { + "epoch": 0.34937, + "grad_norm": 0.8173737568040644, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 34937 + }, + { + "epoch": 0.34938, + "grad_norm": 0.8439875931995124, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 34938 + }, + { + "epoch": 0.34939, + "grad_norm": 0.908260804305364, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 34939 + }, + { + "epoch": 0.3494, + "grad_norm": 0.9759154782961252, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 34940 + }, + { + "epoch": 0.34941, + "grad_norm": 1.0036865543070173, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 34941 + }, + { + "epoch": 0.34942, + "grad_norm": 1.2358434232587203, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 34942 + }, + { + "epoch": 0.34943, + "grad_norm": 0.866674354132802, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 34943 + }, + { + "epoch": 0.34944, + "grad_norm": 0.7842726358655768, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34944 + }, + { + "epoch": 0.34945, + "grad_norm": 0.8134469222597768, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 34945 + }, + { + "epoch": 0.34946, + "grad_norm": 0.7236051077331772, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 34946 + }, + { + "epoch": 0.34947, + "grad_norm": 0.6664438499428421, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 34947 + }, + { + "epoch": 0.34948, + "grad_norm": 0.7039469222273763, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 34948 + }, + { + "epoch": 0.34949, + "grad_norm": 0.8392810897136651, + "learning_rate": 0.003, + "loss": 4.0036, + "step": 34949 + }, + { + "epoch": 0.3495, + "grad_norm": 1.0181572281318558, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 34950 + }, + { + "epoch": 0.34951, + "grad_norm": 1.103188692030388, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 34951 + }, + { + "epoch": 0.34952, + "grad_norm": 0.723930198452184, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 34952 + }, + { + "epoch": 0.34953, + "grad_norm": 0.6737382053606537, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 34953 + }, + { + "epoch": 0.34954, + "grad_norm": 0.6419348924660938, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 34954 + }, + { + "epoch": 0.34955, + "grad_norm": 0.6441704693685999, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 34955 + }, + { + "epoch": 0.34956, + "grad_norm": 0.7385902097906977, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 34956 + }, + { + "epoch": 0.34957, + "grad_norm": 0.796753005930694, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 34957 + }, + { + "epoch": 0.34958, + "grad_norm": 0.766886692344842, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 34958 + }, + { + "epoch": 0.34959, + "grad_norm": 0.7679551285044743, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 34959 + }, + { + "epoch": 0.3496, + "grad_norm": 0.7362553846525967, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 34960 + }, + { + "epoch": 0.34961, + "grad_norm": 0.709035703445654, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 34961 + }, + { + "epoch": 0.34962, + "grad_norm": 0.6908267010907563, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 34962 + }, + { + "epoch": 0.34963, + "grad_norm": 0.7499825392255908, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 34963 + }, + { + "epoch": 0.34964, + "grad_norm": 0.6418890020898625, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 34964 + }, + { + "epoch": 0.34965, + "grad_norm": 0.6551704798621404, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 34965 + }, + { + "epoch": 0.34966, + "grad_norm": 0.6788889742881529, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 34966 + }, + { + "epoch": 0.34967, + "grad_norm": 0.6532510417139951, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 34967 + }, + { + "epoch": 0.34968, + "grad_norm": 0.5852257473807752, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 34968 + }, + { + "epoch": 0.34969, + "grad_norm": 0.5820421455722822, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 34969 + }, + { + "epoch": 0.3497, + "grad_norm": 0.5476142699187124, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 34970 + }, + { + "epoch": 0.34971, + "grad_norm": 0.5752383288093383, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 34971 + }, + { + "epoch": 0.34972, + "grad_norm": 0.6847070212820777, + "learning_rate": 0.003, + "loss": 3.9992, + "step": 34972 + }, + { + "epoch": 0.34973, + "grad_norm": 0.7854392137693033, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 34973 + }, + { + "epoch": 0.34974, + "grad_norm": 1.053135344836148, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 34974 + }, + { + "epoch": 0.34975, + "grad_norm": 1.3375575311642083, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 34975 + }, + { + "epoch": 0.34976, + "grad_norm": 0.8304729160261032, + "learning_rate": 0.003, + "loss": 3.9996, + "step": 34976 + }, + { + "epoch": 0.34977, + "grad_norm": 0.7550708585476028, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 34977 + }, + { + "epoch": 0.34978, + "grad_norm": 0.7425797556362294, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 34978 + }, + { + "epoch": 0.34979, + "grad_norm": 0.954096013996938, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 34979 + }, + { + "epoch": 0.3498, + "grad_norm": 1.0912874154665941, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 34980 + }, + { + "epoch": 0.34981, + "grad_norm": 0.9734549742054219, + "learning_rate": 0.003, + "loss": 4.036, + "step": 34981 + }, + { + "epoch": 0.34982, + "grad_norm": 1.024006641988347, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 34982 + }, + { + "epoch": 0.34983, + "grad_norm": 0.9228472223504327, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 34983 + }, + { + "epoch": 0.34984, + "grad_norm": 0.9363402346440106, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 34984 + }, + { + "epoch": 0.34985, + "grad_norm": 1.069076097469853, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 34985 + }, + { + "epoch": 0.34986, + "grad_norm": 1.0387048086800195, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 34986 + }, + { + "epoch": 0.34987, + "grad_norm": 1.0044865544482706, + "learning_rate": 0.003, + "loss": 4.019, + "step": 34987 + }, + { + "epoch": 0.34988, + "grad_norm": 1.2686727085051517, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 34988 + }, + { + "epoch": 0.34989, + "grad_norm": 0.9355598162575778, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 34989 + }, + { + "epoch": 0.3499, + "grad_norm": 0.8710304163430269, + "learning_rate": 0.003, + "loss": 4.079, + "step": 34990 + }, + { + "epoch": 0.34991, + "grad_norm": 0.8257121968182667, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 34991 + }, + { + "epoch": 0.34992, + "grad_norm": 0.8396003288220003, + "learning_rate": 0.003, + "loss": 4.0852, + "step": 34992 + }, + { + "epoch": 0.34993, + "grad_norm": 0.8349132604862769, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 34993 + }, + { + "epoch": 0.34994, + "grad_norm": 0.9069760332043583, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 34994 + }, + { + "epoch": 0.34995, + "grad_norm": 0.8696612831725559, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 34995 + }, + { + "epoch": 0.34996, + "grad_norm": 0.9211346313192956, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 34996 + }, + { + "epoch": 0.34997, + "grad_norm": 0.99567284058509, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 34997 + }, + { + "epoch": 0.34998, + "grad_norm": 1.1283076331874475, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 34998 + }, + { + "epoch": 0.34999, + "grad_norm": 0.8725202183638248, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 34999 + }, + { + "epoch": 0.35, + "grad_norm": 0.7830043897907385, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 35000 + }, + { + "epoch": 0.35001, + "grad_norm": 0.7517297792424326, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 35001 + }, + { + "epoch": 0.35002, + "grad_norm": 0.7792110701093973, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 35002 + }, + { + "epoch": 0.35003, + "grad_norm": 0.7301755495280043, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 35003 + }, + { + "epoch": 0.35004, + "grad_norm": 0.7814715615626235, + "learning_rate": 0.003, + "loss": 4.047, + "step": 35004 + }, + { + "epoch": 0.35005, + "grad_norm": 0.9511098641968765, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 35005 + }, + { + "epoch": 0.35006, + "grad_norm": 1.1690917819110918, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 35006 + }, + { + "epoch": 0.35007, + "grad_norm": 0.9408308480293311, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 35007 + }, + { + "epoch": 0.35008, + "grad_norm": 1.037018529443783, + "learning_rate": 0.003, + "loss": 4.0789, + "step": 35008 + }, + { + "epoch": 0.35009, + "grad_norm": 0.8957172442389573, + "learning_rate": 0.003, + "loss": 4.018, + "step": 35009 + }, + { + "epoch": 0.3501, + "grad_norm": 0.8415008616100952, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 35010 + }, + { + "epoch": 0.35011, + "grad_norm": 0.7150578389441221, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 35011 + }, + { + "epoch": 0.35012, + "grad_norm": 0.6340238671011771, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 35012 + }, + { + "epoch": 0.35013, + "grad_norm": 0.6287837210175218, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 35013 + }, + { + "epoch": 0.35014, + "grad_norm": 0.7477959421676172, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 35014 + }, + { + "epoch": 0.35015, + "grad_norm": 0.6712303300886862, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 35015 + }, + { + "epoch": 0.35016, + "grad_norm": 0.6518584338173726, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 35016 + }, + { + "epoch": 0.35017, + "grad_norm": 0.632700754663963, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 35017 + }, + { + "epoch": 0.35018, + "grad_norm": 0.6495640951944327, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 35018 + }, + { + "epoch": 0.35019, + "grad_norm": 0.7376806819845483, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 35019 + }, + { + "epoch": 0.3502, + "grad_norm": 0.8201940541838998, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 35020 + }, + { + "epoch": 0.35021, + "grad_norm": 0.8378929810724172, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 35021 + }, + { + "epoch": 0.35022, + "grad_norm": 0.8383607931304853, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 35022 + }, + { + "epoch": 0.35023, + "grad_norm": 0.8889798925536566, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 35023 + }, + { + "epoch": 0.35024, + "grad_norm": 0.8105888107757856, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 35024 + }, + { + "epoch": 0.35025, + "grad_norm": 0.7387957090083844, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 35025 + }, + { + "epoch": 0.35026, + "grad_norm": 0.8002618347208682, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 35026 + }, + { + "epoch": 0.35027, + "grad_norm": 0.9158858928219336, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35027 + }, + { + "epoch": 0.35028, + "grad_norm": 0.9733923708503148, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 35028 + }, + { + "epoch": 0.35029, + "grad_norm": 1.08049571183365, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35029 + }, + { + "epoch": 0.3503, + "grad_norm": 0.8776048078994035, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 35030 + }, + { + "epoch": 0.35031, + "grad_norm": 0.6797855022436746, + "learning_rate": 0.003, + "loss": 4.021, + "step": 35031 + }, + { + "epoch": 0.35032, + "grad_norm": 0.698300708642211, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 35032 + }, + { + "epoch": 0.35033, + "grad_norm": 0.7624607982630223, + "learning_rate": 0.003, + "loss": 4.02, + "step": 35033 + }, + { + "epoch": 0.35034, + "grad_norm": 0.7375379510344108, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 35034 + }, + { + "epoch": 0.35035, + "grad_norm": 0.7304271560357061, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 35035 + }, + { + "epoch": 0.35036, + "grad_norm": 0.6533157696109166, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 35036 + }, + { + "epoch": 0.35037, + "grad_norm": 0.6090612250027095, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 35037 + }, + { + "epoch": 0.35038, + "grad_norm": 0.5466583554433534, + "learning_rate": 0.003, + "loss": 4.003, + "step": 35038 + }, + { + "epoch": 0.35039, + "grad_norm": 0.6034909368229479, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 35039 + }, + { + "epoch": 0.3504, + "grad_norm": 0.6260186228290786, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 35040 + }, + { + "epoch": 0.35041, + "grad_norm": 0.6275060184941971, + "learning_rate": 0.003, + "loss": 3.9833, + "step": 35041 + }, + { + "epoch": 0.35042, + "grad_norm": 0.6579380967251274, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35042 + }, + { + "epoch": 0.35043, + "grad_norm": 0.7684750543074379, + "learning_rate": 0.003, + "loss": 4.017, + "step": 35043 + }, + { + "epoch": 0.35044, + "grad_norm": 1.0964682454624022, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 35044 + }, + { + "epoch": 0.35045, + "grad_norm": 1.2887835171915043, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 35045 + }, + { + "epoch": 0.35046, + "grad_norm": 0.6590118362285846, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 35046 + }, + { + "epoch": 0.35047, + "grad_norm": 0.6883438285716045, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 35047 + }, + { + "epoch": 0.35048, + "grad_norm": 0.685641787428032, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35048 + }, + { + "epoch": 0.35049, + "grad_norm": 0.7409801216819885, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 35049 + }, + { + "epoch": 0.3505, + "grad_norm": 0.6530823937737744, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 35050 + }, + { + "epoch": 0.35051, + "grad_norm": 0.6887583747543514, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 35051 + }, + { + "epoch": 0.35052, + "grad_norm": 0.7730769030325064, + "learning_rate": 0.003, + "loss": 4.014, + "step": 35052 + }, + { + "epoch": 0.35053, + "grad_norm": 0.9486658117690205, + "learning_rate": 0.003, + "loss": 4.027, + "step": 35053 + }, + { + "epoch": 0.35054, + "grad_norm": 1.1602073327777291, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 35054 + }, + { + "epoch": 0.35055, + "grad_norm": 1.0764205889732579, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 35055 + }, + { + "epoch": 0.35056, + "grad_norm": 1.0017567716364841, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 35056 + }, + { + "epoch": 0.35057, + "grad_norm": 1.0197284361419308, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 35057 + }, + { + "epoch": 0.35058, + "grad_norm": 1.0881178406090495, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 35058 + }, + { + "epoch": 0.35059, + "grad_norm": 0.7943516792366818, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 35059 + }, + { + "epoch": 0.3506, + "grad_norm": 0.7850535021459475, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 35060 + }, + { + "epoch": 0.35061, + "grad_norm": 0.7595985981427459, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 35061 + }, + { + "epoch": 0.35062, + "grad_norm": 0.8174397123323409, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 35062 + }, + { + "epoch": 0.35063, + "grad_norm": 0.8187734647871031, + "learning_rate": 0.003, + "loss": 4.045, + "step": 35063 + }, + { + "epoch": 0.35064, + "grad_norm": 0.8107168183419964, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 35064 + }, + { + "epoch": 0.35065, + "grad_norm": 1.0866783693661772, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 35065 + }, + { + "epoch": 0.35066, + "grad_norm": 1.0593496247202612, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 35066 + }, + { + "epoch": 0.35067, + "grad_norm": 0.8664509996795793, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 35067 + }, + { + "epoch": 0.35068, + "grad_norm": 0.8209897301815572, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35068 + }, + { + "epoch": 0.35069, + "grad_norm": 0.7966027776052522, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 35069 + }, + { + "epoch": 0.3507, + "grad_norm": 0.7688643249370594, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 35070 + }, + { + "epoch": 0.35071, + "grad_norm": 0.8690179250804031, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 35071 + }, + { + "epoch": 0.35072, + "grad_norm": 0.9819582714983514, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 35072 + }, + { + "epoch": 0.35073, + "grad_norm": 1.2962567416010777, + "learning_rate": 0.003, + "loss": 4.0883, + "step": 35073 + }, + { + "epoch": 0.35074, + "grad_norm": 0.8910626013633564, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 35074 + }, + { + "epoch": 0.35075, + "grad_norm": 1.000108646013681, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 35075 + }, + { + "epoch": 0.35076, + "grad_norm": 1.1141919912715788, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 35076 + }, + { + "epoch": 0.35077, + "grad_norm": 0.900985244109463, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 35077 + }, + { + "epoch": 0.35078, + "grad_norm": 0.8776270982778284, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 35078 + }, + { + "epoch": 0.35079, + "grad_norm": 0.8773465509027232, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 35079 + }, + { + "epoch": 0.3508, + "grad_norm": 0.9806824485119736, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 35080 + }, + { + "epoch": 0.35081, + "grad_norm": 0.9835573241960879, + "learning_rate": 0.003, + "loss": 4.053, + "step": 35081 + }, + { + "epoch": 0.35082, + "grad_norm": 0.828644375756164, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 35082 + }, + { + "epoch": 0.35083, + "grad_norm": 0.758955707960909, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35083 + }, + { + "epoch": 0.35084, + "grad_norm": 0.8164085080549616, + "learning_rate": 0.003, + "loss": 4.029, + "step": 35084 + }, + { + "epoch": 0.35085, + "grad_norm": 0.7387917598679636, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 35085 + }, + { + "epoch": 0.35086, + "grad_norm": 0.6774982655073124, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35086 + }, + { + "epoch": 0.35087, + "grad_norm": 0.6800619600513751, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 35087 + }, + { + "epoch": 0.35088, + "grad_norm": 0.7633018001080065, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 35088 + }, + { + "epoch": 0.35089, + "grad_norm": 0.9152842223309234, + "learning_rate": 0.003, + "loss": 4.0, + "step": 35089 + }, + { + "epoch": 0.3509, + "grad_norm": 0.9418514742602236, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 35090 + }, + { + "epoch": 0.35091, + "grad_norm": 0.9272776545326409, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 35091 + }, + { + "epoch": 0.35092, + "grad_norm": 0.8386504573892916, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 35092 + }, + { + "epoch": 0.35093, + "grad_norm": 0.8622652916646232, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 35093 + }, + { + "epoch": 0.35094, + "grad_norm": 0.7951835212803332, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 35094 + }, + { + "epoch": 0.35095, + "grad_norm": 0.7208987612282648, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 35095 + }, + { + "epoch": 0.35096, + "grad_norm": 0.6909268608991546, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 35096 + }, + { + "epoch": 0.35097, + "grad_norm": 0.6769278636746499, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 35097 + }, + { + "epoch": 0.35098, + "grad_norm": 0.7222917515626588, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 35098 + }, + { + "epoch": 0.35099, + "grad_norm": 0.750258209282132, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 35099 + }, + { + "epoch": 0.351, + "grad_norm": 0.7777574255951669, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 35100 + }, + { + "epoch": 0.35101, + "grad_norm": 0.7516913644886785, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 35101 + }, + { + "epoch": 0.35102, + "grad_norm": 0.8272808931777358, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 35102 + }, + { + "epoch": 0.35103, + "grad_norm": 0.9777459626413545, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35103 + }, + { + "epoch": 0.35104, + "grad_norm": 1.0834813747047427, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 35104 + }, + { + "epoch": 0.35105, + "grad_norm": 1.1217254029542798, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 35105 + }, + { + "epoch": 0.35106, + "grad_norm": 1.0797312968152044, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 35106 + }, + { + "epoch": 0.35107, + "grad_norm": 0.9130053465551803, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 35107 + }, + { + "epoch": 0.35108, + "grad_norm": 0.8438150857106466, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 35108 + }, + { + "epoch": 0.35109, + "grad_norm": 0.8121404063186625, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 35109 + }, + { + "epoch": 0.3511, + "grad_norm": 0.8211915357299913, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 35110 + }, + { + "epoch": 0.35111, + "grad_norm": 0.9434591928372962, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 35111 + }, + { + "epoch": 0.35112, + "grad_norm": 1.025832081480744, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 35112 + }, + { + "epoch": 0.35113, + "grad_norm": 1.029549871123057, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 35113 + }, + { + "epoch": 0.35114, + "grad_norm": 0.982914146580783, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 35114 + }, + { + "epoch": 0.35115, + "grad_norm": 1.027941043374991, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 35115 + }, + { + "epoch": 0.35116, + "grad_norm": 0.9094824497739847, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 35116 + }, + { + "epoch": 0.35117, + "grad_norm": 0.8413092581113586, + "learning_rate": 0.003, + "loss": 4.034, + "step": 35117 + }, + { + "epoch": 0.35118, + "grad_norm": 0.8298983664439524, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 35118 + }, + { + "epoch": 0.35119, + "grad_norm": 0.8381640324118718, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 35119 + }, + { + "epoch": 0.3512, + "grad_norm": 0.8486525020668845, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 35120 + }, + { + "epoch": 0.35121, + "grad_norm": 0.7631173268234093, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 35121 + }, + { + "epoch": 0.35122, + "grad_norm": 0.6620480132590333, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 35122 + }, + { + "epoch": 0.35123, + "grad_norm": 0.6955631997999159, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35123 + }, + { + "epoch": 0.35124, + "grad_norm": 0.7301466909232661, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 35124 + }, + { + "epoch": 0.35125, + "grad_norm": 0.794837422485452, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 35125 + }, + { + "epoch": 0.35126, + "grad_norm": 0.8487164461627894, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 35126 + }, + { + "epoch": 0.35127, + "grad_norm": 0.8713276034477562, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 35127 + }, + { + "epoch": 0.35128, + "grad_norm": 0.8857008667795788, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 35128 + }, + { + "epoch": 0.35129, + "grad_norm": 0.9332736617121964, + "learning_rate": 0.003, + "loss": 4.047, + "step": 35129 + }, + { + "epoch": 0.3513, + "grad_norm": 0.8976997090514381, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 35130 + }, + { + "epoch": 0.35131, + "grad_norm": 0.8081145981076185, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 35131 + }, + { + "epoch": 0.35132, + "grad_norm": 0.8129911540637825, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 35132 + }, + { + "epoch": 0.35133, + "grad_norm": 0.7141832848125637, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 35133 + }, + { + "epoch": 0.35134, + "grad_norm": 0.6915840824342027, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 35134 + }, + { + "epoch": 0.35135, + "grad_norm": 0.6257224736512339, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 35135 + }, + { + "epoch": 0.35136, + "grad_norm": 0.6316318545969387, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 35136 + }, + { + "epoch": 0.35137, + "grad_norm": 0.7468413258774057, + "learning_rate": 0.003, + "loss": 3.9997, + "step": 35137 + }, + { + "epoch": 0.35138, + "grad_norm": 0.8461041028950845, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 35138 + }, + { + "epoch": 0.35139, + "grad_norm": 0.772822011711232, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 35139 + }, + { + "epoch": 0.3514, + "grad_norm": 0.7261199507806635, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 35140 + }, + { + "epoch": 0.35141, + "grad_norm": 0.7755111946368917, + "learning_rate": 0.003, + "loss": 4.03, + "step": 35141 + }, + { + "epoch": 0.35142, + "grad_norm": 0.7928812761470443, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 35142 + }, + { + "epoch": 0.35143, + "grad_norm": 0.7675499568856633, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 35143 + }, + { + "epoch": 0.35144, + "grad_norm": 0.7018499770636035, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 35144 + }, + { + "epoch": 0.35145, + "grad_norm": 0.632219657158281, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 35145 + }, + { + "epoch": 0.35146, + "grad_norm": 0.6519725358901763, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 35146 + }, + { + "epoch": 0.35147, + "grad_norm": 0.5824291776649382, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 35147 + }, + { + "epoch": 0.35148, + "grad_norm": 0.5024329999420618, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 35148 + }, + { + "epoch": 0.35149, + "grad_norm": 0.5000654808166667, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 35149 + }, + { + "epoch": 0.3515, + "grad_norm": 0.49174102538565756, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 35150 + }, + { + "epoch": 0.35151, + "grad_norm": 0.5428347939873523, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 35151 + }, + { + "epoch": 0.35152, + "grad_norm": 0.6916883410612155, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 35152 + }, + { + "epoch": 0.35153, + "grad_norm": 0.9568346513416838, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 35153 + }, + { + "epoch": 0.35154, + "grad_norm": 1.4107748345896118, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 35154 + }, + { + "epoch": 0.35155, + "grad_norm": 0.8261708747967849, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 35155 + }, + { + "epoch": 0.35156, + "grad_norm": 0.9293042620697722, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35156 + }, + { + "epoch": 0.35157, + "grad_norm": 0.9172823753215151, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 35157 + }, + { + "epoch": 0.35158, + "grad_norm": 0.9156288865616014, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 35158 + }, + { + "epoch": 0.35159, + "grad_norm": 1.0510766638757187, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 35159 + }, + { + "epoch": 0.3516, + "grad_norm": 1.042796026095851, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35160 + }, + { + "epoch": 0.35161, + "grad_norm": 0.9482540605681067, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 35161 + }, + { + "epoch": 0.35162, + "grad_norm": 1.0875008875195806, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 35162 + }, + { + "epoch": 0.35163, + "grad_norm": 0.9090039867269806, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 35163 + }, + { + "epoch": 0.35164, + "grad_norm": 0.9542706349271162, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 35164 + }, + { + "epoch": 0.35165, + "grad_norm": 0.9428979490833859, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 35165 + }, + { + "epoch": 0.35166, + "grad_norm": 0.7806272058367609, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 35166 + }, + { + "epoch": 0.35167, + "grad_norm": 0.9624486171473935, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35167 + }, + { + "epoch": 0.35168, + "grad_norm": 1.1690748237112876, + "learning_rate": 0.003, + "loss": 4.023, + "step": 35168 + }, + { + "epoch": 0.35169, + "grad_norm": 0.8959193343238345, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 35169 + }, + { + "epoch": 0.3517, + "grad_norm": 0.8881199689791797, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 35170 + }, + { + "epoch": 0.35171, + "grad_norm": 0.9819844887597202, + "learning_rate": 0.003, + "loss": 4.037, + "step": 35171 + }, + { + "epoch": 0.35172, + "grad_norm": 1.0116040716956964, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35172 + }, + { + "epoch": 0.35173, + "grad_norm": 1.0047978538638624, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 35173 + }, + { + "epoch": 0.35174, + "grad_norm": 1.0293123402739492, + "learning_rate": 0.003, + "loss": 4.062, + "step": 35174 + }, + { + "epoch": 0.35175, + "grad_norm": 0.9532082773177747, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 35175 + }, + { + "epoch": 0.35176, + "grad_norm": 0.9746004059549227, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 35176 + }, + { + "epoch": 0.35177, + "grad_norm": 1.1478827556370528, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 35177 + }, + { + "epoch": 0.35178, + "grad_norm": 0.7804840027115564, + "learning_rate": 0.003, + "loss": 4.055, + "step": 35178 + }, + { + "epoch": 0.35179, + "grad_norm": 0.6885827881226108, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 35179 + }, + { + "epoch": 0.3518, + "grad_norm": 0.7504629046010015, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 35180 + }, + { + "epoch": 0.35181, + "grad_norm": 0.8949395930259307, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 35181 + }, + { + "epoch": 0.35182, + "grad_norm": 0.9167871893743135, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 35182 + }, + { + "epoch": 0.35183, + "grad_norm": 0.9746095546970701, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 35183 + }, + { + "epoch": 0.35184, + "grad_norm": 0.9171353972430094, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 35184 + }, + { + "epoch": 0.35185, + "grad_norm": 0.8173292816387988, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 35185 + }, + { + "epoch": 0.35186, + "grad_norm": 0.7918938065462896, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 35186 + }, + { + "epoch": 0.35187, + "grad_norm": 0.9232843381696075, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35187 + }, + { + "epoch": 0.35188, + "grad_norm": 0.9725765778081803, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 35188 + }, + { + "epoch": 0.35189, + "grad_norm": 0.9149842898974258, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 35189 + }, + { + "epoch": 0.3519, + "grad_norm": 0.7346880479937125, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 35190 + }, + { + "epoch": 0.35191, + "grad_norm": 0.7354618072173863, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 35191 + }, + { + "epoch": 0.35192, + "grad_norm": 0.7629810862549169, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 35192 + }, + { + "epoch": 0.35193, + "grad_norm": 0.7084090875544662, + "learning_rate": 0.003, + "loss": 4.061, + "step": 35193 + }, + { + "epoch": 0.35194, + "grad_norm": 0.5936256355618695, + "learning_rate": 0.003, + "loss": 4.041, + "step": 35194 + }, + { + "epoch": 0.35195, + "grad_norm": 0.5188344716348993, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 35195 + }, + { + "epoch": 0.35196, + "grad_norm": 0.4958534130197045, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 35196 + }, + { + "epoch": 0.35197, + "grad_norm": 0.5183729214185196, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 35197 + }, + { + "epoch": 0.35198, + "grad_norm": 0.6085589486641731, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 35198 + }, + { + "epoch": 0.35199, + "grad_norm": 0.6873093747896977, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35199 + }, + { + "epoch": 0.352, + "grad_norm": 0.7017582184330183, + "learning_rate": 0.003, + "loss": 3.9897, + "step": 35200 + }, + { + "epoch": 0.35201, + "grad_norm": 0.6940647347600101, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 35201 + }, + { + "epoch": 0.35202, + "grad_norm": 0.690740739673416, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 35202 + }, + { + "epoch": 0.35203, + "grad_norm": 0.8198806429153129, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 35203 + }, + { + "epoch": 0.35204, + "grad_norm": 0.9105584722622847, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 35204 + }, + { + "epoch": 0.35205, + "grad_norm": 0.9707130283968068, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 35205 + }, + { + "epoch": 0.35206, + "grad_norm": 1.003768943623242, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 35206 + }, + { + "epoch": 0.35207, + "grad_norm": 1.0477587565920572, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 35207 + }, + { + "epoch": 0.35208, + "grad_norm": 0.9233216031467486, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 35208 + }, + { + "epoch": 0.35209, + "grad_norm": 0.8374310343632653, + "learning_rate": 0.003, + "loss": 4.046, + "step": 35209 + }, + { + "epoch": 0.3521, + "grad_norm": 0.8832694738029971, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 35210 + }, + { + "epoch": 0.35211, + "grad_norm": 0.9161110887331092, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 35211 + }, + { + "epoch": 0.35212, + "grad_norm": 0.9890461681376855, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 35212 + }, + { + "epoch": 0.35213, + "grad_norm": 1.0634445456620187, + "learning_rate": 0.003, + "loss": 4.0734, + "step": 35213 + }, + { + "epoch": 0.35214, + "grad_norm": 1.109885219426023, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 35214 + }, + { + "epoch": 0.35215, + "grad_norm": 0.8927602059332673, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 35215 + }, + { + "epoch": 0.35216, + "grad_norm": 0.8657659440127494, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 35216 + }, + { + "epoch": 0.35217, + "grad_norm": 0.847471499035758, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 35217 + }, + { + "epoch": 0.35218, + "grad_norm": 0.8605124597892159, + "learning_rate": 0.003, + "loss": 4.012, + "step": 35218 + }, + { + "epoch": 0.35219, + "grad_norm": 0.9073893163124725, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 35219 + }, + { + "epoch": 0.3522, + "grad_norm": 1.0050593060388826, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35220 + }, + { + "epoch": 0.35221, + "grad_norm": 1.153010504498366, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 35221 + }, + { + "epoch": 0.35222, + "grad_norm": 0.8406918688088859, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 35222 + }, + { + "epoch": 0.35223, + "grad_norm": 0.7636479044122304, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 35223 + }, + { + "epoch": 0.35224, + "grad_norm": 0.6711598585789031, + "learning_rate": 0.003, + "loss": 4.04, + "step": 35224 + }, + { + "epoch": 0.35225, + "grad_norm": 0.6965400809562491, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 35225 + }, + { + "epoch": 0.35226, + "grad_norm": 0.6952603549848736, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 35226 + }, + { + "epoch": 0.35227, + "grad_norm": 0.7726354856016462, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 35227 + }, + { + "epoch": 0.35228, + "grad_norm": 0.7705632516058647, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 35228 + }, + { + "epoch": 0.35229, + "grad_norm": 0.7255883960314718, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35229 + }, + { + "epoch": 0.3523, + "grad_norm": 0.674024550072049, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 35230 + }, + { + "epoch": 0.35231, + "grad_norm": 0.6703878154998932, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 35231 + }, + { + "epoch": 0.35232, + "grad_norm": 0.7727737394254651, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 35232 + }, + { + "epoch": 0.35233, + "grad_norm": 0.9421880346423944, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 35233 + }, + { + "epoch": 0.35234, + "grad_norm": 1.029642520496911, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 35234 + }, + { + "epoch": 0.35235, + "grad_norm": 0.9835474427954992, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 35235 + }, + { + "epoch": 0.35236, + "grad_norm": 1.0152696852541423, + "learning_rate": 0.003, + "loss": 4.026, + "step": 35236 + }, + { + "epoch": 0.35237, + "grad_norm": 0.9668427300175659, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 35237 + }, + { + "epoch": 0.35238, + "grad_norm": 0.9509871512851996, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 35238 + }, + { + "epoch": 0.35239, + "grad_norm": 1.026365856446077, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35239 + }, + { + "epoch": 0.3524, + "grad_norm": 0.9773777733028364, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 35240 + }, + { + "epoch": 0.35241, + "grad_norm": 0.879535097880324, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35241 + }, + { + "epoch": 0.35242, + "grad_norm": 0.713640638616957, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 35242 + }, + { + "epoch": 0.35243, + "grad_norm": 0.7524319737468896, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 35243 + }, + { + "epoch": 0.35244, + "grad_norm": 0.8643963325644647, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 35244 + }, + { + "epoch": 0.35245, + "grad_norm": 0.8381654620170323, + "learning_rate": 0.003, + "loss": 4.071, + "step": 35245 + }, + { + "epoch": 0.35246, + "grad_norm": 0.8452606839376504, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 35246 + }, + { + "epoch": 0.35247, + "grad_norm": 0.6327150633186418, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 35247 + }, + { + "epoch": 0.35248, + "grad_norm": 0.5804175720552595, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 35248 + }, + { + "epoch": 0.35249, + "grad_norm": 0.5289644308204289, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 35249 + }, + { + "epoch": 0.3525, + "grad_norm": 0.5619816666662856, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 35250 + }, + { + "epoch": 0.35251, + "grad_norm": 0.6180242065111174, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 35251 + }, + { + "epoch": 0.35252, + "grad_norm": 0.7450196800734594, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 35252 + }, + { + "epoch": 0.35253, + "grad_norm": 0.9477313461212996, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 35253 + }, + { + "epoch": 0.35254, + "grad_norm": 1.1819879939281142, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 35254 + }, + { + "epoch": 0.35255, + "grad_norm": 0.6607777513171338, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 35255 + }, + { + "epoch": 0.35256, + "grad_norm": 0.5182338901969771, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 35256 + }, + { + "epoch": 0.35257, + "grad_norm": 0.7826713776692046, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35257 + }, + { + "epoch": 0.35258, + "grad_norm": 0.9464866036150354, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 35258 + }, + { + "epoch": 0.35259, + "grad_norm": 0.9698199904881265, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 35259 + }, + { + "epoch": 0.3526, + "grad_norm": 0.953806695998152, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 35260 + }, + { + "epoch": 0.35261, + "grad_norm": 1.0306115448608555, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 35261 + }, + { + "epoch": 0.35262, + "grad_norm": 1.0117067658906804, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 35262 + }, + { + "epoch": 0.35263, + "grad_norm": 1.0153326214395861, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 35263 + }, + { + "epoch": 0.35264, + "grad_norm": 0.9912024881233239, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35264 + }, + { + "epoch": 0.35265, + "grad_norm": 1.0083962864751475, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 35265 + }, + { + "epoch": 0.35266, + "grad_norm": 1.1541331472113723, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 35266 + }, + { + "epoch": 0.35267, + "grad_norm": 0.9140923766461305, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 35267 + }, + { + "epoch": 0.35268, + "grad_norm": 0.7486582971515309, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 35268 + }, + { + "epoch": 0.35269, + "grad_norm": 0.6090931035653594, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 35269 + }, + { + "epoch": 0.3527, + "grad_norm": 0.6005395353085234, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 35270 + }, + { + "epoch": 0.35271, + "grad_norm": 0.7089725678297071, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 35271 + }, + { + "epoch": 0.35272, + "grad_norm": 0.6582917452391771, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 35272 + }, + { + "epoch": 0.35273, + "grad_norm": 0.6297966682698514, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 35273 + }, + { + "epoch": 0.35274, + "grad_norm": 0.7195598883254993, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 35274 + }, + { + "epoch": 0.35275, + "grad_norm": 0.8399066323297114, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 35275 + }, + { + "epoch": 0.35276, + "grad_norm": 0.7940913993773373, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 35276 + }, + { + "epoch": 0.35277, + "grad_norm": 0.660348767931516, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 35277 + }, + { + "epoch": 0.35278, + "grad_norm": 0.5605881054341464, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 35278 + }, + { + "epoch": 0.35279, + "grad_norm": 0.5878417607232299, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35279 + }, + { + "epoch": 0.3528, + "grad_norm": 0.7528141937469021, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35280 + }, + { + "epoch": 0.35281, + "grad_norm": 0.8652571896198665, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 35281 + }, + { + "epoch": 0.35282, + "grad_norm": 0.9390485229867606, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 35282 + }, + { + "epoch": 0.35283, + "grad_norm": 1.0761150772442634, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 35283 + }, + { + "epoch": 0.35284, + "grad_norm": 0.9711986991788634, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 35284 + }, + { + "epoch": 0.35285, + "grad_norm": 0.9350176493550356, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 35285 + }, + { + "epoch": 0.35286, + "grad_norm": 0.8946115564270194, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 35286 + }, + { + "epoch": 0.35287, + "grad_norm": 0.7554610157216101, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 35287 + }, + { + "epoch": 0.35288, + "grad_norm": 0.7125452595360842, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 35288 + }, + { + "epoch": 0.35289, + "grad_norm": 0.5689732281880509, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 35289 + }, + { + "epoch": 0.3529, + "grad_norm": 0.5408928747346811, + "learning_rate": 0.003, + "loss": 4.021, + "step": 35290 + }, + { + "epoch": 0.35291, + "grad_norm": 0.6068706962545, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 35291 + }, + { + "epoch": 0.35292, + "grad_norm": 0.7454729650156972, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 35292 + }, + { + "epoch": 0.35293, + "grad_norm": 0.9724543930770964, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 35293 + }, + { + "epoch": 0.35294, + "grad_norm": 1.232415292237379, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 35294 + }, + { + "epoch": 0.35295, + "grad_norm": 0.7685965333663676, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 35295 + }, + { + "epoch": 0.35296, + "grad_norm": 0.6873593666600332, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 35296 + }, + { + "epoch": 0.35297, + "grad_norm": 0.6782988605592402, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 35297 + }, + { + "epoch": 0.35298, + "grad_norm": 0.7074295438460946, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 35298 + }, + { + "epoch": 0.35299, + "grad_norm": 0.7315444233805894, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 35299 + }, + { + "epoch": 0.353, + "grad_norm": 0.7762064958766007, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 35300 + }, + { + "epoch": 0.35301, + "grad_norm": 0.8224294025287957, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 35301 + }, + { + "epoch": 0.35302, + "grad_norm": 0.9688558956074059, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 35302 + }, + { + "epoch": 0.35303, + "grad_norm": 1.0780247342951372, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 35303 + }, + { + "epoch": 0.35304, + "grad_norm": 0.8711172263839682, + "learning_rate": 0.003, + "loss": 4.025, + "step": 35304 + }, + { + "epoch": 0.35305, + "grad_norm": 0.8078287486399488, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 35305 + }, + { + "epoch": 0.35306, + "grad_norm": 0.7396013986913598, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 35306 + }, + { + "epoch": 0.35307, + "grad_norm": 0.6686000744893293, + "learning_rate": 0.003, + "loss": 4.054, + "step": 35307 + }, + { + "epoch": 0.35308, + "grad_norm": 0.7225306264438495, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 35308 + }, + { + "epoch": 0.35309, + "grad_norm": 0.6450555661790196, + "learning_rate": 0.003, + "loss": 3.9985, + "step": 35309 + }, + { + "epoch": 0.3531, + "grad_norm": 0.6563703282321344, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 35310 + }, + { + "epoch": 0.35311, + "grad_norm": 0.7057884901969602, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35311 + }, + { + "epoch": 0.35312, + "grad_norm": 0.8101023891292665, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 35312 + }, + { + "epoch": 0.35313, + "grad_norm": 1.0249610509011964, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 35313 + }, + { + "epoch": 0.35314, + "grad_norm": 1.133988599462333, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 35314 + }, + { + "epoch": 0.35315, + "grad_norm": 0.9100342553308453, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 35315 + }, + { + "epoch": 0.35316, + "grad_norm": 0.8690460268490119, + "learning_rate": 0.003, + "loss": 3.9925, + "step": 35316 + }, + { + "epoch": 0.35317, + "grad_norm": 0.861741682447213, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 35317 + }, + { + "epoch": 0.35318, + "grad_norm": 1.0478812593091773, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 35318 + }, + { + "epoch": 0.35319, + "grad_norm": 0.9145304262181804, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 35319 + }, + { + "epoch": 0.3532, + "grad_norm": 0.8776495542421046, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 35320 + }, + { + "epoch": 0.35321, + "grad_norm": 0.836206451867468, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 35321 + }, + { + "epoch": 0.35322, + "grad_norm": 0.8026880283593412, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 35322 + }, + { + "epoch": 0.35323, + "grad_norm": 0.8403358542862727, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 35323 + }, + { + "epoch": 0.35324, + "grad_norm": 0.8755099906895667, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35324 + }, + { + "epoch": 0.35325, + "grad_norm": 0.9190745476207055, + "learning_rate": 0.003, + "loss": 4.0782, + "step": 35325 + }, + { + "epoch": 0.35326, + "grad_norm": 0.8350433822593392, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 35326 + }, + { + "epoch": 0.35327, + "grad_norm": 0.7379402144776968, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 35327 + }, + { + "epoch": 0.35328, + "grad_norm": 0.7169295253046076, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 35328 + }, + { + "epoch": 0.35329, + "grad_norm": 0.8472143100561214, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 35329 + }, + { + "epoch": 0.3533, + "grad_norm": 0.8797433394605029, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 35330 + }, + { + "epoch": 0.35331, + "grad_norm": 0.7804285090975526, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 35331 + }, + { + "epoch": 0.35332, + "grad_norm": 0.7806297996145116, + "learning_rate": 0.003, + "loss": 4.058, + "step": 35332 + }, + { + "epoch": 0.35333, + "grad_norm": 0.8798346598999109, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 35333 + }, + { + "epoch": 0.35334, + "grad_norm": 0.9706168380966738, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 35334 + }, + { + "epoch": 0.35335, + "grad_norm": 1.0347084731161396, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 35335 + }, + { + "epoch": 0.35336, + "grad_norm": 0.8972123458236516, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 35336 + }, + { + "epoch": 0.35337, + "grad_norm": 0.864444232949959, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 35337 + }, + { + "epoch": 0.35338, + "grad_norm": 0.8705508409423729, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 35338 + }, + { + "epoch": 0.35339, + "grad_norm": 0.9696513539574135, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 35339 + }, + { + "epoch": 0.3534, + "grad_norm": 1.064576144927214, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 35340 + }, + { + "epoch": 0.35341, + "grad_norm": 0.9757702989934408, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 35341 + }, + { + "epoch": 0.35342, + "grad_norm": 1.1119081591992588, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 35342 + }, + { + "epoch": 0.35343, + "grad_norm": 0.852659547569621, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 35343 + }, + { + "epoch": 0.35344, + "grad_norm": 0.8834683625706019, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 35344 + }, + { + "epoch": 0.35345, + "grad_norm": 0.756295042970097, + "learning_rate": 0.003, + "loss": 4.074, + "step": 35345 + }, + { + "epoch": 0.35346, + "grad_norm": 0.6875298182348494, + "learning_rate": 0.003, + "loss": 4.023, + "step": 35346 + }, + { + "epoch": 0.35347, + "grad_norm": 0.6839180575795568, + "learning_rate": 0.003, + "loss": 4.055, + "step": 35347 + }, + { + "epoch": 0.35348, + "grad_norm": 0.7869682067176487, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 35348 + }, + { + "epoch": 0.35349, + "grad_norm": 0.9250045873983749, + "learning_rate": 0.003, + "loss": 4.058, + "step": 35349 + }, + { + "epoch": 0.3535, + "grad_norm": 1.020484221466413, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 35350 + }, + { + "epoch": 0.35351, + "grad_norm": 0.9741178258701142, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 35351 + }, + { + "epoch": 0.35352, + "grad_norm": 0.9100336993704421, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35352 + }, + { + "epoch": 0.35353, + "grad_norm": 0.7671341377722294, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 35353 + }, + { + "epoch": 0.35354, + "grad_norm": 0.7556911563786995, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 35354 + }, + { + "epoch": 0.35355, + "grad_norm": 0.7869474495516388, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 35355 + }, + { + "epoch": 0.35356, + "grad_norm": 0.9216353722263187, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 35356 + }, + { + "epoch": 0.35357, + "grad_norm": 0.9309308206545308, + "learning_rate": 0.003, + "loss": 4.046, + "step": 35357 + }, + { + "epoch": 0.35358, + "grad_norm": 0.8854001884721865, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 35358 + }, + { + "epoch": 0.35359, + "grad_norm": 1.0028556364577885, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 35359 + }, + { + "epoch": 0.3536, + "grad_norm": 1.047990839566149, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35360 + }, + { + "epoch": 0.35361, + "grad_norm": 0.9188657549870218, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 35361 + }, + { + "epoch": 0.35362, + "grad_norm": 0.8114004795879127, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35362 + }, + { + "epoch": 0.35363, + "grad_norm": 0.7229253146182816, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 35363 + }, + { + "epoch": 0.35364, + "grad_norm": 0.6568712555270999, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 35364 + }, + { + "epoch": 0.35365, + "grad_norm": 0.6880630045396562, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 35365 + }, + { + "epoch": 0.35366, + "grad_norm": 0.5796398298358054, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35366 + }, + { + "epoch": 0.35367, + "grad_norm": 0.5247208664326352, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 35367 + }, + { + "epoch": 0.35368, + "grad_norm": 0.5663349124674729, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35368 + }, + { + "epoch": 0.35369, + "grad_norm": 0.5400870318266853, + "learning_rate": 0.003, + "loss": 3.985, + "step": 35369 + }, + { + "epoch": 0.3537, + "grad_norm": 0.5247435492319512, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 35370 + }, + { + "epoch": 0.35371, + "grad_norm": 0.5998273815954971, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 35371 + }, + { + "epoch": 0.35372, + "grad_norm": 0.6551432583646265, + "learning_rate": 0.003, + "loss": 4.051, + "step": 35372 + }, + { + "epoch": 0.35373, + "grad_norm": 0.7358287292269177, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 35373 + }, + { + "epoch": 0.35374, + "grad_norm": 0.8983625561306371, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 35374 + }, + { + "epoch": 0.35375, + "grad_norm": 1.0235256693710033, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 35375 + }, + { + "epoch": 0.35376, + "grad_norm": 1.1603737671672083, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 35376 + }, + { + "epoch": 0.35377, + "grad_norm": 0.8645561120081271, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 35377 + }, + { + "epoch": 0.35378, + "grad_norm": 0.8046297129629648, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35378 + }, + { + "epoch": 0.35379, + "grad_norm": 0.8221691424420592, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 35379 + }, + { + "epoch": 0.3538, + "grad_norm": 0.7053458891488756, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 35380 + }, + { + "epoch": 0.35381, + "grad_norm": 0.7948976578657914, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 35381 + }, + { + "epoch": 0.35382, + "grad_norm": 0.7864013298484848, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 35382 + }, + { + "epoch": 0.35383, + "grad_norm": 0.850102623055183, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 35383 + }, + { + "epoch": 0.35384, + "grad_norm": 0.8913609993806474, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 35384 + }, + { + "epoch": 0.35385, + "grad_norm": 0.8348017433982378, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 35385 + }, + { + "epoch": 0.35386, + "grad_norm": 0.714294957069829, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 35386 + }, + { + "epoch": 0.35387, + "grad_norm": 0.7006362427376348, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 35387 + }, + { + "epoch": 0.35388, + "grad_norm": 0.8013106496227456, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 35388 + }, + { + "epoch": 0.35389, + "grad_norm": 0.9018824605009831, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 35389 + }, + { + "epoch": 0.3539, + "grad_norm": 1.0607096500433855, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 35390 + }, + { + "epoch": 0.35391, + "grad_norm": 0.9712076058479823, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 35391 + }, + { + "epoch": 0.35392, + "grad_norm": 0.8850733634871047, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 35392 + }, + { + "epoch": 0.35393, + "grad_norm": 0.9064245592099144, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 35393 + }, + { + "epoch": 0.35394, + "grad_norm": 0.9242585077694726, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 35394 + }, + { + "epoch": 0.35395, + "grad_norm": 0.8876698410376317, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 35395 + }, + { + "epoch": 0.35396, + "grad_norm": 0.8660372536059948, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 35396 + }, + { + "epoch": 0.35397, + "grad_norm": 0.8415891691368266, + "learning_rate": 0.003, + "loss": 4.068, + "step": 35397 + }, + { + "epoch": 0.35398, + "grad_norm": 0.9176796834716632, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 35398 + }, + { + "epoch": 0.35399, + "grad_norm": 1.0698904466940191, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 35399 + }, + { + "epoch": 0.354, + "grad_norm": 1.0383445833283258, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 35400 + }, + { + "epoch": 0.35401, + "grad_norm": 1.1196866007761461, + "learning_rate": 0.003, + "loss": 4.086, + "step": 35401 + }, + { + "epoch": 0.35402, + "grad_norm": 0.8912817293772949, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 35402 + }, + { + "epoch": 0.35403, + "grad_norm": 0.7002773235008907, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 35403 + }, + { + "epoch": 0.35404, + "grad_norm": 0.7646154910017946, + "learning_rate": 0.003, + "loss": 4.062, + "step": 35404 + }, + { + "epoch": 0.35405, + "grad_norm": 0.9005732476823645, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 35405 + }, + { + "epoch": 0.35406, + "grad_norm": 1.2007459869668298, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 35406 + }, + { + "epoch": 0.35407, + "grad_norm": 1.0437292645267002, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35407 + }, + { + "epoch": 0.35408, + "grad_norm": 0.9001446858574974, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 35408 + }, + { + "epoch": 0.35409, + "grad_norm": 0.7628853581995292, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 35409 + }, + { + "epoch": 0.3541, + "grad_norm": 0.7182388758999324, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 35410 + }, + { + "epoch": 0.35411, + "grad_norm": 0.6351385385217057, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 35411 + }, + { + "epoch": 0.35412, + "grad_norm": 0.6617658461659434, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 35412 + }, + { + "epoch": 0.35413, + "grad_norm": 0.7278165094375213, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 35413 + }, + { + "epoch": 0.35414, + "grad_norm": 0.8389160928755836, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35414 + }, + { + "epoch": 0.35415, + "grad_norm": 0.9094257645524942, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 35415 + }, + { + "epoch": 0.35416, + "grad_norm": 0.9686318858392784, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 35416 + }, + { + "epoch": 0.35417, + "grad_norm": 1.061241103914107, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 35417 + }, + { + "epoch": 0.35418, + "grad_norm": 0.8761443973856404, + "learning_rate": 0.003, + "loss": 4.052, + "step": 35418 + }, + { + "epoch": 0.35419, + "grad_norm": 0.7023794223482236, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 35419 + }, + { + "epoch": 0.3542, + "grad_norm": 0.6642286276421315, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 35420 + }, + { + "epoch": 0.35421, + "grad_norm": 0.5739740805687852, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 35421 + }, + { + "epoch": 0.35422, + "grad_norm": 0.5675688254678937, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 35422 + }, + { + "epoch": 0.35423, + "grad_norm": 0.5488874376116479, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 35423 + }, + { + "epoch": 0.35424, + "grad_norm": 0.6186204671715866, + "learning_rate": 0.003, + "loss": 4.0013, + "step": 35424 + }, + { + "epoch": 0.35425, + "grad_norm": 0.6592067784840431, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 35425 + }, + { + "epoch": 0.35426, + "grad_norm": 0.7059352012527849, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 35426 + }, + { + "epoch": 0.35427, + "grad_norm": 0.7869286816287573, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 35427 + }, + { + "epoch": 0.35428, + "grad_norm": 0.9689694880785674, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 35428 + }, + { + "epoch": 0.35429, + "grad_norm": 0.9817857113715803, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 35429 + }, + { + "epoch": 0.3543, + "grad_norm": 0.7651993872981392, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 35430 + }, + { + "epoch": 0.35431, + "grad_norm": 0.7099705037031928, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35431 + }, + { + "epoch": 0.35432, + "grad_norm": 0.7017381941545233, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 35432 + }, + { + "epoch": 0.35433, + "grad_norm": 0.581192715987244, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 35433 + }, + { + "epoch": 0.35434, + "grad_norm": 0.6008515088269819, + "learning_rate": 0.003, + "loss": 4.004, + "step": 35434 + }, + { + "epoch": 0.35435, + "grad_norm": 0.7137652236081956, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 35435 + }, + { + "epoch": 0.35436, + "grad_norm": 0.8741253260856033, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 35436 + }, + { + "epoch": 0.35437, + "grad_norm": 0.9158389873368245, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 35437 + }, + { + "epoch": 0.35438, + "grad_norm": 0.9182323811968163, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35438 + }, + { + "epoch": 0.35439, + "grad_norm": 0.9869966499017926, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 35439 + }, + { + "epoch": 0.3544, + "grad_norm": 1.124228761670888, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 35440 + }, + { + "epoch": 0.35441, + "grad_norm": 0.8920640898219491, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 35441 + }, + { + "epoch": 0.35442, + "grad_norm": 0.8502617671458848, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 35442 + }, + { + "epoch": 0.35443, + "grad_norm": 0.8404157469406964, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35443 + }, + { + "epoch": 0.35444, + "grad_norm": 0.8460198528199361, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 35444 + }, + { + "epoch": 0.35445, + "grad_norm": 0.7754404782796849, + "learning_rate": 0.003, + "loss": 4.0783, + "step": 35445 + }, + { + "epoch": 0.35446, + "grad_norm": 0.8554021724472471, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 35446 + }, + { + "epoch": 0.35447, + "grad_norm": 0.9123742963858626, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 35447 + }, + { + "epoch": 0.35448, + "grad_norm": 0.7759855308659531, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 35448 + }, + { + "epoch": 0.35449, + "grad_norm": 0.8195627593124858, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 35449 + }, + { + "epoch": 0.3545, + "grad_norm": 0.8370337435525346, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 35450 + }, + { + "epoch": 0.35451, + "grad_norm": 0.8780373491212887, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 35451 + }, + { + "epoch": 0.35452, + "grad_norm": 0.9662638733666206, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 35452 + }, + { + "epoch": 0.35453, + "grad_norm": 0.9551058221115036, + "learning_rate": 0.003, + "loss": 4.066, + "step": 35453 + }, + { + "epoch": 0.35454, + "grad_norm": 0.9329223391564759, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 35454 + }, + { + "epoch": 0.35455, + "grad_norm": 0.8095055469220586, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 35455 + }, + { + "epoch": 0.35456, + "grad_norm": 0.7525264242342562, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 35456 + }, + { + "epoch": 0.35457, + "grad_norm": 0.7446229595470634, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 35457 + }, + { + "epoch": 0.35458, + "grad_norm": 0.7606209693406574, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 35458 + }, + { + "epoch": 0.35459, + "grad_norm": 0.8992902480038396, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35459 + }, + { + "epoch": 0.3546, + "grad_norm": 1.1703342294162382, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 35460 + }, + { + "epoch": 0.35461, + "grad_norm": 0.9816325070035651, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 35461 + }, + { + "epoch": 0.35462, + "grad_norm": 1.0079624051287999, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 35462 + }, + { + "epoch": 0.35463, + "grad_norm": 1.0551909022970964, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 35463 + }, + { + "epoch": 0.35464, + "grad_norm": 1.013437883306526, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 35464 + }, + { + "epoch": 0.35465, + "grad_norm": 0.9240021234396032, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 35465 + }, + { + "epoch": 0.35466, + "grad_norm": 0.7804810265110531, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 35466 + }, + { + "epoch": 0.35467, + "grad_norm": 0.6922883824077382, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 35467 + }, + { + "epoch": 0.35468, + "grad_norm": 0.7730069910482615, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 35468 + }, + { + "epoch": 0.35469, + "grad_norm": 0.8012463096266607, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 35469 + }, + { + "epoch": 0.3547, + "grad_norm": 0.9036135282892178, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 35470 + }, + { + "epoch": 0.35471, + "grad_norm": 1.0024960086696617, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 35471 + }, + { + "epoch": 0.35472, + "grad_norm": 1.246108237269644, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 35472 + }, + { + "epoch": 0.35473, + "grad_norm": 0.6871332843035695, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 35473 + }, + { + "epoch": 0.35474, + "grad_norm": 0.5429466243643678, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 35474 + }, + { + "epoch": 0.35475, + "grad_norm": 0.7004175000602282, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 35475 + }, + { + "epoch": 0.35476, + "grad_norm": 0.7588862515395214, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 35476 + }, + { + "epoch": 0.35477, + "grad_norm": 0.8013748202508281, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 35477 + }, + { + "epoch": 0.35478, + "grad_norm": 0.7686109613683162, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 35478 + }, + { + "epoch": 0.35479, + "grad_norm": 0.7889566830240237, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 35479 + }, + { + "epoch": 0.3548, + "grad_norm": 0.8221127693913399, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35480 + }, + { + "epoch": 0.35481, + "grad_norm": 0.70090166983967, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 35481 + }, + { + "epoch": 0.35482, + "grad_norm": 0.6113639835146235, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 35482 + }, + { + "epoch": 0.35483, + "grad_norm": 0.5969069035492758, + "learning_rate": 0.003, + "loss": 4.019, + "step": 35483 + }, + { + "epoch": 0.35484, + "grad_norm": 0.6436367649700976, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 35484 + }, + { + "epoch": 0.35485, + "grad_norm": 0.6567738727394041, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 35485 + }, + { + "epoch": 0.35486, + "grad_norm": 0.6655943040058273, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 35486 + }, + { + "epoch": 0.35487, + "grad_norm": 0.5996525797539428, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 35487 + }, + { + "epoch": 0.35488, + "grad_norm": 0.5466117844547181, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 35488 + }, + { + "epoch": 0.35489, + "grad_norm": 0.5212695599076461, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 35489 + }, + { + "epoch": 0.3549, + "grad_norm": 0.5812236963965408, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 35490 + }, + { + "epoch": 0.35491, + "grad_norm": 0.5680105500200279, + "learning_rate": 0.003, + "loss": 3.9829, + "step": 35491 + }, + { + "epoch": 0.35492, + "grad_norm": 0.5819739732184216, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 35492 + }, + { + "epoch": 0.35493, + "grad_norm": 0.6366794945282233, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 35493 + }, + { + "epoch": 0.35494, + "grad_norm": 0.8082036486306512, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 35494 + }, + { + "epoch": 0.35495, + "grad_norm": 1.0810733124370882, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 35495 + }, + { + "epoch": 0.35496, + "grad_norm": 1.1864534428628468, + "learning_rate": 0.003, + "loss": 3.9896, + "step": 35496 + }, + { + "epoch": 0.35497, + "grad_norm": 0.7539216179462753, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 35497 + }, + { + "epoch": 0.35498, + "grad_norm": 0.7157850564539804, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 35498 + }, + { + "epoch": 0.35499, + "grad_norm": 0.7906746239712356, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 35499 + }, + { + "epoch": 0.355, + "grad_norm": 0.8254569531211231, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 35500 + }, + { + "epoch": 0.35501, + "grad_norm": 0.8228127910814118, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35501 + }, + { + "epoch": 0.35502, + "grad_norm": 0.8677643424650084, + "learning_rate": 0.003, + "loss": 4.025, + "step": 35502 + }, + { + "epoch": 0.35503, + "grad_norm": 0.9274158072269734, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 35503 + }, + { + "epoch": 0.35504, + "grad_norm": 1.0617704511569044, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 35504 + }, + { + "epoch": 0.35505, + "grad_norm": 0.9485630512402317, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 35505 + }, + { + "epoch": 0.35506, + "grad_norm": 0.8931346402986252, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 35506 + }, + { + "epoch": 0.35507, + "grad_norm": 0.8029627701583673, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 35507 + }, + { + "epoch": 0.35508, + "grad_norm": 0.8218367352625724, + "learning_rate": 0.003, + "loss": 4.029, + "step": 35508 + }, + { + "epoch": 0.35509, + "grad_norm": 0.7794129345318784, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 35509 + }, + { + "epoch": 0.3551, + "grad_norm": 0.8999416421444704, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 35510 + }, + { + "epoch": 0.35511, + "grad_norm": 1.1819951273386162, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 35511 + }, + { + "epoch": 0.35512, + "grad_norm": 1.1454617936174853, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 35512 + }, + { + "epoch": 0.35513, + "grad_norm": 0.8841410400493929, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35513 + }, + { + "epoch": 0.35514, + "grad_norm": 0.9630639338382225, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 35514 + }, + { + "epoch": 0.35515, + "grad_norm": 1.0641706614949202, + "learning_rate": 0.003, + "loss": 4.055, + "step": 35515 + }, + { + "epoch": 0.35516, + "grad_norm": 0.8779166440122825, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 35516 + }, + { + "epoch": 0.35517, + "grad_norm": 0.8342835931354293, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 35517 + }, + { + "epoch": 0.35518, + "grad_norm": 0.7217692896391702, + "learning_rate": 0.003, + "loss": 4.047, + "step": 35518 + }, + { + "epoch": 0.35519, + "grad_norm": 0.8128729445412435, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 35519 + }, + { + "epoch": 0.3552, + "grad_norm": 0.8831018072071011, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35520 + }, + { + "epoch": 0.35521, + "grad_norm": 1.0273283220439666, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 35521 + }, + { + "epoch": 0.35522, + "grad_norm": 1.0414509910057848, + "learning_rate": 0.003, + "loss": 4.0832, + "step": 35522 + }, + { + "epoch": 0.35523, + "grad_norm": 1.0727947371051423, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 35523 + }, + { + "epoch": 0.35524, + "grad_norm": 0.7932233754886141, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 35524 + }, + { + "epoch": 0.35525, + "grad_norm": 0.769114435020755, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 35525 + }, + { + "epoch": 0.35526, + "grad_norm": 0.7468679725824244, + "learning_rate": 0.003, + "loss": 4.067, + "step": 35526 + }, + { + "epoch": 0.35527, + "grad_norm": 0.7833049177634545, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 35527 + }, + { + "epoch": 0.35528, + "grad_norm": 0.8355066727841511, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 35528 + }, + { + "epoch": 0.35529, + "grad_norm": 0.7579379380962419, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 35529 + }, + { + "epoch": 0.3553, + "grad_norm": 0.6645146848088115, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 35530 + }, + { + "epoch": 0.35531, + "grad_norm": 0.6167020852726559, + "learning_rate": 0.003, + "loss": 4.03, + "step": 35531 + }, + { + "epoch": 0.35532, + "grad_norm": 0.6518658342172913, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 35532 + }, + { + "epoch": 0.35533, + "grad_norm": 0.6875152442508491, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 35533 + }, + { + "epoch": 0.35534, + "grad_norm": 0.7086514968768225, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 35534 + }, + { + "epoch": 0.35535, + "grad_norm": 0.7669555376707017, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 35535 + }, + { + "epoch": 0.35536, + "grad_norm": 0.8515685693199104, + "learning_rate": 0.003, + "loss": 4.001, + "step": 35536 + }, + { + "epoch": 0.35537, + "grad_norm": 0.9856250130191145, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 35537 + }, + { + "epoch": 0.35538, + "grad_norm": 1.2720365034123322, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 35538 + }, + { + "epoch": 0.35539, + "grad_norm": 0.7709964096162728, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 35539 + }, + { + "epoch": 0.3554, + "grad_norm": 0.6117648240171862, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 35540 + }, + { + "epoch": 0.35541, + "grad_norm": 0.6265897285479147, + "learning_rate": 0.003, + "loss": 4.0039, + "step": 35541 + }, + { + "epoch": 0.35542, + "grad_norm": 0.7278098155207409, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 35542 + }, + { + "epoch": 0.35543, + "grad_norm": 0.8467449762739124, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 35543 + }, + { + "epoch": 0.35544, + "grad_norm": 0.9980062798865201, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 35544 + }, + { + "epoch": 0.35545, + "grad_norm": 1.0530627611853915, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 35545 + }, + { + "epoch": 0.35546, + "grad_norm": 1.0257438471950722, + "learning_rate": 0.003, + "loss": 4.041, + "step": 35546 + }, + { + "epoch": 0.35547, + "grad_norm": 1.177106759770191, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 35547 + }, + { + "epoch": 0.35548, + "grad_norm": 0.8847480801199755, + "learning_rate": 0.003, + "loss": 4.081, + "step": 35548 + }, + { + "epoch": 0.35549, + "grad_norm": 0.7156875765719646, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 35549 + }, + { + "epoch": 0.3555, + "grad_norm": 0.6587318538034197, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 35550 + }, + { + "epoch": 0.35551, + "grad_norm": 0.6761456268385696, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 35551 + }, + { + "epoch": 0.35552, + "grad_norm": 0.8097477941853418, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 35552 + }, + { + "epoch": 0.35553, + "grad_norm": 0.9704542896312899, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 35553 + }, + { + "epoch": 0.35554, + "grad_norm": 0.9406978980735771, + "learning_rate": 0.003, + "loss": 4.031, + "step": 35554 + }, + { + "epoch": 0.35555, + "grad_norm": 0.8809474996016134, + "learning_rate": 0.003, + "loss": 4.034, + "step": 35555 + }, + { + "epoch": 0.35556, + "grad_norm": 0.7963865416393352, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 35556 + }, + { + "epoch": 0.35557, + "grad_norm": 0.7242247163165031, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 35557 + }, + { + "epoch": 0.35558, + "grad_norm": 0.7328387720765082, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 35558 + }, + { + "epoch": 0.35559, + "grad_norm": 0.8243266882935657, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 35559 + }, + { + "epoch": 0.3556, + "grad_norm": 0.8723530599326395, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 35560 + }, + { + "epoch": 0.35561, + "grad_norm": 1.0566770957594827, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 35561 + }, + { + "epoch": 0.35562, + "grad_norm": 1.028254968726956, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 35562 + }, + { + "epoch": 0.35563, + "grad_norm": 0.9844385949474204, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 35563 + }, + { + "epoch": 0.35564, + "grad_norm": 0.9525308524027485, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 35564 + }, + { + "epoch": 0.35565, + "grad_norm": 0.8997534654437397, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 35565 + }, + { + "epoch": 0.35566, + "grad_norm": 0.8603629761680712, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 35566 + }, + { + "epoch": 0.35567, + "grad_norm": 0.8993786928019031, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 35567 + }, + { + "epoch": 0.35568, + "grad_norm": 0.9832693710122777, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 35568 + }, + { + "epoch": 0.35569, + "grad_norm": 0.8835962914608959, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 35569 + }, + { + "epoch": 0.3557, + "grad_norm": 0.8106325261137978, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 35570 + }, + { + "epoch": 0.35571, + "grad_norm": 0.7680980036144063, + "learning_rate": 0.003, + "loss": 4.0806, + "step": 35571 + }, + { + "epoch": 0.35572, + "grad_norm": 0.8255545515300683, + "learning_rate": 0.003, + "loss": 4.027, + "step": 35572 + }, + { + "epoch": 0.35573, + "grad_norm": 0.8040403273344309, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 35573 + }, + { + "epoch": 0.35574, + "grad_norm": 0.8710634859590739, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 35574 + }, + { + "epoch": 0.35575, + "grad_norm": 0.8826007700576655, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 35575 + }, + { + "epoch": 0.35576, + "grad_norm": 0.8774583482108382, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 35576 + }, + { + "epoch": 0.35577, + "grad_norm": 0.8854128775302702, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 35577 + }, + { + "epoch": 0.35578, + "grad_norm": 0.9199942423913751, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 35578 + }, + { + "epoch": 0.35579, + "grad_norm": 0.9598962740676245, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 35579 + }, + { + "epoch": 0.3558, + "grad_norm": 1.0010455071077455, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 35580 + }, + { + "epoch": 0.35581, + "grad_norm": 0.9870978796097563, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 35581 + }, + { + "epoch": 0.35582, + "grad_norm": 0.9452054621743715, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 35582 + }, + { + "epoch": 0.35583, + "grad_norm": 0.7799836923257747, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 35583 + }, + { + "epoch": 0.35584, + "grad_norm": 0.6823097994412459, + "learning_rate": 0.003, + "loss": 4.037, + "step": 35584 + }, + { + "epoch": 0.35585, + "grad_norm": 0.6329547812475629, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 35585 + }, + { + "epoch": 0.35586, + "grad_norm": 0.6059535786109996, + "learning_rate": 0.003, + "loss": 3.994, + "step": 35586 + }, + { + "epoch": 0.35587, + "grad_norm": 0.6139006554217994, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 35587 + }, + { + "epoch": 0.35588, + "grad_norm": 0.5892930525779317, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 35588 + }, + { + "epoch": 0.35589, + "grad_norm": 0.6463888956744265, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 35589 + }, + { + "epoch": 0.3559, + "grad_norm": 0.7327634667561984, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 35590 + }, + { + "epoch": 0.35591, + "grad_norm": 0.8906705874850709, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 35591 + }, + { + "epoch": 0.35592, + "grad_norm": 1.0079326815533545, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 35592 + }, + { + "epoch": 0.35593, + "grad_norm": 0.9996379490059354, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 35593 + }, + { + "epoch": 0.35594, + "grad_norm": 0.8104297155238523, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 35594 + }, + { + "epoch": 0.35595, + "grad_norm": 0.7245955273114536, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 35595 + }, + { + "epoch": 0.35596, + "grad_norm": 0.8071081999913456, + "learning_rate": 0.003, + "loss": 4.011, + "step": 35596 + }, + { + "epoch": 0.35597, + "grad_norm": 0.8264448881034141, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 35597 + }, + { + "epoch": 0.35598, + "grad_norm": 0.8959183103735588, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 35598 + }, + { + "epoch": 0.35599, + "grad_norm": 0.8856612560928192, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 35599 + }, + { + "epoch": 0.356, + "grad_norm": 0.8213477277748193, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 35600 + }, + { + "epoch": 0.35601, + "grad_norm": 0.6854831352161981, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35601 + }, + { + "epoch": 0.35602, + "grad_norm": 0.6734393723793403, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 35602 + }, + { + "epoch": 0.35603, + "grad_norm": 0.8434871330075129, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 35603 + }, + { + "epoch": 0.35604, + "grad_norm": 1.0066102226087577, + "learning_rate": 0.003, + "loss": 4.065, + "step": 35604 + }, + { + "epoch": 0.35605, + "grad_norm": 1.024795617466296, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 35605 + }, + { + "epoch": 0.35606, + "grad_norm": 0.8527908522013521, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 35606 + }, + { + "epoch": 0.35607, + "grad_norm": 0.776680642092252, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 35607 + }, + { + "epoch": 0.35608, + "grad_norm": 0.8076314946283357, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 35608 + }, + { + "epoch": 0.35609, + "grad_norm": 0.8409965670546519, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 35609 + }, + { + "epoch": 0.3561, + "grad_norm": 0.751464434316504, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 35610 + }, + { + "epoch": 0.35611, + "grad_norm": 0.6950080605499266, + "learning_rate": 0.003, + "loss": 4.034, + "step": 35611 + }, + { + "epoch": 0.35612, + "grad_norm": 0.6465073485550279, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 35612 + }, + { + "epoch": 0.35613, + "grad_norm": 0.7199595671038781, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 35613 + }, + { + "epoch": 0.35614, + "grad_norm": 0.7937611986605759, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 35614 + }, + { + "epoch": 0.35615, + "grad_norm": 0.8366000998606218, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 35615 + }, + { + "epoch": 0.35616, + "grad_norm": 0.8172045351919865, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 35616 + }, + { + "epoch": 0.35617, + "grad_norm": 0.89973338525607, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 35617 + }, + { + "epoch": 0.35618, + "grad_norm": 0.8788400233400065, + "learning_rate": 0.003, + "loss": 4.043, + "step": 35618 + }, + { + "epoch": 0.35619, + "grad_norm": 0.8674971785907195, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 35619 + }, + { + "epoch": 0.3562, + "grad_norm": 0.8506353834252806, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 35620 + }, + { + "epoch": 0.35621, + "grad_norm": 0.9242726208705895, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 35621 + }, + { + "epoch": 0.35622, + "grad_norm": 0.8950835402909704, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 35622 + }, + { + "epoch": 0.35623, + "grad_norm": 0.7404139085725188, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 35623 + }, + { + "epoch": 0.35624, + "grad_norm": 0.7900457853163831, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 35624 + }, + { + "epoch": 0.35625, + "grad_norm": 0.9475956628830594, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 35625 + }, + { + "epoch": 0.35626, + "grad_norm": 1.2848803834395723, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 35626 + }, + { + "epoch": 0.35627, + "grad_norm": 0.9793321975608914, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 35627 + }, + { + "epoch": 0.35628, + "grad_norm": 0.9363119161820275, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 35628 + }, + { + "epoch": 0.35629, + "grad_norm": 0.8710769346246292, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 35629 + }, + { + "epoch": 0.3563, + "grad_norm": 0.8535843830255943, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 35630 + }, + { + "epoch": 0.35631, + "grad_norm": 0.6993579227361221, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 35631 + }, + { + "epoch": 0.35632, + "grad_norm": 0.6895740148125786, + "learning_rate": 0.003, + "loss": 3.9861, + "step": 35632 + }, + { + "epoch": 0.35633, + "grad_norm": 0.7347109266107757, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 35633 + }, + { + "epoch": 0.35634, + "grad_norm": 0.8182167986920449, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 35634 + }, + { + "epoch": 0.35635, + "grad_norm": 0.8952372440982915, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35635 + }, + { + "epoch": 0.35636, + "grad_norm": 1.1045496111115596, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 35636 + }, + { + "epoch": 0.35637, + "grad_norm": 1.0206717318220273, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 35637 + }, + { + "epoch": 0.35638, + "grad_norm": 1.117655261302622, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 35638 + }, + { + "epoch": 0.35639, + "grad_norm": 0.8060628125253311, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 35639 + }, + { + "epoch": 0.3564, + "grad_norm": 0.6668476328683659, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 35640 + }, + { + "epoch": 0.35641, + "grad_norm": 0.5854658827694242, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35641 + }, + { + "epoch": 0.35642, + "grad_norm": 0.5720785686479769, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 35642 + }, + { + "epoch": 0.35643, + "grad_norm": 0.6368651612920068, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 35643 + }, + { + "epoch": 0.35644, + "grad_norm": 0.6709454309389514, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35644 + }, + { + "epoch": 0.35645, + "grad_norm": 0.7428107112251099, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 35645 + }, + { + "epoch": 0.35646, + "grad_norm": 0.8977368645919559, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 35646 + }, + { + "epoch": 0.35647, + "grad_norm": 1.0964348545991895, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 35647 + }, + { + "epoch": 0.35648, + "grad_norm": 1.0898039064099752, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 35648 + }, + { + "epoch": 0.35649, + "grad_norm": 0.868695775301868, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 35649 + }, + { + "epoch": 0.3565, + "grad_norm": 0.6685591742752819, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 35650 + }, + { + "epoch": 0.35651, + "grad_norm": 0.6075220108469472, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 35651 + }, + { + "epoch": 0.35652, + "grad_norm": 0.5597705629615594, + "learning_rate": 0.003, + "loss": 4.054, + "step": 35652 + }, + { + "epoch": 0.35653, + "grad_norm": 0.594995733462146, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 35653 + }, + { + "epoch": 0.35654, + "grad_norm": 0.5772478980983075, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 35654 + }, + { + "epoch": 0.35655, + "grad_norm": 0.5981366146628451, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 35655 + }, + { + "epoch": 0.35656, + "grad_norm": 0.5781208845240513, + "learning_rate": 0.003, + "loss": 3.9879, + "step": 35656 + }, + { + "epoch": 0.35657, + "grad_norm": 0.5292548917462926, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 35657 + }, + { + "epoch": 0.35658, + "grad_norm": 0.5055722217035394, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 35658 + }, + { + "epoch": 0.35659, + "grad_norm": 0.5213369006341231, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 35659 + }, + { + "epoch": 0.3566, + "grad_norm": 0.6134609430403047, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 35660 + }, + { + "epoch": 0.35661, + "grad_norm": 0.6720311495034645, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 35661 + }, + { + "epoch": 0.35662, + "grad_norm": 0.6616594136370846, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35662 + }, + { + "epoch": 0.35663, + "grad_norm": 0.6545444946597759, + "learning_rate": 0.003, + "loss": 3.9938, + "step": 35663 + }, + { + "epoch": 0.35664, + "grad_norm": 0.6896095792991492, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 35664 + }, + { + "epoch": 0.35665, + "grad_norm": 0.7270167048894088, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 35665 + }, + { + "epoch": 0.35666, + "grad_norm": 0.8708316391014879, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 35666 + }, + { + "epoch": 0.35667, + "grad_norm": 1.202704569954379, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 35667 + }, + { + "epoch": 0.35668, + "grad_norm": 1.070754231139595, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35668 + }, + { + "epoch": 0.35669, + "grad_norm": 0.8868673610262438, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 35669 + }, + { + "epoch": 0.3567, + "grad_norm": 0.8580689435200328, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 35670 + }, + { + "epoch": 0.35671, + "grad_norm": 0.9056492583376805, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 35671 + }, + { + "epoch": 0.35672, + "grad_norm": 1.0695903535691391, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 35672 + }, + { + "epoch": 0.35673, + "grad_norm": 1.1758185934747443, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 35673 + }, + { + "epoch": 0.35674, + "grad_norm": 0.838267299740318, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 35674 + }, + { + "epoch": 0.35675, + "grad_norm": 0.9059065237059448, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 35675 + }, + { + "epoch": 0.35676, + "grad_norm": 0.8767640996053887, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 35676 + }, + { + "epoch": 0.35677, + "grad_norm": 0.9001532769348776, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35677 + }, + { + "epoch": 0.35678, + "grad_norm": 1.0044900987688095, + "learning_rate": 0.003, + "loss": 4.073, + "step": 35678 + }, + { + "epoch": 0.35679, + "grad_norm": 1.0595302639842004, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 35679 + }, + { + "epoch": 0.3568, + "grad_norm": 1.0554006121817447, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 35680 + }, + { + "epoch": 0.35681, + "grad_norm": 1.0359540300818586, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 35681 + }, + { + "epoch": 0.35682, + "grad_norm": 1.0496549415094933, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 35682 + }, + { + "epoch": 0.35683, + "grad_norm": 1.1193186233760666, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 35683 + }, + { + "epoch": 0.35684, + "grad_norm": 1.0289417562870988, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 35684 + }, + { + "epoch": 0.35685, + "grad_norm": 0.9443888636827353, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 35685 + }, + { + "epoch": 0.35686, + "grad_norm": 0.8832169588394676, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 35686 + }, + { + "epoch": 0.35687, + "grad_norm": 0.7441475426570914, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35687 + }, + { + "epoch": 0.35688, + "grad_norm": 0.7371616302804537, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 35688 + }, + { + "epoch": 0.35689, + "grad_norm": 0.7140159926015877, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 35689 + }, + { + "epoch": 0.3569, + "grad_norm": 0.6883988835206376, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 35690 + }, + { + "epoch": 0.35691, + "grad_norm": 0.7476805459953952, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 35691 + }, + { + "epoch": 0.35692, + "grad_norm": 0.7932129633222358, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 35692 + }, + { + "epoch": 0.35693, + "grad_norm": 0.884939421216784, + "learning_rate": 0.003, + "loss": 4.046, + "step": 35693 + }, + { + "epoch": 0.35694, + "grad_norm": 0.9201676174588778, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 35694 + }, + { + "epoch": 0.35695, + "grad_norm": 0.796262460485084, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 35695 + }, + { + "epoch": 0.35696, + "grad_norm": 0.6440851813124607, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 35696 + }, + { + "epoch": 0.35697, + "grad_norm": 0.5752703503067458, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 35697 + }, + { + "epoch": 0.35698, + "grad_norm": 0.6587468210284368, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 35698 + }, + { + "epoch": 0.35699, + "grad_norm": 0.7020238763218456, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 35699 + }, + { + "epoch": 0.357, + "grad_norm": 0.640345949998875, + "learning_rate": 0.003, + "loss": 4.025, + "step": 35700 + }, + { + "epoch": 0.35701, + "grad_norm": 0.6602586334705417, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 35701 + }, + { + "epoch": 0.35702, + "grad_norm": 0.7696306080726952, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35702 + }, + { + "epoch": 0.35703, + "grad_norm": 0.7729789988296374, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 35703 + }, + { + "epoch": 0.35704, + "grad_norm": 0.7179321852284546, + "learning_rate": 0.003, + "loss": 4.023, + "step": 35704 + }, + { + "epoch": 0.35705, + "grad_norm": 0.8787784929494487, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 35705 + }, + { + "epoch": 0.35706, + "grad_norm": 1.1224697717720735, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 35706 + }, + { + "epoch": 0.35707, + "grad_norm": 0.8999590573780386, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 35707 + }, + { + "epoch": 0.35708, + "grad_norm": 0.7419019613569772, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 35708 + }, + { + "epoch": 0.35709, + "grad_norm": 0.8306810437530958, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 35709 + }, + { + "epoch": 0.3571, + "grad_norm": 0.8955408833203646, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 35710 + }, + { + "epoch": 0.35711, + "grad_norm": 0.9928348083128814, + "learning_rate": 0.003, + "loss": 4.032, + "step": 35711 + }, + { + "epoch": 0.35712, + "grad_norm": 1.1378248510819395, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35712 + }, + { + "epoch": 0.35713, + "grad_norm": 0.9160755596437987, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 35713 + }, + { + "epoch": 0.35714, + "grad_norm": 0.8228665434239927, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 35714 + }, + { + "epoch": 0.35715, + "grad_norm": 0.8056043565290397, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 35715 + }, + { + "epoch": 0.35716, + "grad_norm": 0.7320479300225768, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 35716 + }, + { + "epoch": 0.35717, + "grad_norm": 0.7923578963997363, + "learning_rate": 0.003, + "loss": 4.013, + "step": 35717 + }, + { + "epoch": 0.35718, + "grad_norm": 0.8158172273819033, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 35718 + }, + { + "epoch": 0.35719, + "grad_norm": 0.8302044476275666, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 35719 + }, + { + "epoch": 0.3572, + "grad_norm": 0.887616083926048, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 35720 + }, + { + "epoch": 0.35721, + "grad_norm": 0.9427311440966861, + "learning_rate": 0.003, + "loss": 4.026, + "step": 35721 + }, + { + "epoch": 0.35722, + "grad_norm": 0.9337743861315516, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 35722 + }, + { + "epoch": 0.35723, + "grad_norm": 0.9599231272601718, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 35723 + }, + { + "epoch": 0.35724, + "grad_norm": 0.8962227981402232, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 35724 + }, + { + "epoch": 0.35725, + "grad_norm": 0.8272482967510436, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 35725 + }, + { + "epoch": 0.35726, + "grad_norm": 0.8710650912849195, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 35726 + }, + { + "epoch": 0.35727, + "grad_norm": 0.8441960372054734, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 35727 + }, + { + "epoch": 0.35728, + "grad_norm": 0.7243374222418736, + "learning_rate": 0.003, + "loss": 4.036, + "step": 35728 + }, + { + "epoch": 0.35729, + "grad_norm": 0.7536326928214573, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 35729 + }, + { + "epoch": 0.3573, + "grad_norm": 0.9254168923490416, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 35730 + }, + { + "epoch": 0.35731, + "grad_norm": 1.1840355043884516, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 35731 + }, + { + "epoch": 0.35732, + "grad_norm": 0.9781255563794017, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 35732 + }, + { + "epoch": 0.35733, + "grad_norm": 1.0797901642510934, + "learning_rate": 0.003, + "loss": 4.072, + "step": 35733 + }, + { + "epoch": 0.35734, + "grad_norm": 0.8965002076531271, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 35734 + }, + { + "epoch": 0.35735, + "grad_norm": 0.8253981263162796, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 35735 + }, + { + "epoch": 0.35736, + "grad_norm": 0.8797476365662505, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 35736 + }, + { + "epoch": 0.35737, + "grad_norm": 0.8168856016840561, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35737 + }, + { + "epoch": 0.35738, + "grad_norm": 0.8269705070885212, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35738 + }, + { + "epoch": 0.35739, + "grad_norm": 0.6958354973779299, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 35739 + }, + { + "epoch": 0.3574, + "grad_norm": 0.6947639916319921, + "learning_rate": 0.003, + "loss": 4.033, + "step": 35740 + }, + { + "epoch": 0.35741, + "grad_norm": 0.7655183508904077, + "learning_rate": 0.003, + "loss": 4.013, + "step": 35741 + }, + { + "epoch": 0.35742, + "grad_norm": 1.0051490067783835, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 35742 + }, + { + "epoch": 0.35743, + "grad_norm": 1.2664906631774058, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 35743 + }, + { + "epoch": 0.35744, + "grad_norm": 0.6912057699718547, + "learning_rate": 0.003, + "loss": 4.035, + "step": 35744 + }, + { + "epoch": 0.35745, + "grad_norm": 0.6595572185855328, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 35745 + }, + { + "epoch": 0.35746, + "grad_norm": 0.7058174846101571, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 35746 + }, + { + "epoch": 0.35747, + "grad_norm": 0.8339045332934717, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 35747 + }, + { + "epoch": 0.35748, + "grad_norm": 0.9718697434974324, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 35748 + }, + { + "epoch": 0.35749, + "grad_norm": 0.9313153354131446, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 35749 + }, + { + "epoch": 0.3575, + "grad_norm": 0.8200877491281663, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 35750 + }, + { + "epoch": 0.35751, + "grad_norm": 0.9066927403661558, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 35751 + }, + { + "epoch": 0.35752, + "grad_norm": 0.8038414210303122, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 35752 + }, + { + "epoch": 0.35753, + "grad_norm": 0.5934362619105293, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 35753 + }, + { + "epoch": 0.35754, + "grad_norm": 0.6411011935286931, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 35754 + }, + { + "epoch": 0.35755, + "grad_norm": 0.6996753745207122, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 35755 + }, + { + "epoch": 0.35756, + "grad_norm": 0.7255755058558641, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 35756 + }, + { + "epoch": 0.35757, + "grad_norm": 0.7485450940230268, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 35757 + }, + { + "epoch": 0.35758, + "grad_norm": 0.746790779413974, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 35758 + }, + { + "epoch": 0.35759, + "grad_norm": 0.6877561301482958, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 35759 + }, + { + "epoch": 0.3576, + "grad_norm": 0.6074135828916071, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 35760 + }, + { + "epoch": 0.35761, + "grad_norm": 0.6358759129297189, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 35761 + }, + { + "epoch": 0.35762, + "grad_norm": 0.7858513419540549, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 35762 + }, + { + "epoch": 0.35763, + "grad_norm": 1.1489242435958604, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35763 + }, + { + "epoch": 0.35764, + "grad_norm": 0.9451925504323201, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 35764 + }, + { + "epoch": 0.35765, + "grad_norm": 0.7370724999113019, + "learning_rate": 0.003, + "loss": 4.003, + "step": 35765 + }, + { + "epoch": 0.35766, + "grad_norm": 0.6083271594004961, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 35766 + }, + { + "epoch": 0.35767, + "grad_norm": 0.6358724150449878, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 35767 + }, + { + "epoch": 0.35768, + "grad_norm": 0.7113769266019271, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 35768 + }, + { + "epoch": 0.35769, + "grad_norm": 0.731297909806317, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 35769 + }, + { + "epoch": 0.3577, + "grad_norm": 0.7436818150107795, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 35770 + }, + { + "epoch": 0.35771, + "grad_norm": 0.7989994563124555, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 35771 + }, + { + "epoch": 0.35772, + "grad_norm": 0.8562412550087297, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 35772 + }, + { + "epoch": 0.35773, + "grad_norm": 0.8768679913407814, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 35773 + }, + { + "epoch": 0.35774, + "grad_norm": 0.9254009323210972, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 35774 + }, + { + "epoch": 0.35775, + "grad_norm": 0.9077319144017103, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 35775 + }, + { + "epoch": 0.35776, + "grad_norm": 0.7643280899222715, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 35776 + }, + { + "epoch": 0.35777, + "grad_norm": 0.8753310524115449, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 35777 + }, + { + "epoch": 0.35778, + "grad_norm": 0.9503089406596378, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 35778 + }, + { + "epoch": 0.35779, + "grad_norm": 0.989208203643738, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 35779 + }, + { + "epoch": 0.3578, + "grad_norm": 0.9226713704629453, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 35780 + }, + { + "epoch": 0.35781, + "grad_norm": 0.9476995824285103, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 35781 + }, + { + "epoch": 0.35782, + "grad_norm": 0.9506169621962408, + "learning_rate": 0.003, + "loss": 4.045, + "step": 35782 + }, + { + "epoch": 0.35783, + "grad_norm": 1.0956381327555391, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 35783 + }, + { + "epoch": 0.35784, + "grad_norm": 1.2110592107460763, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 35784 + }, + { + "epoch": 0.35785, + "grad_norm": 0.8052704752057784, + "learning_rate": 0.003, + "loss": 4.019, + "step": 35785 + }, + { + "epoch": 0.35786, + "grad_norm": 0.6404169853516094, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35786 + }, + { + "epoch": 0.35787, + "grad_norm": 0.7146104427398803, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 35787 + }, + { + "epoch": 0.35788, + "grad_norm": 0.8991411899150431, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 35788 + }, + { + "epoch": 0.35789, + "grad_norm": 1.115339881941168, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 35789 + }, + { + "epoch": 0.3579, + "grad_norm": 1.104152317999324, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 35790 + }, + { + "epoch": 0.35791, + "grad_norm": 0.8595676893025892, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 35791 + }, + { + "epoch": 0.35792, + "grad_norm": 0.8123888232614761, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 35792 + }, + { + "epoch": 0.35793, + "grad_norm": 0.7562181224453615, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 35793 + }, + { + "epoch": 0.35794, + "grad_norm": 0.767511110378619, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 35794 + }, + { + "epoch": 0.35795, + "grad_norm": 0.840848489941889, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 35795 + }, + { + "epoch": 0.35796, + "grad_norm": 0.922914159958431, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 35796 + }, + { + "epoch": 0.35797, + "grad_norm": 1.0846332374797876, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 35797 + }, + { + "epoch": 0.35798, + "grad_norm": 0.8021538767525749, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 35798 + }, + { + "epoch": 0.35799, + "grad_norm": 0.7323737307816428, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 35799 + }, + { + "epoch": 0.358, + "grad_norm": 0.7311887990612411, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 35800 + }, + { + "epoch": 0.35801, + "grad_norm": 0.5727139715453451, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 35801 + }, + { + "epoch": 0.35802, + "grad_norm": 0.5897442276136924, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 35802 + }, + { + "epoch": 0.35803, + "grad_norm": 0.6983796947891424, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 35803 + }, + { + "epoch": 0.35804, + "grad_norm": 0.7837855446886233, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 35804 + }, + { + "epoch": 0.35805, + "grad_norm": 0.8682041559614913, + "learning_rate": 0.003, + "loss": 4.057, + "step": 35805 + }, + { + "epoch": 0.35806, + "grad_norm": 0.969551002817275, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 35806 + }, + { + "epoch": 0.35807, + "grad_norm": 1.0287386012304156, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 35807 + }, + { + "epoch": 0.35808, + "grad_norm": 0.8580712247624265, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 35808 + }, + { + "epoch": 0.35809, + "grad_norm": 0.7544976513189149, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 35809 + }, + { + "epoch": 0.3581, + "grad_norm": 0.6977270295680401, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 35810 + }, + { + "epoch": 0.35811, + "grad_norm": 0.8201142515078498, + "learning_rate": 0.003, + "loss": 4.053, + "step": 35811 + }, + { + "epoch": 0.35812, + "grad_norm": 0.8830025438106857, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 35812 + }, + { + "epoch": 0.35813, + "grad_norm": 0.9584571491655394, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 35813 + }, + { + "epoch": 0.35814, + "grad_norm": 0.9000776794277944, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 35814 + }, + { + "epoch": 0.35815, + "grad_norm": 0.7658226606278749, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 35815 + }, + { + "epoch": 0.35816, + "grad_norm": 0.8733202301812838, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 35816 + }, + { + "epoch": 0.35817, + "grad_norm": 0.9466515496795979, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 35817 + }, + { + "epoch": 0.35818, + "grad_norm": 0.8312895394016373, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 35818 + }, + { + "epoch": 0.35819, + "grad_norm": 0.8807284835195941, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 35819 + }, + { + "epoch": 0.3582, + "grad_norm": 0.9770344746294646, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 35820 + }, + { + "epoch": 0.35821, + "grad_norm": 1.2742784317916769, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 35821 + }, + { + "epoch": 0.35822, + "grad_norm": 0.9587846759605987, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 35822 + }, + { + "epoch": 0.35823, + "grad_norm": 1.006576833377721, + "learning_rate": 0.003, + "loss": 4.06, + "step": 35823 + }, + { + "epoch": 0.35824, + "grad_norm": 1.028722691839176, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 35824 + }, + { + "epoch": 0.35825, + "grad_norm": 1.0085291659514588, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 35825 + }, + { + "epoch": 0.35826, + "grad_norm": 0.8882942999500639, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 35826 + }, + { + "epoch": 0.35827, + "grad_norm": 0.9628206649503425, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 35827 + }, + { + "epoch": 0.35828, + "grad_norm": 0.9624857453553891, + "learning_rate": 0.003, + "loss": 4.071, + "step": 35828 + }, + { + "epoch": 0.35829, + "grad_norm": 0.9477547563462154, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 35829 + }, + { + "epoch": 0.3583, + "grad_norm": 1.0562872739994562, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 35830 + }, + { + "epoch": 0.35831, + "grad_norm": 0.9062565818936928, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 35831 + }, + { + "epoch": 0.35832, + "grad_norm": 0.8462554693802367, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 35832 + }, + { + "epoch": 0.35833, + "grad_norm": 0.7993484928513301, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 35833 + }, + { + "epoch": 0.35834, + "grad_norm": 0.787445764699777, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 35834 + }, + { + "epoch": 0.35835, + "grad_norm": 0.8014684316455056, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 35835 + }, + { + "epoch": 0.35836, + "grad_norm": 0.8925262608560436, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 35836 + }, + { + "epoch": 0.35837, + "grad_norm": 0.9491587356609208, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 35837 + }, + { + "epoch": 0.35838, + "grad_norm": 0.8048143487825181, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 35838 + }, + { + "epoch": 0.35839, + "grad_norm": 0.6215864907299954, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 35839 + }, + { + "epoch": 0.3584, + "grad_norm": 0.5918753385135883, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 35840 + }, + { + "epoch": 0.35841, + "grad_norm": 0.5635948495219978, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 35841 + }, + { + "epoch": 0.35842, + "grad_norm": 0.53481792365274, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 35842 + }, + { + "epoch": 0.35843, + "grad_norm": 0.6311979982127771, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 35843 + }, + { + "epoch": 0.35844, + "grad_norm": 0.7606152889723282, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 35844 + }, + { + "epoch": 0.35845, + "grad_norm": 0.9191801092473408, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 35845 + }, + { + "epoch": 0.35846, + "grad_norm": 1.126926836154685, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 35846 + }, + { + "epoch": 0.35847, + "grad_norm": 0.9016179542828027, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 35847 + }, + { + "epoch": 0.35848, + "grad_norm": 0.7456965502743051, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 35848 + }, + { + "epoch": 0.35849, + "grad_norm": 0.6055919165224919, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 35849 + }, + { + "epoch": 0.3585, + "grad_norm": 0.5926808223111274, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 35850 + }, + { + "epoch": 0.35851, + "grad_norm": 0.6435694033811742, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 35851 + }, + { + "epoch": 0.35852, + "grad_norm": 0.6336047419389741, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 35852 + }, + { + "epoch": 0.35853, + "grad_norm": 0.6080603726646344, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 35853 + }, + { + "epoch": 0.35854, + "grad_norm": 0.5773159298418195, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 35854 + }, + { + "epoch": 0.35855, + "grad_norm": 0.5894811806845447, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 35855 + }, + { + "epoch": 0.35856, + "grad_norm": 0.5769730868359955, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 35856 + }, + { + "epoch": 0.35857, + "grad_norm": 0.697784314023132, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 35857 + }, + { + "epoch": 0.35858, + "grad_norm": 0.9553486091938277, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 35858 + }, + { + "epoch": 0.35859, + "grad_norm": 1.2720336755106505, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 35859 + }, + { + "epoch": 0.3586, + "grad_norm": 0.6882596422330394, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 35860 + }, + { + "epoch": 0.35861, + "grad_norm": 0.6588307229709995, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 35861 + }, + { + "epoch": 0.35862, + "grad_norm": 0.736417974897671, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 35862 + }, + { + "epoch": 0.35863, + "grad_norm": 0.7649227408945096, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 35863 + }, + { + "epoch": 0.35864, + "grad_norm": 0.760987825956093, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 35864 + }, + { + "epoch": 0.35865, + "grad_norm": 0.7540592879390556, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 35865 + }, + { + "epoch": 0.35866, + "grad_norm": 0.8059999608284294, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 35866 + }, + { + "epoch": 0.35867, + "grad_norm": 0.909376605840871, + "learning_rate": 0.003, + "loss": 4.0757, + "step": 35867 + }, + { + "epoch": 0.35868, + "grad_norm": 1.0702820025786035, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 35868 + }, + { + "epoch": 0.35869, + "grad_norm": 1.0668750690656204, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 35869 + }, + { + "epoch": 0.3587, + "grad_norm": 0.9771639813335892, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 35870 + }, + { + "epoch": 0.35871, + "grad_norm": 0.9388381545406745, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 35871 + }, + { + "epoch": 0.35872, + "grad_norm": 0.9316609735920678, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 35872 + }, + { + "epoch": 0.35873, + "grad_norm": 0.8275491269716966, + "learning_rate": 0.003, + "loss": 4.033, + "step": 35873 + }, + { + "epoch": 0.35874, + "grad_norm": 0.8904251615424199, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 35874 + }, + { + "epoch": 0.35875, + "grad_norm": 0.8311514105160075, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 35875 + }, + { + "epoch": 0.35876, + "grad_norm": 0.8233615428886373, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 35876 + }, + { + "epoch": 0.35877, + "grad_norm": 0.8802855925778794, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 35877 + }, + { + "epoch": 0.35878, + "grad_norm": 0.9232019989122032, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 35878 + }, + { + "epoch": 0.35879, + "grad_norm": 0.9638917866888298, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 35879 + }, + { + "epoch": 0.3588, + "grad_norm": 1.0411728515588798, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 35880 + }, + { + "epoch": 0.35881, + "grad_norm": 0.7350074992996689, + "learning_rate": 0.003, + "loss": 4.014, + "step": 35881 + }, + { + "epoch": 0.35882, + "grad_norm": 0.6678018743584874, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 35882 + }, + { + "epoch": 0.35883, + "grad_norm": 0.6688161464951423, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 35883 + }, + { + "epoch": 0.35884, + "grad_norm": 0.6875551857424248, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 35884 + }, + { + "epoch": 0.35885, + "grad_norm": 0.664680453722645, + "learning_rate": 0.003, + "loss": 4.051, + "step": 35885 + }, + { + "epoch": 0.35886, + "grad_norm": 0.7533618505561995, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 35886 + }, + { + "epoch": 0.35887, + "grad_norm": 0.8581671096389137, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 35887 + }, + { + "epoch": 0.35888, + "grad_norm": 0.9396249143604423, + "learning_rate": 0.003, + "loss": 4.056, + "step": 35888 + }, + { + "epoch": 0.35889, + "grad_norm": 1.0283851510310789, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35889 + }, + { + "epoch": 0.3589, + "grad_norm": 1.1040121933510236, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 35890 + }, + { + "epoch": 0.35891, + "grad_norm": 0.8564892923471411, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 35891 + }, + { + "epoch": 0.35892, + "grad_norm": 0.8420840487431895, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 35892 + }, + { + "epoch": 0.35893, + "grad_norm": 0.954698804571236, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 35893 + }, + { + "epoch": 0.35894, + "grad_norm": 1.0649186116898905, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 35894 + }, + { + "epoch": 0.35895, + "grad_norm": 1.006670972825318, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 35895 + }, + { + "epoch": 0.35896, + "grad_norm": 0.9026554313297486, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 35896 + }, + { + "epoch": 0.35897, + "grad_norm": 0.8334738032550582, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 35897 + }, + { + "epoch": 0.35898, + "grad_norm": 0.9443334749453474, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 35898 + }, + { + "epoch": 0.35899, + "grad_norm": 1.1087248889912282, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 35899 + }, + { + "epoch": 0.359, + "grad_norm": 1.0176737131204736, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 35900 + }, + { + "epoch": 0.35901, + "grad_norm": 0.925975943028702, + "learning_rate": 0.003, + "loss": 4.047, + "step": 35901 + }, + { + "epoch": 0.35902, + "grad_norm": 0.9553335022742603, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 35902 + }, + { + "epoch": 0.35903, + "grad_norm": 0.96660447781142, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 35903 + }, + { + "epoch": 0.35904, + "grad_norm": 0.8600038327906341, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 35904 + }, + { + "epoch": 0.35905, + "grad_norm": 0.7845353560503746, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 35905 + }, + { + "epoch": 0.35906, + "grad_norm": 0.6720916719612227, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 35906 + }, + { + "epoch": 0.35907, + "grad_norm": 0.6144870766152345, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 35907 + }, + { + "epoch": 0.35908, + "grad_norm": 0.5876846812451021, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 35908 + }, + { + "epoch": 0.35909, + "grad_norm": 0.6308608548336441, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 35909 + }, + { + "epoch": 0.3591, + "grad_norm": 0.62943823472297, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 35910 + }, + { + "epoch": 0.35911, + "grad_norm": 0.6644537495559605, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 35911 + }, + { + "epoch": 0.35912, + "grad_norm": 0.6264166446795104, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 35912 + }, + { + "epoch": 0.35913, + "grad_norm": 0.6138886970791118, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 35913 + }, + { + "epoch": 0.35914, + "grad_norm": 0.7229476627746637, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 35914 + }, + { + "epoch": 0.35915, + "grad_norm": 0.8577840609265494, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 35915 + }, + { + "epoch": 0.35916, + "grad_norm": 0.809921476185462, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 35916 + }, + { + "epoch": 0.35917, + "grad_norm": 0.8604316911744897, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 35917 + }, + { + "epoch": 0.35918, + "grad_norm": 0.9068764805890928, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 35918 + }, + { + "epoch": 0.35919, + "grad_norm": 0.9199608103285492, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 35919 + }, + { + "epoch": 0.3592, + "grad_norm": 0.8555724367436517, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 35920 + }, + { + "epoch": 0.35921, + "grad_norm": 0.7963735112223053, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 35921 + }, + { + "epoch": 0.35922, + "grad_norm": 0.7501068259878613, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 35922 + }, + { + "epoch": 0.35923, + "grad_norm": 0.6962561149086551, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 35923 + }, + { + "epoch": 0.35924, + "grad_norm": 0.7084535846556932, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 35924 + }, + { + "epoch": 0.35925, + "grad_norm": 0.6912773609407928, + "learning_rate": 0.003, + "loss": 4.038, + "step": 35925 + }, + { + "epoch": 0.35926, + "grad_norm": 0.7374355237727781, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 35926 + }, + { + "epoch": 0.35927, + "grad_norm": 0.8630800860790284, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 35927 + }, + { + "epoch": 0.35928, + "grad_norm": 1.1003344862148627, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 35928 + }, + { + "epoch": 0.35929, + "grad_norm": 1.1512845731859822, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 35929 + }, + { + "epoch": 0.3593, + "grad_norm": 0.7765597474530024, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 35930 + }, + { + "epoch": 0.35931, + "grad_norm": 0.7153068798408476, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 35931 + }, + { + "epoch": 0.35932, + "grad_norm": 0.8150397236262578, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 35932 + }, + { + "epoch": 0.35933, + "grad_norm": 0.8284141375687092, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 35933 + }, + { + "epoch": 0.35934, + "grad_norm": 0.7731974352973713, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 35934 + }, + { + "epoch": 0.35935, + "grad_norm": 0.7049902229889523, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 35935 + }, + { + "epoch": 0.35936, + "grad_norm": 0.7028124229101087, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 35936 + }, + { + "epoch": 0.35937, + "grad_norm": 0.735761832258226, + "learning_rate": 0.003, + "loss": 4.032, + "step": 35937 + }, + { + "epoch": 0.35938, + "grad_norm": 0.8695468151747444, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 35938 + }, + { + "epoch": 0.35939, + "grad_norm": 0.9182114629786018, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 35939 + }, + { + "epoch": 0.3594, + "grad_norm": 0.9592484474695742, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 35940 + }, + { + "epoch": 0.35941, + "grad_norm": 1.0174974604142175, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 35941 + }, + { + "epoch": 0.35942, + "grad_norm": 0.9189207856229025, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 35942 + }, + { + "epoch": 0.35943, + "grad_norm": 0.7582563193506823, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 35943 + }, + { + "epoch": 0.35944, + "grad_norm": 0.7949544930176887, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 35944 + }, + { + "epoch": 0.35945, + "grad_norm": 0.8463110646232557, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 35945 + }, + { + "epoch": 0.35946, + "grad_norm": 0.9854405820681579, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 35946 + }, + { + "epoch": 0.35947, + "grad_norm": 1.1100310605662145, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 35947 + }, + { + "epoch": 0.35948, + "grad_norm": 0.8146795702447497, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 35948 + }, + { + "epoch": 0.35949, + "grad_norm": 0.8078292109636466, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 35949 + }, + { + "epoch": 0.3595, + "grad_norm": 0.8036254626730283, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 35950 + }, + { + "epoch": 0.35951, + "grad_norm": 0.850089158673278, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 35951 + }, + { + "epoch": 0.35952, + "grad_norm": 0.9845374415394457, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 35952 + }, + { + "epoch": 0.35953, + "grad_norm": 0.9822529068123973, + "learning_rate": 0.003, + "loss": 4.049, + "step": 35953 + }, + { + "epoch": 0.35954, + "grad_norm": 0.7769876229293872, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 35954 + }, + { + "epoch": 0.35955, + "grad_norm": 0.7518321331407024, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 35955 + }, + { + "epoch": 0.35956, + "grad_norm": 0.7998332826835556, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 35956 + }, + { + "epoch": 0.35957, + "grad_norm": 0.7441501239077445, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 35957 + }, + { + "epoch": 0.35958, + "grad_norm": 0.7334715441549721, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 35958 + }, + { + "epoch": 0.35959, + "grad_norm": 0.7550967048413534, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 35959 + }, + { + "epoch": 0.3596, + "grad_norm": 0.8731127302711513, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 35960 + }, + { + "epoch": 0.35961, + "grad_norm": 0.9814649973342221, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 35961 + }, + { + "epoch": 0.35962, + "grad_norm": 0.9793736134364558, + "learning_rate": 0.003, + "loss": 4.018, + "step": 35962 + }, + { + "epoch": 0.35963, + "grad_norm": 0.8648079461882516, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 35963 + }, + { + "epoch": 0.35964, + "grad_norm": 0.7648642421468954, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 35964 + }, + { + "epoch": 0.35965, + "grad_norm": 0.8309324924905529, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 35965 + }, + { + "epoch": 0.35966, + "grad_norm": 0.8038760318471242, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 35966 + }, + { + "epoch": 0.35967, + "grad_norm": 0.8498864404603288, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 35967 + }, + { + "epoch": 0.35968, + "grad_norm": 0.949733729900237, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 35968 + }, + { + "epoch": 0.35969, + "grad_norm": 0.9891045457364769, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 35969 + }, + { + "epoch": 0.3597, + "grad_norm": 0.8473434221274576, + "learning_rate": 0.003, + "loss": 4.0779, + "step": 35970 + }, + { + "epoch": 0.35971, + "grad_norm": 0.7529946230716891, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 35971 + }, + { + "epoch": 0.35972, + "grad_norm": 0.8469415156044917, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 35972 + }, + { + "epoch": 0.35973, + "grad_norm": 1.0225938944613675, + "learning_rate": 0.003, + "loss": 4.053, + "step": 35973 + }, + { + "epoch": 0.35974, + "grad_norm": 1.1885124752805654, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 35974 + }, + { + "epoch": 0.35975, + "grad_norm": 0.7377708136575195, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 35975 + }, + { + "epoch": 0.35976, + "grad_norm": 0.6108701028578247, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 35976 + }, + { + "epoch": 0.35977, + "grad_norm": 0.6763047040537055, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 35977 + }, + { + "epoch": 0.35978, + "grad_norm": 0.6577297866494564, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 35978 + }, + { + "epoch": 0.35979, + "grad_norm": 0.6176876476517045, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 35979 + }, + { + "epoch": 0.3598, + "grad_norm": 0.5678339640433867, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 35980 + }, + { + "epoch": 0.35981, + "grad_norm": 0.631091271136768, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 35981 + }, + { + "epoch": 0.35982, + "grad_norm": 0.6508105398155135, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 35982 + }, + { + "epoch": 0.35983, + "grad_norm": 0.6580766819813928, + "learning_rate": 0.003, + "loss": 4.021, + "step": 35983 + }, + { + "epoch": 0.35984, + "grad_norm": 0.6357600424279186, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 35984 + }, + { + "epoch": 0.35985, + "grad_norm": 0.6335636324062313, + "learning_rate": 0.003, + "loss": 3.9899, + "step": 35985 + }, + { + "epoch": 0.35986, + "grad_norm": 0.7525869457763189, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 35986 + }, + { + "epoch": 0.35987, + "grad_norm": 1.025104682863561, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 35987 + }, + { + "epoch": 0.35988, + "grad_norm": 1.0752689506830986, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 35988 + }, + { + "epoch": 0.35989, + "grad_norm": 0.958530850944837, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 35989 + }, + { + "epoch": 0.3599, + "grad_norm": 1.06229076255523, + "learning_rate": 0.003, + "loss": 4.046, + "step": 35990 + }, + { + "epoch": 0.35991, + "grad_norm": 0.8641403115919098, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 35991 + }, + { + "epoch": 0.35992, + "grad_norm": 0.8174529895114744, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 35992 + }, + { + "epoch": 0.35993, + "grad_norm": 0.8002915089135579, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 35993 + }, + { + "epoch": 0.35994, + "grad_norm": 0.8253868223067767, + "learning_rate": 0.003, + "loss": 4.061, + "step": 35994 + }, + { + "epoch": 0.35995, + "grad_norm": 0.9035360802492447, + "learning_rate": 0.003, + "loss": 4.037, + "step": 35995 + }, + { + "epoch": 0.35996, + "grad_norm": 0.9999856265904967, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 35996 + }, + { + "epoch": 0.35997, + "grad_norm": 0.9207725467796689, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 35997 + }, + { + "epoch": 0.35998, + "grad_norm": 0.9007772937057376, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 35998 + }, + { + "epoch": 0.35999, + "grad_norm": 0.9153076250849782, + "learning_rate": 0.003, + "loss": 4.0943, + "step": 35999 + }, + { + "epoch": 0.36, + "grad_norm": 1.0456260840601326, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 36000 + }, + { + "epoch": 0.36001, + "grad_norm": 0.9976752555642442, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 36001 + }, + { + "epoch": 0.36002, + "grad_norm": 0.8860241985652977, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 36002 + }, + { + "epoch": 0.36003, + "grad_norm": 0.8190522445599919, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 36003 + }, + { + "epoch": 0.36004, + "grad_norm": 0.7994624002518261, + "learning_rate": 0.003, + "loss": 4.046, + "step": 36004 + }, + { + "epoch": 0.36005, + "grad_norm": 0.9143259219762881, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 36005 + }, + { + "epoch": 0.36006, + "grad_norm": 0.9025093314032023, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 36006 + }, + { + "epoch": 0.36007, + "grad_norm": 0.851294901052986, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 36007 + }, + { + "epoch": 0.36008, + "grad_norm": 0.8275655861806055, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 36008 + }, + { + "epoch": 0.36009, + "grad_norm": 0.8324318694960143, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 36009 + }, + { + "epoch": 0.3601, + "grad_norm": 0.7902030643218143, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 36010 + }, + { + "epoch": 0.36011, + "grad_norm": 0.7437233600065077, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 36011 + }, + { + "epoch": 0.36012, + "grad_norm": 0.8176108452088148, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 36012 + }, + { + "epoch": 0.36013, + "grad_norm": 0.7961927327510231, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 36013 + }, + { + "epoch": 0.36014, + "grad_norm": 0.7682486516963732, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 36014 + }, + { + "epoch": 0.36015, + "grad_norm": 0.7971696341537308, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 36015 + }, + { + "epoch": 0.36016, + "grad_norm": 0.8930874082679988, + "learning_rate": 0.003, + "loss": 4.02, + "step": 36016 + }, + { + "epoch": 0.36017, + "grad_norm": 1.104602438852375, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 36017 + }, + { + "epoch": 0.36018, + "grad_norm": 1.0697303289954676, + "learning_rate": 0.003, + "loss": 4.014, + "step": 36018 + }, + { + "epoch": 0.36019, + "grad_norm": 0.9077920282466889, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 36019 + }, + { + "epoch": 0.3602, + "grad_norm": 0.7969938860776167, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 36020 + }, + { + "epoch": 0.36021, + "grad_norm": 0.6925845480474783, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 36021 + }, + { + "epoch": 0.36022, + "grad_norm": 0.6292161412354174, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 36022 + }, + { + "epoch": 0.36023, + "grad_norm": 0.6616523199243267, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 36023 + }, + { + "epoch": 0.36024, + "grad_norm": 0.7607346892590663, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 36024 + }, + { + "epoch": 0.36025, + "grad_norm": 0.7730325490825467, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 36025 + }, + { + "epoch": 0.36026, + "grad_norm": 0.7644847249462404, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 36026 + }, + { + "epoch": 0.36027, + "grad_norm": 0.7050631551380216, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 36027 + }, + { + "epoch": 0.36028, + "grad_norm": 0.5882533552211097, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 36028 + }, + { + "epoch": 0.36029, + "grad_norm": 0.6451464723263445, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 36029 + }, + { + "epoch": 0.3603, + "grad_norm": 0.7241864619698184, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 36030 + }, + { + "epoch": 0.36031, + "grad_norm": 0.9751044642172642, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 36031 + }, + { + "epoch": 0.36032, + "grad_norm": 1.168553600470053, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 36032 + }, + { + "epoch": 0.36033, + "grad_norm": 1.0198543291928768, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 36033 + }, + { + "epoch": 0.36034, + "grad_norm": 0.9908177560533459, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 36034 + }, + { + "epoch": 0.36035, + "grad_norm": 0.989813630191513, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 36035 + }, + { + "epoch": 0.36036, + "grad_norm": 0.9737083578315803, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 36036 + }, + { + "epoch": 0.36037, + "grad_norm": 0.8513928299372253, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 36037 + }, + { + "epoch": 0.36038, + "grad_norm": 0.7241378795301376, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 36038 + }, + { + "epoch": 0.36039, + "grad_norm": 0.7152838192244744, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 36039 + }, + { + "epoch": 0.3604, + "grad_norm": 0.6474283459624993, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 36040 + }, + { + "epoch": 0.36041, + "grad_norm": 0.6408977522102101, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 36041 + }, + { + "epoch": 0.36042, + "grad_norm": 0.72471750355534, + "learning_rate": 0.003, + "loss": 4.041, + "step": 36042 + }, + { + "epoch": 0.36043, + "grad_norm": 0.7194817946561479, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 36043 + }, + { + "epoch": 0.36044, + "grad_norm": 0.6462299905136741, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 36044 + }, + { + "epoch": 0.36045, + "grad_norm": 0.6882556868063349, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 36045 + }, + { + "epoch": 0.36046, + "grad_norm": 0.8817756699160646, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 36046 + }, + { + "epoch": 0.36047, + "grad_norm": 1.175778505336325, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 36047 + }, + { + "epoch": 0.36048, + "grad_norm": 0.962536775425108, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 36048 + }, + { + "epoch": 0.36049, + "grad_norm": 0.9285235184149256, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 36049 + }, + { + "epoch": 0.3605, + "grad_norm": 0.9324315422063517, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 36050 + }, + { + "epoch": 0.36051, + "grad_norm": 0.954433203765653, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 36051 + }, + { + "epoch": 0.36052, + "grad_norm": 0.932191186650232, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 36052 + }, + { + "epoch": 0.36053, + "grad_norm": 0.7941369447623988, + "learning_rate": 0.003, + "loss": 4.024, + "step": 36053 + }, + { + "epoch": 0.36054, + "grad_norm": 0.8619778088673284, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 36054 + }, + { + "epoch": 0.36055, + "grad_norm": 0.8317873217169406, + "learning_rate": 0.003, + "loss": 4.031, + "step": 36055 + }, + { + "epoch": 0.36056, + "grad_norm": 0.8315923959959453, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 36056 + }, + { + "epoch": 0.36057, + "grad_norm": 0.8643957416384325, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 36057 + }, + { + "epoch": 0.36058, + "grad_norm": 0.8919914032146556, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 36058 + }, + { + "epoch": 0.36059, + "grad_norm": 0.8468277421477948, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 36059 + }, + { + "epoch": 0.3606, + "grad_norm": 0.9774371055336104, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 36060 + }, + { + "epoch": 0.36061, + "grad_norm": 1.180376359023558, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 36061 + }, + { + "epoch": 0.36062, + "grad_norm": 0.7772863198048476, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 36062 + }, + { + "epoch": 0.36063, + "grad_norm": 0.7382068218670568, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 36063 + }, + { + "epoch": 0.36064, + "grad_norm": 0.729613897736231, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 36064 + }, + { + "epoch": 0.36065, + "grad_norm": 0.7665725947247406, + "learning_rate": 0.003, + "loss": 3.9952, + "step": 36065 + }, + { + "epoch": 0.36066, + "grad_norm": 0.7326751674075078, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 36066 + }, + { + "epoch": 0.36067, + "grad_norm": 0.8007243571896153, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 36067 + }, + { + "epoch": 0.36068, + "grad_norm": 0.7544555626743459, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 36068 + }, + { + "epoch": 0.36069, + "grad_norm": 0.822876279319198, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 36069 + }, + { + "epoch": 0.3607, + "grad_norm": 0.8579773720464112, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 36070 + }, + { + "epoch": 0.36071, + "grad_norm": 0.8095081940726709, + "learning_rate": 0.003, + "loss": 4.038, + "step": 36071 + }, + { + "epoch": 0.36072, + "grad_norm": 0.7884054903108624, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 36072 + }, + { + "epoch": 0.36073, + "grad_norm": 0.7414439795225124, + "learning_rate": 0.003, + "loss": 4.051, + "step": 36073 + }, + { + "epoch": 0.36074, + "grad_norm": 0.655709141168917, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 36074 + }, + { + "epoch": 0.36075, + "grad_norm": 0.5758097813162355, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 36075 + }, + { + "epoch": 0.36076, + "grad_norm": 0.6131648238749258, + "learning_rate": 0.003, + "loss": 4.019, + "step": 36076 + }, + { + "epoch": 0.36077, + "grad_norm": 0.6811895223488318, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 36077 + }, + { + "epoch": 0.36078, + "grad_norm": 0.864071452608905, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 36078 + }, + { + "epoch": 0.36079, + "grad_norm": 1.1318870080576766, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 36079 + }, + { + "epoch": 0.3608, + "grad_norm": 0.9447098895584746, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 36080 + }, + { + "epoch": 0.36081, + "grad_norm": 0.7291708446418561, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 36081 + }, + { + "epoch": 0.36082, + "grad_norm": 0.6381488495936043, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 36082 + }, + { + "epoch": 0.36083, + "grad_norm": 0.7454109349550443, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 36083 + }, + { + "epoch": 0.36084, + "grad_norm": 0.805802494918885, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 36084 + }, + { + "epoch": 0.36085, + "grad_norm": 0.8066284205909617, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 36085 + }, + { + "epoch": 0.36086, + "grad_norm": 0.8091350197570315, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 36086 + }, + { + "epoch": 0.36087, + "grad_norm": 0.7620691421566748, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 36087 + }, + { + "epoch": 0.36088, + "grad_norm": 0.764707036373983, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 36088 + }, + { + "epoch": 0.36089, + "grad_norm": 0.8197918149596831, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 36089 + }, + { + "epoch": 0.3609, + "grad_norm": 0.8793714261096014, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 36090 + }, + { + "epoch": 0.36091, + "grad_norm": 0.9553211467691396, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 36091 + }, + { + "epoch": 0.36092, + "grad_norm": 0.9467785123614338, + "learning_rate": 0.003, + "loss": 4.026, + "step": 36092 + }, + { + "epoch": 0.36093, + "grad_norm": 1.0110197568855444, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 36093 + }, + { + "epoch": 0.36094, + "grad_norm": 1.0741927699450853, + "learning_rate": 0.003, + "loss": 4.01, + "step": 36094 + }, + { + "epoch": 0.36095, + "grad_norm": 0.8653316139261935, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 36095 + }, + { + "epoch": 0.36096, + "grad_norm": 0.8180140299368645, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 36096 + }, + { + "epoch": 0.36097, + "grad_norm": 0.8677560629099222, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 36097 + }, + { + "epoch": 0.36098, + "grad_norm": 0.7884633830476191, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 36098 + }, + { + "epoch": 0.36099, + "grad_norm": 0.8637840875403797, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 36099 + }, + { + "epoch": 0.361, + "grad_norm": 0.9607104881344545, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 36100 + }, + { + "epoch": 0.36101, + "grad_norm": 1.0671205729259932, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 36101 + }, + { + "epoch": 0.36102, + "grad_norm": 1.0303325519990805, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 36102 + }, + { + "epoch": 0.36103, + "grad_norm": 1.1824888618091292, + "learning_rate": 0.003, + "loss": 4.067, + "step": 36103 + }, + { + "epoch": 0.36104, + "grad_norm": 0.7912155280993801, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 36104 + }, + { + "epoch": 0.36105, + "grad_norm": 0.7372579038653212, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 36105 + }, + { + "epoch": 0.36106, + "grad_norm": 0.6808470068398697, + "learning_rate": 0.003, + "loss": 4.066, + "step": 36106 + }, + { + "epoch": 0.36107, + "grad_norm": 0.5926337966044818, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 36107 + }, + { + "epoch": 0.36108, + "grad_norm": 0.5660116202821321, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 36108 + }, + { + "epoch": 0.36109, + "grad_norm": 0.5619781098076387, + "learning_rate": 0.003, + "loss": 4.0003, + "step": 36109 + }, + { + "epoch": 0.3611, + "grad_norm": 0.6062405729415818, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 36110 + }, + { + "epoch": 0.36111, + "grad_norm": 0.6104516353783908, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 36111 + }, + { + "epoch": 0.36112, + "grad_norm": 0.6489918556089683, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 36112 + }, + { + "epoch": 0.36113, + "grad_norm": 0.7599831460542599, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 36113 + }, + { + "epoch": 0.36114, + "grad_norm": 0.9239550962438553, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 36114 + }, + { + "epoch": 0.36115, + "grad_norm": 1.0309468027718331, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 36115 + }, + { + "epoch": 0.36116, + "grad_norm": 0.9311397206509183, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 36116 + }, + { + "epoch": 0.36117, + "grad_norm": 0.7744291145242865, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 36117 + }, + { + "epoch": 0.36118, + "grad_norm": 0.7202829901721511, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 36118 + }, + { + "epoch": 0.36119, + "grad_norm": 0.7193544935732338, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 36119 + }, + { + "epoch": 0.3612, + "grad_norm": 0.7409003004908901, + "learning_rate": 0.003, + "loss": 4.05, + "step": 36120 + }, + { + "epoch": 0.36121, + "grad_norm": 0.8046123688594464, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 36121 + }, + { + "epoch": 0.36122, + "grad_norm": 0.8486736991247654, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 36122 + }, + { + "epoch": 0.36123, + "grad_norm": 0.9029538388854433, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 36123 + }, + { + "epoch": 0.36124, + "grad_norm": 1.2760023233758353, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 36124 + }, + { + "epoch": 0.36125, + "grad_norm": 0.8939206666094067, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 36125 + }, + { + "epoch": 0.36126, + "grad_norm": 0.7790066845286676, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 36126 + }, + { + "epoch": 0.36127, + "grad_norm": 0.7982515178638556, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 36127 + }, + { + "epoch": 0.36128, + "grad_norm": 0.7909874900945881, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 36128 + }, + { + "epoch": 0.36129, + "grad_norm": 0.8019591084395326, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 36129 + }, + { + "epoch": 0.3613, + "grad_norm": 0.7684497860591941, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 36130 + }, + { + "epoch": 0.36131, + "grad_norm": 0.8040806115349024, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 36131 + }, + { + "epoch": 0.36132, + "grad_norm": 0.8882085343531806, + "learning_rate": 0.003, + "loss": 4.027, + "step": 36132 + }, + { + "epoch": 0.36133, + "grad_norm": 0.9901819485003814, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 36133 + }, + { + "epoch": 0.36134, + "grad_norm": 1.0385208324822233, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 36134 + }, + { + "epoch": 0.36135, + "grad_norm": 0.9264409310881075, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 36135 + }, + { + "epoch": 0.36136, + "grad_norm": 0.8599065087945325, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 36136 + }, + { + "epoch": 0.36137, + "grad_norm": 0.9327767222636576, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 36137 + }, + { + "epoch": 0.36138, + "grad_norm": 1.1985729888110432, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 36138 + }, + { + "epoch": 0.36139, + "grad_norm": 0.9573295771533262, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 36139 + }, + { + "epoch": 0.3614, + "grad_norm": 0.8728197217041107, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 36140 + }, + { + "epoch": 0.36141, + "grad_norm": 0.95722021178269, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 36141 + }, + { + "epoch": 0.36142, + "grad_norm": 1.0145648959553046, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 36142 + }, + { + "epoch": 0.36143, + "grad_norm": 0.9828182633084855, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 36143 + }, + { + "epoch": 0.36144, + "grad_norm": 1.058310520756486, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 36144 + }, + { + "epoch": 0.36145, + "grad_norm": 0.7990877877546216, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 36145 + }, + { + "epoch": 0.36146, + "grad_norm": 0.7126085794699403, + "learning_rate": 0.003, + "loss": 4.0023, + "step": 36146 + }, + { + "epoch": 0.36147, + "grad_norm": 0.7934433109564034, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 36147 + }, + { + "epoch": 0.36148, + "grad_norm": 0.9652966157151753, + "learning_rate": 0.003, + "loss": 4.018, + "step": 36148 + }, + { + "epoch": 0.36149, + "grad_norm": 1.229237883318824, + "learning_rate": 0.003, + "loss": 4.083, + "step": 36149 + }, + { + "epoch": 0.3615, + "grad_norm": 0.7048830672041387, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 36150 + }, + { + "epoch": 0.36151, + "grad_norm": 0.6602110977273594, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 36151 + }, + { + "epoch": 0.36152, + "grad_norm": 0.6936658839485019, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 36152 + }, + { + "epoch": 0.36153, + "grad_norm": 0.6863970946329797, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 36153 + }, + { + "epoch": 0.36154, + "grad_norm": 0.8347747327762084, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 36154 + }, + { + "epoch": 0.36155, + "grad_norm": 0.9771446961618659, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 36155 + }, + { + "epoch": 0.36156, + "grad_norm": 1.0015932309884417, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 36156 + }, + { + "epoch": 0.36157, + "grad_norm": 0.8978324946709514, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 36157 + }, + { + "epoch": 0.36158, + "grad_norm": 0.6859622968697874, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 36158 + }, + { + "epoch": 0.36159, + "grad_norm": 0.6962456779324124, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 36159 + }, + { + "epoch": 0.3616, + "grad_norm": 0.6923240896203624, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 36160 + }, + { + "epoch": 0.36161, + "grad_norm": 0.6471884452382608, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 36161 + }, + { + "epoch": 0.36162, + "grad_norm": 0.6573532167043596, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 36162 + }, + { + "epoch": 0.36163, + "grad_norm": 0.7373046631249566, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 36163 + }, + { + "epoch": 0.36164, + "grad_norm": 0.920474630506166, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 36164 + }, + { + "epoch": 0.36165, + "grad_norm": 1.108397612993503, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 36165 + }, + { + "epoch": 0.36166, + "grad_norm": 0.9016056333821059, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 36166 + }, + { + "epoch": 0.36167, + "grad_norm": 0.8643930102276338, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 36167 + }, + { + "epoch": 0.36168, + "grad_norm": 0.888597876752641, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 36168 + }, + { + "epoch": 0.36169, + "grad_norm": 0.846943863960629, + "learning_rate": 0.003, + "loss": 3.9924, + "step": 36169 + }, + { + "epoch": 0.3617, + "grad_norm": 0.9764453456094867, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 36170 + }, + { + "epoch": 0.36171, + "grad_norm": 1.143743736113318, + "learning_rate": 0.003, + "loss": 4.061, + "step": 36171 + }, + { + "epoch": 0.36172, + "grad_norm": 0.8327376858613246, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 36172 + }, + { + "epoch": 0.36173, + "grad_norm": 0.7309434660539891, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 36173 + }, + { + "epoch": 0.36174, + "grad_norm": 0.6660888360870262, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 36174 + }, + { + "epoch": 0.36175, + "grad_norm": 0.7124676250836992, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 36175 + }, + { + "epoch": 0.36176, + "grad_norm": 0.6128495700448088, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 36176 + }, + { + "epoch": 0.36177, + "grad_norm": 0.6230516105800905, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 36177 + }, + { + "epoch": 0.36178, + "grad_norm": 0.5923232415104932, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 36178 + }, + { + "epoch": 0.36179, + "grad_norm": 0.6682500241185835, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 36179 + }, + { + "epoch": 0.3618, + "grad_norm": 0.6997058193621772, + "learning_rate": 0.003, + "loss": 3.9757, + "step": 36180 + }, + { + "epoch": 0.36181, + "grad_norm": 0.8392596251297147, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 36181 + }, + { + "epoch": 0.36182, + "grad_norm": 0.9161047905966407, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 36182 + }, + { + "epoch": 0.36183, + "grad_norm": 0.971544424638284, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 36183 + }, + { + "epoch": 0.36184, + "grad_norm": 1.023079795601662, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 36184 + }, + { + "epoch": 0.36185, + "grad_norm": 0.9068379067977755, + "learning_rate": 0.003, + "loss": 4.05, + "step": 36185 + }, + { + "epoch": 0.36186, + "grad_norm": 0.789082451988211, + "learning_rate": 0.003, + "loss": 3.991, + "step": 36186 + }, + { + "epoch": 0.36187, + "grad_norm": 0.790771193544146, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 36187 + }, + { + "epoch": 0.36188, + "grad_norm": 0.6512332994160208, + "learning_rate": 0.003, + "loss": 4.025, + "step": 36188 + }, + { + "epoch": 0.36189, + "grad_norm": 0.7669990763498676, + "learning_rate": 0.003, + "loss": 4.0801, + "step": 36189 + }, + { + "epoch": 0.3619, + "grad_norm": 0.8858381009421384, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 36190 + }, + { + "epoch": 0.36191, + "grad_norm": 1.0607991232283973, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 36191 + }, + { + "epoch": 0.36192, + "grad_norm": 1.1149693762582151, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 36192 + }, + { + "epoch": 0.36193, + "grad_norm": 0.874412670871847, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 36193 + }, + { + "epoch": 0.36194, + "grad_norm": 0.7874914991880638, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 36194 + }, + { + "epoch": 0.36195, + "grad_norm": 0.7441181103277933, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 36195 + }, + { + "epoch": 0.36196, + "grad_norm": 0.7814051866003274, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 36196 + }, + { + "epoch": 0.36197, + "grad_norm": 0.7465618571726641, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 36197 + }, + { + "epoch": 0.36198, + "grad_norm": 0.700901339469586, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 36198 + }, + { + "epoch": 0.36199, + "grad_norm": 0.6986393562300172, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 36199 + }, + { + "epoch": 0.362, + "grad_norm": 0.8080078696104913, + "learning_rate": 0.003, + "loss": 4.033, + "step": 36200 + }, + { + "epoch": 0.36201, + "grad_norm": 0.9492335354784401, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 36201 + }, + { + "epoch": 0.36202, + "grad_norm": 1.1195237205748318, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 36202 + }, + { + "epoch": 0.36203, + "grad_norm": 0.9619534304330097, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 36203 + }, + { + "epoch": 0.36204, + "grad_norm": 0.9654682793661412, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 36204 + }, + { + "epoch": 0.36205, + "grad_norm": 0.9032156095779432, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 36205 + }, + { + "epoch": 0.36206, + "grad_norm": 0.8736426353586842, + "learning_rate": 0.003, + "loss": 4.049, + "step": 36206 + }, + { + "epoch": 0.36207, + "grad_norm": 0.856212862184381, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 36207 + }, + { + "epoch": 0.36208, + "grad_norm": 0.8237365444352605, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 36208 + }, + { + "epoch": 0.36209, + "grad_norm": 0.7310085195974024, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 36209 + }, + { + "epoch": 0.3621, + "grad_norm": 0.6177493859289492, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 36210 + }, + { + "epoch": 0.36211, + "grad_norm": 0.6639200415015591, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 36211 + }, + { + "epoch": 0.36212, + "grad_norm": 0.7614030767776986, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 36212 + }, + { + "epoch": 0.36213, + "grad_norm": 0.787124520262374, + "learning_rate": 0.003, + "loss": 4.028, + "step": 36213 + }, + { + "epoch": 0.36214, + "grad_norm": 0.7862870018336456, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 36214 + }, + { + "epoch": 0.36215, + "grad_norm": 0.8033513147990177, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 36215 + }, + { + "epoch": 0.36216, + "grad_norm": 0.7965167474892196, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 36216 + }, + { + "epoch": 0.36217, + "grad_norm": 0.7587709019685444, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 36217 + }, + { + "epoch": 0.36218, + "grad_norm": 0.8333925991319455, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 36218 + }, + { + "epoch": 0.36219, + "grad_norm": 0.9709513394806073, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 36219 + }, + { + "epoch": 0.3622, + "grad_norm": 1.1883017302849523, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 36220 + }, + { + "epoch": 0.36221, + "grad_norm": 0.9636597417948873, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 36221 + }, + { + "epoch": 0.36222, + "grad_norm": 0.908867206769521, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 36222 + }, + { + "epoch": 0.36223, + "grad_norm": 0.9756587723497309, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 36223 + }, + { + "epoch": 0.36224, + "grad_norm": 1.0448911952801903, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 36224 + }, + { + "epoch": 0.36225, + "grad_norm": 0.9795876532947335, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 36225 + }, + { + "epoch": 0.36226, + "grad_norm": 1.0585674650517614, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 36226 + }, + { + "epoch": 0.36227, + "grad_norm": 0.9032206809531289, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 36227 + }, + { + "epoch": 0.36228, + "grad_norm": 0.9007256365978851, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 36228 + }, + { + "epoch": 0.36229, + "grad_norm": 0.8914711308971454, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 36229 + }, + { + "epoch": 0.3623, + "grad_norm": 0.9127984806604157, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 36230 + }, + { + "epoch": 0.36231, + "grad_norm": 0.7955733875682284, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 36231 + }, + { + "epoch": 0.36232, + "grad_norm": 0.660877152234582, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 36232 + }, + { + "epoch": 0.36233, + "grad_norm": 0.6509815074935072, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 36233 + }, + { + "epoch": 0.36234, + "grad_norm": 0.7214909120061157, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 36234 + }, + { + "epoch": 0.36235, + "grad_norm": 0.6656522527183739, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 36235 + }, + { + "epoch": 0.36236, + "grad_norm": 0.614947849619768, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 36236 + }, + { + "epoch": 0.36237, + "grad_norm": 0.6075525226059626, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 36237 + }, + { + "epoch": 0.36238, + "grad_norm": 0.5457880925374795, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 36238 + }, + { + "epoch": 0.36239, + "grad_norm": 0.4672633832482595, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 36239 + }, + { + "epoch": 0.3624, + "grad_norm": 0.5548144415316796, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 36240 + }, + { + "epoch": 0.36241, + "grad_norm": 0.6658803877803623, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 36241 + }, + { + "epoch": 0.36242, + "grad_norm": 0.6927712681945006, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 36242 + }, + { + "epoch": 0.36243, + "grad_norm": 0.7323619116089557, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 36243 + }, + { + "epoch": 0.36244, + "grad_norm": 0.9373086392378441, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 36244 + }, + { + "epoch": 0.36245, + "grad_norm": 1.262321924230984, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 36245 + }, + { + "epoch": 0.36246, + "grad_norm": 0.7035161011977403, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 36246 + }, + { + "epoch": 0.36247, + "grad_norm": 0.6609472594573719, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 36247 + }, + { + "epoch": 0.36248, + "grad_norm": 0.619833466609567, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 36248 + }, + { + "epoch": 0.36249, + "grad_norm": 0.7743331080607225, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 36249 + }, + { + "epoch": 0.3625, + "grad_norm": 0.9481274961985916, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 36250 + }, + { + "epoch": 0.36251, + "grad_norm": 1.0358524941378258, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 36251 + }, + { + "epoch": 0.36252, + "grad_norm": 0.8926284841298812, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 36252 + }, + { + "epoch": 0.36253, + "grad_norm": 0.8682544048236692, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 36253 + }, + { + "epoch": 0.36254, + "grad_norm": 0.9482589250291137, + "learning_rate": 0.003, + "loss": 4.043, + "step": 36254 + }, + { + "epoch": 0.36255, + "grad_norm": 1.0335902004962085, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 36255 + }, + { + "epoch": 0.36256, + "grad_norm": 1.0518636195628974, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 36256 + }, + { + "epoch": 0.36257, + "grad_norm": 0.8817075698153863, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 36257 + }, + { + "epoch": 0.36258, + "grad_norm": 0.8067763461991213, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 36258 + }, + { + "epoch": 0.36259, + "grad_norm": 0.7697393684161615, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 36259 + }, + { + "epoch": 0.3626, + "grad_norm": 0.6430064631151031, + "learning_rate": 0.003, + "loss": 4.041, + "step": 36260 + }, + { + "epoch": 0.36261, + "grad_norm": 0.6103638766200216, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 36261 + }, + { + "epoch": 0.36262, + "grad_norm": 0.5878085277274959, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 36262 + }, + { + "epoch": 0.36263, + "grad_norm": 0.7266600791676467, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 36263 + }, + { + "epoch": 0.36264, + "grad_norm": 0.9333222421220899, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 36264 + }, + { + "epoch": 0.36265, + "grad_norm": 1.3256698927270232, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 36265 + }, + { + "epoch": 0.36266, + "grad_norm": 0.8180811088912131, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 36266 + }, + { + "epoch": 0.36267, + "grad_norm": 0.734718464180658, + "learning_rate": 0.003, + "loss": 4.042, + "step": 36267 + }, + { + "epoch": 0.36268, + "grad_norm": 0.8546310364129254, + "learning_rate": 0.003, + "loss": 4.03, + "step": 36268 + }, + { + "epoch": 0.36269, + "grad_norm": 0.8868883605057428, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 36269 + }, + { + "epoch": 0.3627, + "grad_norm": 0.9649382127038184, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 36270 + }, + { + "epoch": 0.36271, + "grad_norm": 0.9931772477319083, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 36271 + }, + { + "epoch": 0.36272, + "grad_norm": 1.011665788818521, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 36272 + }, + { + "epoch": 0.36273, + "grad_norm": 0.9203000124233275, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 36273 + }, + { + "epoch": 0.36274, + "grad_norm": 0.8280506201135917, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 36274 + }, + { + "epoch": 0.36275, + "grad_norm": 0.8044854944043571, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 36275 + }, + { + "epoch": 0.36276, + "grad_norm": 0.7381473732696936, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 36276 + }, + { + "epoch": 0.36277, + "grad_norm": 0.6105768013245975, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 36277 + }, + { + "epoch": 0.36278, + "grad_norm": 0.6697942125377337, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 36278 + }, + { + "epoch": 0.36279, + "grad_norm": 0.7410856970185403, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 36279 + }, + { + "epoch": 0.3628, + "grad_norm": 0.96070763581087, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 36280 + }, + { + "epoch": 0.36281, + "grad_norm": 1.2550919409801142, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 36281 + }, + { + "epoch": 0.36282, + "grad_norm": 0.7328993168884109, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 36282 + }, + { + "epoch": 0.36283, + "grad_norm": 0.8341493656609968, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 36283 + }, + { + "epoch": 0.36284, + "grad_norm": 0.8783319335079848, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 36284 + }, + { + "epoch": 0.36285, + "grad_norm": 0.8574284327199292, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 36285 + }, + { + "epoch": 0.36286, + "grad_norm": 0.9123835995646858, + "learning_rate": 0.003, + "loss": 4.05, + "step": 36286 + }, + { + "epoch": 0.36287, + "grad_norm": 0.8464527802672702, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 36287 + }, + { + "epoch": 0.36288, + "grad_norm": 0.7082198155780018, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 36288 + }, + { + "epoch": 0.36289, + "grad_norm": 0.742568331082434, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 36289 + }, + { + "epoch": 0.3629, + "grad_norm": 0.792721573195771, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 36290 + }, + { + "epoch": 0.36291, + "grad_norm": 0.8462339427699568, + "learning_rate": 0.003, + "loss": 4.027, + "step": 36291 + }, + { + "epoch": 0.36292, + "grad_norm": 0.9830715235993148, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 36292 + }, + { + "epoch": 0.36293, + "grad_norm": 1.1889045516886718, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 36293 + }, + { + "epoch": 0.36294, + "grad_norm": 0.8276006174918804, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 36294 + }, + { + "epoch": 0.36295, + "grad_norm": 0.8398070382009011, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 36295 + }, + { + "epoch": 0.36296, + "grad_norm": 0.8669806009313794, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 36296 + }, + { + "epoch": 0.36297, + "grad_norm": 0.9257141343445308, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 36297 + }, + { + "epoch": 0.36298, + "grad_norm": 0.8651371946493249, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 36298 + }, + { + "epoch": 0.36299, + "grad_norm": 0.8724008175860735, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 36299 + }, + { + "epoch": 0.363, + "grad_norm": 0.8767862820707508, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 36300 + }, + { + "epoch": 0.36301, + "grad_norm": 0.8204147388872695, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 36301 + }, + { + "epoch": 0.36302, + "grad_norm": 0.7980359375239197, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 36302 + }, + { + "epoch": 0.36303, + "grad_norm": 1.0242747745344567, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 36303 + }, + { + "epoch": 0.36304, + "grad_norm": 1.0948889101150332, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 36304 + }, + { + "epoch": 0.36305, + "grad_norm": 0.800103064250179, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 36305 + }, + { + "epoch": 0.36306, + "grad_norm": 0.7434173381607175, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 36306 + }, + { + "epoch": 0.36307, + "grad_norm": 0.7272478080741488, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 36307 + }, + { + "epoch": 0.36308, + "grad_norm": 0.6919457760456812, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 36308 + }, + { + "epoch": 0.36309, + "grad_norm": 0.5955649060802995, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 36309 + }, + { + "epoch": 0.3631, + "grad_norm": 0.5661872243203214, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 36310 + }, + { + "epoch": 0.36311, + "grad_norm": 0.5526797008745544, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 36311 + }, + { + "epoch": 0.36312, + "grad_norm": 0.5419313640286038, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 36312 + }, + { + "epoch": 0.36313, + "grad_norm": 0.5955120082158624, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 36313 + }, + { + "epoch": 0.36314, + "grad_norm": 0.6158296183605521, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 36314 + }, + { + "epoch": 0.36315, + "grad_norm": 0.6958191405566765, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 36315 + }, + { + "epoch": 0.36316, + "grad_norm": 0.8152516129318739, + "learning_rate": 0.003, + "loss": 4.033, + "step": 36316 + }, + { + "epoch": 0.36317, + "grad_norm": 1.0453396314225678, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 36317 + }, + { + "epoch": 0.36318, + "grad_norm": 1.2429785766089754, + "learning_rate": 0.003, + "loss": 4.025, + "step": 36318 + }, + { + "epoch": 0.36319, + "grad_norm": 0.8175012427016766, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 36319 + }, + { + "epoch": 0.3632, + "grad_norm": 0.6397428087970309, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 36320 + }, + { + "epoch": 0.36321, + "grad_norm": 0.5949305741056974, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 36321 + }, + { + "epoch": 0.36322, + "grad_norm": 0.5987762797492557, + "learning_rate": 0.003, + "loss": 4.035, + "step": 36322 + }, + { + "epoch": 0.36323, + "grad_norm": 0.5284426636750944, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 36323 + }, + { + "epoch": 0.36324, + "grad_norm": 0.6251713565036452, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 36324 + }, + { + "epoch": 0.36325, + "grad_norm": 0.7147427143685434, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 36325 + }, + { + "epoch": 0.36326, + "grad_norm": 0.856010296595972, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 36326 + }, + { + "epoch": 0.36327, + "grad_norm": 1.0617090833645098, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 36327 + }, + { + "epoch": 0.36328, + "grad_norm": 1.1899525649758427, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 36328 + }, + { + "epoch": 0.36329, + "grad_norm": 0.8183078835949611, + "learning_rate": 0.003, + "loss": 4.005, + "step": 36329 + }, + { + "epoch": 0.3633, + "grad_norm": 0.7955412755294798, + "learning_rate": 0.003, + "loss": 3.9925, + "step": 36330 + }, + { + "epoch": 0.36331, + "grad_norm": 0.8553720328526043, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 36331 + }, + { + "epoch": 0.36332, + "grad_norm": 0.859420393789253, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 36332 + }, + { + "epoch": 0.36333, + "grad_norm": 0.8095303585501911, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 36333 + }, + { + "epoch": 0.36334, + "grad_norm": 0.7662665552826168, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 36334 + }, + { + "epoch": 0.36335, + "grad_norm": 0.9640801674403585, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 36335 + }, + { + "epoch": 0.36336, + "grad_norm": 1.0493136443314859, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 36336 + }, + { + "epoch": 0.36337, + "grad_norm": 0.9463989793402499, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 36337 + }, + { + "epoch": 0.36338, + "grad_norm": 0.8942619954183634, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 36338 + }, + { + "epoch": 0.36339, + "grad_norm": 0.963825999609164, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 36339 + }, + { + "epoch": 0.3634, + "grad_norm": 0.9391473199608329, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 36340 + }, + { + "epoch": 0.36341, + "grad_norm": 0.9339162845847446, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 36341 + }, + { + "epoch": 0.36342, + "grad_norm": 0.9244365984645252, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 36342 + }, + { + "epoch": 0.36343, + "grad_norm": 0.8798580645706109, + "learning_rate": 0.003, + "loss": 4.004, + "step": 36343 + }, + { + "epoch": 0.36344, + "grad_norm": 0.9026414840166828, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 36344 + }, + { + "epoch": 0.36345, + "grad_norm": 0.8770321521037077, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 36345 + }, + { + "epoch": 0.36346, + "grad_norm": 1.0634632833048643, + "learning_rate": 0.003, + "loss": 4.0863, + "step": 36346 + }, + { + "epoch": 0.36347, + "grad_norm": 0.9664353986765878, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 36347 + }, + { + "epoch": 0.36348, + "grad_norm": 1.2131698332961125, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 36348 + }, + { + "epoch": 0.36349, + "grad_norm": 1.0185532602455705, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 36349 + }, + { + "epoch": 0.3635, + "grad_norm": 1.0412027405608237, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 36350 + }, + { + "epoch": 0.36351, + "grad_norm": 0.9669361837470503, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 36351 + }, + { + "epoch": 0.36352, + "grad_norm": 0.9128808781949306, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 36352 + }, + { + "epoch": 0.36353, + "grad_norm": 0.8929158463731717, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 36353 + }, + { + "epoch": 0.36354, + "grad_norm": 0.90231506068704, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 36354 + }, + { + "epoch": 0.36355, + "grad_norm": 0.990097538026194, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 36355 + }, + { + "epoch": 0.36356, + "grad_norm": 1.0956198660761587, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 36356 + }, + { + "epoch": 0.36357, + "grad_norm": 0.7512082141292395, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 36357 + }, + { + "epoch": 0.36358, + "grad_norm": 0.6735672660579661, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 36358 + }, + { + "epoch": 0.36359, + "grad_norm": 0.5995543826270673, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 36359 + }, + { + "epoch": 0.3636, + "grad_norm": 0.5657821299481227, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 36360 + }, + { + "epoch": 0.36361, + "grad_norm": 0.6082097675416681, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 36361 + }, + { + "epoch": 0.36362, + "grad_norm": 0.5917588070638943, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 36362 + }, + { + "epoch": 0.36363, + "grad_norm": 0.5997673302766807, + "learning_rate": 0.003, + "loss": 4.037, + "step": 36363 + }, + { + "epoch": 0.36364, + "grad_norm": 0.6848877594404014, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 36364 + }, + { + "epoch": 0.36365, + "grad_norm": 0.7629301400004116, + "learning_rate": 0.003, + "loss": 4.049, + "step": 36365 + }, + { + "epoch": 0.36366, + "grad_norm": 0.7799071791364665, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 36366 + }, + { + "epoch": 0.36367, + "grad_norm": 0.825402730629563, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 36367 + }, + { + "epoch": 0.36368, + "grad_norm": 0.8400036225511053, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 36368 + }, + { + "epoch": 0.36369, + "grad_norm": 0.8231233478751402, + "learning_rate": 0.003, + "loss": 3.9916, + "step": 36369 + }, + { + "epoch": 0.3637, + "grad_norm": 0.7614769720313015, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 36370 + }, + { + "epoch": 0.36371, + "grad_norm": 0.7696666975532609, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 36371 + }, + { + "epoch": 0.36372, + "grad_norm": 0.7318861469630122, + "learning_rate": 0.003, + "loss": 4.046, + "step": 36372 + }, + { + "epoch": 0.36373, + "grad_norm": 0.7181477713814696, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 36373 + }, + { + "epoch": 0.36374, + "grad_norm": 0.7743388044837819, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 36374 + }, + { + "epoch": 0.36375, + "grad_norm": 0.8276135086343156, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 36375 + }, + { + "epoch": 0.36376, + "grad_norm": 0.9163080984602835, + "learning_rate": 0.003, + "loss": 4.009, + "step": 36376 + }, + { + "epoch": 0.36377, + "grad_norm": 1.0744686903791314, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 36377 + }, + { + "epoch": 0.36378, + "grad_norm": 1.1034002156727107, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 36378 + }, + { + "epoch": 0.36379, + "grad_norm": 0.8054136515053291, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 36379 + }, + { + "epoch": 0.3638, + "grad_norm": 0.8356986487593246, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 36380 + }, + { + "epoch": 0.36381, + "grad_norm": 0.8434480924693779, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 36381 + }, + { + "epoch": 0.36382, + "grad_norm": 0.7051251430826544, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 36382 + }, + { + "epoch": 0.36383, + "grad_norm": 0.6365038916414174, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 36383 + }, + { + "epoch": 0.36384, + "grad_norm": 0.7288812665048945, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 36384 + }, + { + "epoch": 0.36385, + "grad_norm": 0.7986483401439243, + "learning_rate": 0.003, + "loss": 4.032, + "step": 36385 + }, + { + "epoch": 0.36386, + "grad_norm": 0.8211969737426479, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 36386 + }, + { + "epoch": 0.36387, + "grad_norm": 1.0148353160769727, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 36387 + }, + { + "epoch": 0.36388, + "grad_norm": 1.221057584679625, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 36388 + }, + { + "epoch": 0.36389, + "grad_norm": 0.7857698443818177, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 36389 + }, + { + "epoch": 0.3639, + "grad_norm": 0.7227101061201727, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 36390 + }, + { + "epoch": 0.36391, + "grad_norm": 0.7448904301630686, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 36391 + }, + { + "epoch": 0.36392, + "grad_norm": 0.8075018677994241, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 36392 + }, + { + "epoch": 0.36393, + "grad_norm": 0.8445131589945339, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 36393 + }, + { + "epoch": 0.36394, + "grad_norm": 0.7459969826060584, + "learning_rate": 0.003, + "loss": 4.02, + "step": 36394 + }, + { + "epoch": 0.36395, + "grad_norm": 0.7861081334465677, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 36395 + }, + { + "epoch": 0.36396, + "grad_norm": 0.6544813439038074, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 36396 + }, + { + "epoch": 0.36397, + "grad_norm": 0.6613053115839899, + "learning_rate": 0.003, + "loss": 3.9861, + "step": 36397 + }, + { + "epoch": 0.36398, + "grad_norm": 0.6169127309098001, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 36398 + }, + { + "epoch": 0.36399, + "grad_norm": 0.6036182319400673, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 36399 + }, + { + "epoch": 0.364, + "grad_norm": 0.6642454772091294, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 36400 + }, + { + "epoch": 0.36401, + "grad_norm": 0.7510392162257634, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 36401 + }, + { + "epoch": 0.36402, + "grad_norm": 0.8673079438618302, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 36402 + }, + { + "epoch": 0.36403, + "grad_norm": 1.0136878233116833, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 36403 + }, + { + "epoch": 0.36404, + "grad_norm": 1.2485502049586095, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 36404 + }, + { + "epoch": 0.36405, + "grad_norm": 0.6999384286265162, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 36405 + }, + { + "epoch": 0.36406, + "grad_norm": 0.7252634057631322, + "learning_rate": 0.003, + "loss": 4.0003, + "step": 36406 + }, + { + "epoch": 0.36407, + "grad_norm": 0.6761004332581878, + "learning_rate": 0.003, + "loss": 4.042, + "step": 36407 + }, + { + "epoch": 0.36408, + "grad_norm": 0.6723567040421798, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 36408 + }, + { + "epoch": 0.36409, + "grad_norm": 0.834088354721908, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 36409 + }, + { + "epoch": 0.3641, + "grad_norm": 0.9599033206848576, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 36410 + }, + { + "epoch": 0.36411, + "grad_norm": 1.0354436428976026, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 36411 + }, + { + "epoch": 0.36412, + "grad_norm": 0.840839617412903, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 36412 + }, + { + "epoch": 0.36413, + "grad_norm": 0.8755147788979334, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 36413 + }, + { + "epoch": 0.36414, + "grad_norm": 1.051309794148287, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 36414 + }, + { + "epoch": 0.36415, + "grad_norm": 0.9547141734866035, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 36415 + }, + { + "epoch": 0.36416, + "grad_norm": 0.970036866682725, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 36416 + }, + { + "epoch": 0.36417, + "grad_norm": 0.9761036650539251, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 36417 + }, + { + "epoch": 0.36418, + "grad_norm": 0.9499815074888036, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 36418 + }, + { + "epoch": 0.36419, + "grad_norm": 1.07042715161078, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 36419 + }, + { + "epoch": 0.3642, + "grad_norm": 0.9371619461417041, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 36420 + }, + { + "epoch": 0.36421, + "grad_norm": 0.9415725087399028, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 36421 + }, + { + "epoch": 0.36422, + "grad_norm": 1.0571671457261944, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 36422 + }, + { + "epoch": 0.36423, + "grad_norm": 1.048134111916943, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 36423 + }, + { + "epoch": 0.36424, + "grad_norm": 1.0399985501344564, + "learning_rate": 0.003, + "loss": 4.07, + "step": 36424 + }, + { + "epoch": 0.36425, + "grad_norm": 1.1258533057825104, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 36425 + }, + { + "epoch": 0.36426, + "grad_norm": 0.8960731412840298, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 36426 + }, + { + "epoch": 0.36427, + "grad_norm": 0.8036487700923993, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 36427 + }, + { + "epoch": 0.36428, + "grad_norm": 0.8262004370078959, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 36428 + }, + { + "epoch": 0.36429, + "grad_norm": 0.8027974059362506, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 36429 + }, + { + "epoch": 0.3643, + "grad_norm": 0.8490915404994654, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 36430 + }, + { + "epoch": 0.36431, + "grad_norm": 0.8572941528190505, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 36431 + }, + { + "epoch": 0.36432, + "grad_norm": 0.8396369063016456, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 36432 + }, + { + "epoch": 0.36433, + "grad_norm": 0.7978776154950192, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 36433 + }, + { + "epoch": 0.36434, + "grad_norm": 0.7550466608109624, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 36434 + }, + { + "epoch": 0.36435, + "grad_norm": 0.8012656695427353, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 36435 + }, + { + "epoch": 0.36436, + "grad_norm": 0.8720115444127275, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 36436 + }, + { + "epoch": 0.36437, + "grad_norm": 1.0865137054949707, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 36437 + }, + { + "epoch": 0.36438, + "grad_norm": 1.048311000853558, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 36438 + }, + { + "epoch": 0.36439, + "grad_norm": 0.8972360328690727, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 36439 + }, + { + "epoch": 0.3644, + "grad_norm": 0.9064102967471543, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 36440 + }, + { + "epoch": 0.36441, + "grad_norm": 0.8750141945442573, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 36441 + }, + { + "epoch": 0.36442, + "grad_norm": 0.7719174222662374, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 36442 + }, + { + "epoch": 0.36443, + "grad_norm": 0.7327594650986564, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 36443 + }, + { + "epoch": 0.36444, + "grad_norm": 0.7040870664905965, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 36444 + }, + { + "epoch": 0.36445, + "grad_norm": 0.6062669825100394, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 36445 + }, + { + "epoch": 0.36446, + "grad_norm": 0.5245715395009786, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 36446 + }, + { + "epoch": 0.36447, + "grad_norm": 0.5515155189038042, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 36447 + }, + { + "epoch": 0.36448, + "grad_norm": 0.5841246082599573, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 36448 + }, + { + "epoch": 0.36449, + "grad_norm": 0.6887309203903704, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 36449 + }, + { + "epoch": 0.3645, + "grad_norm": 0.8651017455163379, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 36450 + }, + { + "epoch": 0.36451, + "grad_norm": 1.0217926307752794, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 36451 + }, + { + "epoch": 0.36452, + "grad_norm": 0.987272554423854, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 36452 + }, + { + "epoch": 0.36453, + "grad_norm": 0.8239042977223542, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 36453 + }, + { + "epoch": 0.36454, + "grad_norm": 0.6706774960487556, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 36454 + }, + { + "epoch": 0.36455, + "grad_norm": 0.6752546429278812, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 36455 + }, + { + "epoch": 0.36456, + "grad_norm": 0.6595212050032906, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 36456 + }, + { + "epoch": 0.36457, + "grad_norm": 0.72146520794854, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 36457 + }, + { + "epoch": 0.36458, + "grad_norm": 0.8387158186633052, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 36458 + }, + { + "epoch": 0.36459, + "grad_norm": 0.9043509283860192, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 36459 + }, + { + "epoch": 0.3646, + "grad_norm": 0.9529291860179945, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 36460 + }, + { + "epoch": 0.36461, + "grad_norm": 1.0489694442351196, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 36461 + }, + { + "epoch": 0.36462, + "grad_norm": 1.0910174151975458, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 36462 + }, + { + "epoch": 0.36463, + "grad_norm": 0.7871354448849948, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 36463 + }, + { + "epoch": 0.36464, + "grad_norm": 0.6424794131519098, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 36464 + }, + { + "epoch": 0.36465, + "grad_norm": 0.6382594622623775, + "learning_rate": 0.003, + "loss": 4.048, + "step": 36465 + }, + { + "epoch": 0.36466, + "grad_norm": 0.7009211483736809, + "learning_rate": 0.003, + "loss": 4.025, + "step": 36466 + }, + { + "epoch": 0.36467, + "grad_norm": 0.7045287606272039, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 36467 + }, + { + "epoch": 0.36468, + "grad_norm": 0.7659100137306611, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 36468 + }, + { + "epoch": 0.36469, + "grad_norm": 0.8464021900357759, + "learning_rate": 0.003, + "loss": 4.021, + "step": 36469 + }, + { + "epoch": 0.3647, + "grad_norm": 0.8465601008110099, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 36470 + }, + { + "epoch": 0.36471, + "grad_norm": 0.8370711490504655, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 36471 + }, + { + "epoch": 0.36472, + "grad_norm": 0.7982038883632879, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 36472 + }, + { + "epoch": 0.36473, + "grad_norm": 0.7890720223197222, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 36473 + }, + { + "epoch": 0.36474, + "grad_norm": 0.8207447373351334, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 36474 + }, + { + "epoch": 0.36475, + "grad_norm": 0.8037359901099203, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 36475 + }, + { + "epoch": 0.36476, + "grad_norm": 0.9737390863513957, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 36476 + }, + { + "epoch": 0.36477, + "grad_norm": 0.985412628287471, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 36477 + }, + { + "epoch": 0.36478, + "grad_norm": 0.9136000370649007, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 36478 + }, + { + "epoch": 0.36479, + "grad_norm": 0.9666915168441272, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 36479 + }, + { + "epoch": 0.3648, + "grad_norm": 0.9461957480528906, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 36480 + }, + { + "epoch": 0.36481, + "grad_norm": 1.0235336717821757, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 36481 + }, + { + "epoch": 0.36482, + "grad_norm": 1.026865537680538, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 36482 + }, + { + "epoch": 0.36483, + "grad_norm": 0.9833450365724702, + "learning_rate": 0.003, + "loss": 4.024, + "step": 36483 + }, + { + "epoch": 0.36484, + "grad_norm": 0.9197750226636242, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 36484 + }, + { + "epoch": 0.36485, + "grad_norm": 0.8568382599983689, + "learning_rate": 0.003, + "loss": 4.049, + "step": 36485 + }, + { + "epoch": 0.36486, + "grad_norm": 0.7148234949678913, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 36486 + }, + { + "epoch": 0.36487, + "grad_norm": 0.7568442345847668, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 36487 + }, + { + "epoch": 0.36488, + "grad_norm": 0.7545599461236181, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 36488 + }, + { + "epoch": 0.36489, + "grad_norm": 0.6677176155580218, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 36489 + }, + { + "epoch": 0.3649, + "grad_norm": 0.6089012366394658, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 36490 + }, + { + "epoch": 0.36491, + "grad_norm": 0.5841414515621771, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 36491 + }, + { + "epoch": 0.36492, + "grad_norm": 0.549840645253593, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 36492 + }, + { + "epoch": 0.36493, + "grad_norm": 0.6315561434348778, + "learning_rate": 0.003, + "loss": 4.024, + "step": 36493 + }, + { + "epoch": 0.36494, + "grad_norm": 0.7153539212511631, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 36494 + }, + { + "epoch": 0.36495, + "grad_norm": 0.7907647044898811, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 36495 + }, + { + "epoch": 0.36496, + "grad_norm": 0.7080546684387466, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 36496 + }, + { + "epoch": 0.36497, + "grad_norm": 0.695474512139161, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 36497 + }, + { + "epoch": 0.36498, + "grad_norm": 0.7372972667064898, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 36498 + }, + { + "epoch": 0.36499, + "grad_norm": 0.9359652810107976, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 36499 + }, + { + "epoch": 0.365, + "grad_norm": 1.2024600200255557, + "learning_rate": 0.003, + "loss": 4.042, + "step": 36500 + }, + { + "epoch": 0.36501, + "grad_norm": 0.7757727555612071, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 36501 + }, + { + "epoch": 0.36502, + "grad_norm": 0.8771790329999716, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 36502 + }, + { + "epoch": 0.36503, + "grad_norm": 1.0624709459730295, + "learning_rate": 0.003, + "loss": 4.041, + "step": 36503 + }, + { + "epoch": 0.36504, + "grad_norm": 1.0038547197315384, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 36504 + }, + { + "epoch": 0.36505, + "grad_norm": 0.988402713138743, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 36505 + }, + { + "epoch": 0.36506, + "grad_norm": 1.013365077473141, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 36506 + }, + { + "epoch": 0.36507, + "grad_norm": 1.0223047862823758, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 36507 + }, + { + "epoch": 0.36508, + "grad_norm": 1.0196268643988629, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 36508 + }, + { + "epoch": 0.36509, + "grad_norm": 0.984790542060423, + "learning_rate": 0.003, + "loss": 4.048, + "step": 36509 + }, + { + "epoch": 0.3651, + "grad_norm": 0.8341391349776293, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 36510 + }, + { + "epoch": 0.36511, + "grad_norm": 0.8197756967727201, + "learning_rate": 0.003, + "loss": 4.057, + "step": 36511 + }, + { + "epoch": 0.36512, + "grad_norm": 0.7115217834307136, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 36512 + }, + { + "epoch": 0.36513, + "grad_norm": 0.7280241151002692, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 36513 + }, + { + "epoch": 0.36514, + "grad_norm": 0.8089784176087421, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 36514 + }, + { + "epoch": 0.36515, + "grad_norm": 0.8863335128706781, + "learning_rate": 0.003, + "loss": 3.9998, + "step": 36515 + }, + { + "epoch": 0.36516, + "grad_norm": 1.0614415906992931, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 36516 + }, + { + "epoch": 0.36517, + "grad_norm": 1.2226911984576267, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 36517 + }, + { + "epoch": 0.36518, + "grad_norm": 0.6508982780377969, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 36518 + }, + { + "epoch": 0.36519, + "grad_norm": 0.550398788032293, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 36519 + }, + { + "epoch": 0.3652, + "grad_norm": 0.502953045083353, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 36520 + }, + { + "epoch": 0.36521, + "grad_norm": 0.55994222626727, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 36521 + }, + { + "epoch": 0.36522, + "grad_norm": 0.5310398362120261, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 36522 + }, + { + "epoch": 0.36523, + "grad_norm": 0.5803687861972866, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 36523 + }, + { + "epoch": 0.36524, + "grad_norm": 0.6661183262058182, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 36524 + }, + { + "epoch": 0.36525, + "grad_norm": 0.742529344792259, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 36525 + }, + { + "epoch": 0.36526, + "grad_norm": 0.8568907645832611, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 36526 + }, + { + "epoch": 0.36527, + "grad_norm": 1.046634573826985, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 36527 + }, + { + "epoch": 0.36528, + "grad_norm": 1.038621601571503, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 36528 + }, + { + "epoch": 0.36529, + "grad_norm": 0.7419030872499812, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 36529 + }, + { + "epoch": 0.3653, + "grad_norm": 0.7111666092705952, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 36530 + }, + { + "epoch": 0.36531, + "grad_norm": 0.7803219816140674, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 36531 + }, + { + "epoch": 0.36532, + "grad_norm": 0.8395015665585588, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 36532 + }, + { + "epoch": 0.36533, + "grad_norm": 0.8766471262904771, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 36533 + }, + { + "epoch": 0.36534, + "grad_norm": 0.77641373713843, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 36534 + }, + { + "epoch": 0.36535, + "grad_norm": 0.6923972121360056, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 36535 + }, + { + "epoch": 0.36536, + "grad_norm": 0.6919056555429821, + "learning_rate": 0.003, + "loss": 4.054, + "step": 36536 + }, + { + "epoch": 0.36537, + "grad_norm": 0.7487094708539219, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 36537 + }, + { + "epoch": 0.36538, + "grad_norm": 0.857414598144882, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 36538 + }, + { + "epoch": 0.36539, + "grad_norm": 0.9179232331584907, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 36539 + }, + { + "epoch": 0.3654, + "grad_norm": 1.0361064539009535, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 36540 + }, + { + "epoch": 0.36541, + "grad_norm": 1.0835575143783331, + "learning_rate": 0.003, + "loss": 4.046, + "step": 36541 + }, + { + "epoch": 0.36542, + "grad_norm": 0.9219259062724361, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 36542 + }, + { + "epoch": 0.36543, + "grad_norm": 0.7811366674516137, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 36543 + }, + { + "epoch": 0.36544, + "grad_norm": 0.8101520055851457, + "learning_rate": 0.003, + "loss": 4.038, + "step": 36544 + }, + { + "epoch": 0.36545, + "grad_norm": 0.8285603996343616, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 36545 + }, + { + "epoch": 0.36546, + "grad_norm": 0.8051004155107612, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 36546 + }, + { + "epoch": 0.36547, + "grad_norm": 0.7343823042282207, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 36547 + }, + { + "epoch": 0.36548, + "grad_norm": 0.7483566489415985, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 36548 + }, + { + "epoch": 0.36549, + "grad_norm": 0.7215370971415038, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 36549 + }, + { + "epoch": 0.3655, + "grad_norm": 0.8844961942555732, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 36550 + }, + { + "epoch": 0.36551, + "grad_norm": 0.9284332129281083, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 36551 + }, + { + "epoch": 0.36552, + "grad_norm": 0.89251703417917, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 36552 + }, + { + "epoch": 0.36553, + "grad_norm": 1.131589772566236, + "learning_rate": 0.003, + "loss": 4.041, + "step": 36553 + }, + { + "epoch": 0.36554, + "grad_norm": 1.0508476621536427, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 36554 + }, + { + "epoch": 0.36555, + "grad_norm": 0.8700429131849128, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 36555 + }, + { + "epoch": 0.36556, + "grad_norm": 0.6903366920604239, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 36556 + }, + { + "epoch": 0.36557, + "grad_norm": 0.6518306358806111, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 36557 + }, + { + "epoch": 0.36558, + "grad_norm": 0.6807751466776936, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 36558 + }, + { + "epoch": 0.36559, + "grad_norm": 0.6589532180728125, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 36559 + }, + { + "epoch": 0.3656, + "grad_norm": 0.6280179342082526, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 36560 + }, + { + "epoch": 0.36561, + "grad_norm": 0.6400465618837944, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 36561 + }, + { + "epoch": 0.36562, + "grad_norm": 0.6457624935854928, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 36562 + }, + { + "epoch": 0.36563, + "grad_norm": 0.6413547729388942, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 36563 + }, + { + "epoch": 0.36564, + "grad_norm": 0.6521515254757633, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 36564 + }, + { + "epoch": 0.36565, + "grad_norm": 0.6745709337696963, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 36565 + }, + { + "epoch": 0.36566, + "grad_norm": 0.7618007627907951, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 36566 + }, + { + "epoch": 0.36567, + "grad_norm": 0.8099643049322931, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 36567 + }, + { + "epoch": 0.36568, + "grad_norm": 0.7066833981198821, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 36568 + }, + { + "epoch": 0.36569, + "grad_norm": 0.7379068298646154, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 36569 + }, + { + "epoch": 0.3657, + "grad_norm": 0.8075754267791037, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 36570 + }, + { + "epoch": 0.36571, + "grad_norm": 0.9136425097905705, + "learning_rate": 0.003, + "loss": 3.996, + "step": 36571 + }, + { + "epoch": 0.36572, + "grad_norm": 1.1113180573469015, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 36572 + }, + { + "epoch": 0.36573, + "grad_norm": 0.8587573106710576, + "learning_rate": 0.003, + "loss": 4.02, + "step": 36573 + }, + { + "epoch": 0.36574, + "grad_norm": 0.8277832545605818, + "learning_rate": 0.003, + "loss": 4.033, + "step": 36574 + }, + { + "epoch": 0.36575, + "grad_norm": 1.0684809732462823, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 36575 + }, + { + "epoch": 0.36576, + "grad_norm": 1.1293849036700694, + "learning_rate": 0.003, + "loss": 4.054, + "step": 36576 + }, + { + "epoch": 0.36577, + "grad_norm": 0.9436452025817109, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 36577 + }, + { + "epoch": 0.36578, + "grad_norm": 0.9086046132283814, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 36578 + }, + { + "epoch": 0.36579, + "grad_norm": 0.8918164323448341, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 36579 + }, + { + "epoch": 0.3658, + "grad_norm": 0.8259220879738401, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 36580 + }, + { + "epoch": 0.36581, + "grad_norm": 0.8794133953691022, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 36581 + }, + { + "epoch": 0.36582, + "grad_norm": 0.9131596699371435, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 36582 + }, + { + "epoch": 0.36583, + "grad_norm": 0.7839162757038926, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 36583 + }, + { + "epoch": 0.36584, + "grad_norm": 0.908897468287137, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 36584 + }, + { + "epoch": 0.36585, + "grad_norm": 1.1428344500483354, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 36585 + }, + { + "epoch": 0.36586, + "grad_norm": 0.9028910237997174, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 36586 + }, + { + "epoch": 0.36587, + "grad_norm": 0.7795131490477474, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 36587 + }, + { + "epoch": 0.36588, + "grad_norm": 0.6832157620296251, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 36588 + }, + { + "epoch": 0.36589, + "grad_norm": 0.6728224681697682, + "learning_rate": 0.003, + "loss": 4.073, + "step": 36589 + }, + { + "epoch": 0.3659, + "grad_norm": 0.6320968841753755, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 36590 + }, + { + "epoch": 0.36591, + "grad_norm": 0.7429441684916932, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 36591 + }, + { + "epoch": 0.36592, + "grad_norm": 0.9729584940922472, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 36592 + }, + { + "epoch": 0.36593, + "grad_norm": 1.0789036186097705, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 36593 + }, + { + "epoch": 0.36594, + "grad_norm": 0.7787158725874122, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 36594 + }, + { + "epoch": 0.36595, + "grad_norm": 0.7875682584073803, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 36595 + }, + { + "epoch": 0.36596, + "grad_norm": 0.9180991734043435, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 36596 + }, + { + "epoch": 0.36597, + "grad_norm": 1.1046197494391206, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 36597 + }, + { + "epoch": 0.36598, + "grad_norm": 0.8908231765400296, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 36598 + }, + { + "epoch": 0.36599, + "grad_norm": 0.8188757728721163, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 36599 + }, + { + "epoch": 0.366, + "grad_norm": 0.7586402916309285, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 36600 + }, + { + "epoch": 0.36601, + "grad_norm": 0.7220535135194175, + "learning_rate": 0.003, + "loss": 4.019, + "step": 36601 + }, + { + "epoch": 0.36602, + "grad_norm": 0.8450799331507277, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 36602 + }, + { + "epoch": 0.36603, + "grad_norm": 1.0935043343652533, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 36603 + }, + { + "epoch": 0.36604, + "grad_norm": 1.0042392738495511, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 36604 + }, + { + "epoch": 0.36605, + "grad_norm": 1.0137972936831054, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 36605 + }, + { + "epoch": 0.36606, + "grad_norm": 0.9788240424477218, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 36606 + }, + { + "epoch": 0.36607, + "grad_norm": 0.7961688156322678, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 36607 + }, + { + "epoch": 0.36608, + "grad_norm": 0.7303037923089406, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 36608 + }, + { + "epoch": 0.36609, + "grad_norm": 0.8026156043250059, + "learning_rate": 0.003, + "loss": 3.9935, + "step": 36609 + }, + { + "epoch": 0.3661, + "grad_norm": 0.8643233319186874, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 36610 + }, + { + "epoch": 0.36611, + "grad_norm": 0.8197770683255005, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 36611 + }, + { + "epoch": 0.36612, + "grad_norm": 0.7796269887536348, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 36612 + }, + { + "epoch": 0.36613, + "grad_norm": 0.7055622524621578, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 36613 + }, + { + "epoch": 0.36614, + "grad_norm": 0.6794260082652922, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 36614 + }, + { + "epoch": 0.36615, + "grad_norm": 0.8080108141660408, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 36615 + }, + { + "epoch": 0.36616, + "grad_norm": 1.0631742272195284, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 36616 + }, + { + "epoch": 0.36617, + "grad_norm": 1.151315720798188, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 36617 + }, + { + "epoch": 0.36618, + "grad_norm": 0.9427338081737591, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 36618 + }, + { + "epoch": 0.36619, + "grad_norm": 0.8408545532691434, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 36619 + }, + { + "epoch": 0.3662, + "grad_norm": 0.7324158221008791, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 36620 + }, + { + "epoch": 0.36621, + "grad_norm": 0.7692790034145166, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 36621 + }, + { + "epoch": 0.36622, + "grad_norm": 0.7410294715896398, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 36622 + }, + { + "epoch": 0.36623, + "grad_norm": 0.8437166682962397, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 36623 + }, + { + "epoch": 0.36624, + "grad_norm": 0.8878876494820519, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 36624 + }, + { + "epoch": 0.36625, + "grad_norm": 0.9969711810201572, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 36625 + }, + { + "epoch": 0.36626, + "grad_norm": 0.9913260773226816, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 36626 + }, + { + "epoch": 0.36627, + "grad_norm": 0.9227603323647467, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 36627 + }, + { + "epoch": 0.36628, + "grad_norm": 0.9447869761039092, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 36628 + }, + { + "epoch": 0.36629, + "grad_norm": 0.8574809421964387, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 36629 + }, + { + "epoch": 0.3663, + "grad_norm": 0.7340974833364611, + "learning_rate": 0.003, + "loss": 4.016, + "step": 36630 + }, + { + "epoch": 0.36631, + "grad_norm": 0.6225649497932846, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 36631 + }, + { + "epoch": 0.36632, + "grad_norm": 0.6464111523124203, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 36632 + }, + { + "epoch": 0.36633, + "grad_norm": 0.7608565647215652, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 36633 + }, + { + "epoch": 0.36634, + "grad_norm": 0.8672855246874974, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 36634 + }, + { + "epoch": 0.36635, + "grad_norm": 0.9213669274090276, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 36635 + }, + { + "epoch": 0.36636, + "grad_norm": 0.8641469940174424, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 36636 + }, + { + "epoch": 0.36637, + "grad_norm": 0.7644530955733692, + "learning_rate": 0.003, + "loss": 4.036, + "step": 36637 + }, + { + "epoch": 0.36638, + "grad_norm": 0.7427254105771673, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 36638 + }, + { + "epoch": 0.36639, + "grad_norm": 0.6936476185427638, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 36639 + }, + { + "epoch": 0.3664, + "grad_norm": 0.7162568914973398, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 36640 + }, + { + "epoch": 0.36641, + "grad_norm": 0.8029029929893059, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 36641 + }, + { + "epoch": 0.36642, + "grad_norm": 0.8858851818058645, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 36642 + }, + { + "epoch": 0.36643, + "grad_norm": 0.9632069013491761, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 36643 + }, + { + "epoch": 0.36644, + "grad_norm": 1.122512012917059, + "learning_rate": 0.003, + "loss": 4.066, + "step": 36644 + }, + { + "epoch": 0.36645, + "grad_norm": 0.9161305225314225, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 36645 + }, + { + "epoch": 0.36646, + "grad_norm": 0.9611513836489961, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 36646 + }, + { + "epoch": 0.36647, + "grad_norm": 0.9171572064606387, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 36647 + }, + { + "epoch": 0.36648, + "grad_norm": 0.9285049442825689, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 36648 + }, + { + "epoch": 0.36649, + "grad_norm": 0.780418496089693, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 36649 + }, + { + "epoch": 0.3665, + "grad_norm": 0.7975875761118293, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 36650 + }, + { + "epoch": 0.36651, + "grad_norm": 0.7534025123051049, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 36651 + }, + { + "epoch": 0.36652, + "grad_norm": 0.8040083037746817, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 36652 + }, + { + "epoch": 0.36653, + "grad_norm": 1.0801409038303662, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 36653 + }, + { + "epoch": 0.36654, + "grad_norm": 1.1135770265834755, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 36654 + }, + { + "epoch": 0.36655, + "grad_norm": 0.7153518847991157, + "learning_rate": 0.003, + "loss": 4.03, + "step": 36655 + }, + { + "epoch": 0.36656, + "grad_norm": 0.6227994928262652, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 36656 + }, + { + "epoch": 0.36657, + "grad_norm": 0.6418640596417158, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 36657 + }, + { + "epoch": 0.36658, + "grad_norm": 0.6728088487854765, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 36658 + }, + { + "epoch": 0.36659, + "grad_norm": 0.61868041170304, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 36659 + }, + { + "epoch": 0.3666, + "grad_norm": 0.6601081365940414, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 36660 + }, + { + "epoch": 0.36661, + "grad_norm": 0.8016235006116494, + "learning_rate": 0.003, + "loss": 4.018, + "step": 36661 + }, + { + "epoch": 0.36662, + "grad_norm": 0.9176986737082055, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 36662 + }, + { + "epoch": 0.36663, + "grad_norm": 0.885971231912432, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 36663 + }, + { + "epoch": 0.36664, + "grad_norm": 0.8212379021361692, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 36664 + }, + { + "epoch": 0.36665, + "grad_norm": 0.7400312185631244, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 36665 + }, + { + "epoch": 0.36666, + "grad_norm": 0.663039792843106, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 36666 + }, + { + "epoch": 0.36667, + "grad_norm": 0.6296559410362682, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 36667 + }, + { + "epoch": 0.36668, + "grad_norm": 0.5336844201965505, + "learning_rate": 0.003, + "loss": 4.071, + "step": 36668 + }, + { + "epoch": 0.36669, + "grad_norm": 0.5501897624754095, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 36669 + }, + { + "epoch": 0.3667, + "grad_norm": 0.6369443077178484, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 36670 + }, + { + "epoch": 0.36671, + "grad_norm": 0.6543205204290723, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 36671 + }, + { + "epoch": 0.36672, + "grad_norm": 0.7739350292394511, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 36672 + }, + { + "epoch": 0.36673, + "grad_norm": 1.0121401481267391, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 36673 + }, + { + "epoch": 0.36674, + "grad_norm": 1.274158820307813, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 36674 + }, + { + "epoch": 0.36675, + "grad_norm": 0.768018418473303, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 36675 + }, + { + "epoch": 0.36676, + "grad_norm": 0.7586172899547654, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 36676 + }, + { + "epoch": 0.36677, + "grad_norm": 0.6772983911557248, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 36677 + }, + { + "epoch": 0.36678, + "grad_norm": 0.6214954225567069, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 36678 + }, + { + "epoch": 0.36679, + "grad_norm": 0.6889460495326254, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 36679 + }, + { + "epoch": 0.3668, + "grad_norm": 0.6908450453825807, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 36680 + }, + { + "epoch": 0.36681, + "grad_norm": 0.7435064201591103, + "learning_rate": 0.003, + "loss": 4.0, + "step": 36681 + }, + { + "epoch": 0.36682, + "grad_norm": 0.660141815748049, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 36682 + }, + { + "epoch": 0.36683, + "grad_norm": 0.6030465288816576, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 36683 + }, + { + "epoch": 0.36684, + "grad_norm": 0.6681632310618064, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 36684 + }, + { + "epoch": 0.36685, + "grad_norm": 0.6978812036359582, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 36685 + }, + { + "epoch": 0.36686, + "grad_norm": 0.9482860405014721, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 36686 + }, + { + "epoch": 0.36687, + "grad_norm": 1.40838628684005, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 36687 + }, + { + "epoch": 0.36688, + "grad_norm": 0.8579265534527734, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 36688 + }, + { + "epoch": 0.36689, + "grad_norm": 0.8931941103523859, + "learning_rate": 0.003, + "loss": 4.054, + "step": 36689 + }, + { + "epoch": 0.3669, + "grad_norm": 0.7481859564525347, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 36690 + }, + { + "epoch": 0.36691, + "grad_norm": 0.8207497818594028, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 36691 + }, + { + "epoch": 0.36692, + "grad_norm": 0.9703659473135587, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 36692 + }, + { + "epoch": 0.36693, + "grad_norm": 0.9477084367439446, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 36693 + }, + { + "epoch": 0.36694, + "grad_norm": 0.8760901854508224, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 36694 + }, + { + "epoch": 0.36695, + "grad_norm": 0.837855577776055, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 36695 + }, + { + "epoch": 0.36696, + "grad_norm": 0.7311882021991196, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 36696 + }, + { + "epoch": 0.36697, + "grad_norm": 0.6983666030512522, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 36697 + }, + { + "epoch": 0.36698, + "grad_norm": 0.707241096617632, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 36698 + }, + { + "epoch": 0.36699, + "grad_norm": 0.7975039237848981, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 36699 + }, + { + "epoch": 0.367, + "grad_norm": 0.8825813238961344, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 36700 + }, + { + "epoch": 0.36701, + "grad_norm": 0.9511094598694858, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 36701 + }, + { + "epoch": 0.36702, + "grad_norm": 1.051423452728449, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 36702 + }, + { + "epoch": 0.36703, + "grad_norm": 1.1628534768552654, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 36703 + }, + { + "epoch": 0.36704, + "grad_norm": 0.7810046825552677, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 36704 + }, + { + "epoch": 0.36705, + "grad_norm": 0.7008415856257676, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 36705 + }, + { + "epoch": 0.36706, + "grad_norm": 0.6651939142818435, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 36706 + }, + { + "epoch": 0.36707, + "grad_norm": 0.6548987502224148, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 36707 + }, + { + "epoch": 0.36708, + "grad_norm": 0.6980009080910397, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 36708 + }, + { + "epoch": 0.36709, + "grad_norm": 0.7608526979334845, + "learning_rate": 0.003, + "loss": 4.045, + "step": 36709 + }, + { + "epoch": 0.3671, + "grad_norm": 0.9850210202493211, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 36710 + }, + { + "epoch": 0.36711, + "grad_norm": 1.2806430207084565, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 36711 + }, + { + "epoch": 0.36712, + "grad_norm": 0.7641765834642079, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 36712 + }, + { + "epoch": 0.36713, + "grad_norm": 0.8793562030706844, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 36713 + }, + { + "epoch": 0.36714, + "grad_norm": 1.0035364681581955, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 36714 + }, + { + "epoch": 0.36715, + "grad_norm": 1.0341639053597316, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 36715 + }, + { + "epoch": 0.36716, + "grad_norm": 0.9786671917583875, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 36716 + }, + { + "epoch": 0.36717, + "grad_norm": 0.9841875360959297, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 36717 + }, + { + "epoch": 0.36718, + "grad_norm": 1.0323148098892905, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 36718 + }, + { + "epoch": 0.36719, + "grad_norm": 0.8739442823969108, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 36719 + }, + { + "epoch": 0.3672, + "grad_norm": 0.8795115462598461, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 36720 + }, + { + "epoch": 0.36721, + "grad_norm": 0.7955332334018822, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 36721 + }, + { + "epoch": 0.36722, + "grad_norm": 0.6727954473442085, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 36722 + }, + { + "epoch": 0.36723, + "grad_norm": 0.6828264249278291, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 36723 + }, + { + "epoch": 0.36724, + "grad_norm": 0.7728230460303602, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 36724 + }, + { + "epoch": 0.36725, + "grad_norm": 0.9925655465859311, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 36725 + }, + { + "epoch": 0.36726, + "grad_norm": 1.3150416573668977, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 36726 + }, + { + "epoch": 0.36727, + "grad_norm": 0.6592891262695096, + "learning_rate": 0.003, + "loss": 4.006, + "step": 36727 + }, + { + "epoch": 0.36728, + "grad_norm": 0.6820869637528454, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 36728 + }, + { + "epoch": 0.36729, + "grad_norm": 0.7940406417172008, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 36729 + }, + { + "epoch": 0.3673, + "grad_norm": 0.8039814417866616, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 36730 + }, + { + "epoch": 0.36731, + "grad_norm": 0.7678454396931886, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 36731 + }, + { + "epoch": 0.36732, + "grad_norm": 0.6527394130396555, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 36732 + }, + { + "epoch": 0.36733, + "grad_norm": 0.5655533664837572, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 36733 + }, + { + "epoch": 0.36734, + "grad_norm": 0.5496815162437007, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 36734 + }, + { + "epoch": 0.36735, + "grad_norm": 0.5681809479201715, + "learning_rate": 0.003, + "loss": 4.046, + "step": 36735 + }, + { + "epoch": 0.36736, + "grad_norm": 0.6250893744809306, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 36736 + }, + { + "epoch": 0.36737, + "grad_norm": 0.5955561204013687, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 36737 + }, + { + "epoch": 0.36738, + "grad_norm": 0.6280245749732458, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 36738 + }, + { + "epoch": 0.36739, + "grad_norm": 0.7360738621368608, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 36739 + }, + { + "epoch": 0.3674, + "grad_norm": 0.807623637663469, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 36740 + }, + { + "epoch": 0.36741, + "grad_norm": 1.0916219465791006, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 36741 + }, + { + "epoch": 0.36742, + "grad_norm": 1.2974484538717772, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 36742 + }, + { + "epoch": 0.36743, + "grad_norm": 0.6769245381719283, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 36743 + }, + { + "epoch": 0.36744, + "grad_norm": 0.7101204673129159, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 36744 + }, + { + "epoch": 0.36745, + "grad_norm": 0.816470074806576, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 36745 + }, + { + "epoch": 0.36746, + "grad_norm": 0.8792702630215936, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 36746 + }, + { + "epoch": 0.36747, + "grad_norm": 0.8211681834188368, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 36747 + }, + { + "epoch": 0.36748, + "grad_norm": 0.7301992503449849, + "learning_rate": 0.003, + "loss": 3.9806, + "step": 36748 + }, + { + "epoch": 0.36749, + "grad_norm": 0.7417507403481955, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 36749 + }, + { + "epoch": 0.3675, + "grad_norm": 0.8564116519436213, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 36750 + }, + { + "epoch": 0.36751, + "grad_norm": 0.9276315687853901, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 36751 + }, + { + "epoch": 0.36752, + "grad_norm": 0.8385540613780754, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 36752 + }, + { + "epoch": 0.36753, + "grad_norm": 0.8432760221723373, + "learning_rate": 0.003, + "loss": 4.049, + "step": 36753 + }, + { + "epoch": 0.36754, + "grad_norm": 0.9379713570098684, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 36754 + }, + { + "epoch": 0.36755, + "grad_norm": 0.8891364190841619, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 36755 + }, + { + "epoch": 0.36756, + "grad_norm": 0.9512352828699276, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 36756 + }, + { + "epoch": 0.36757, + "grad_norm": 1.1410292371873405, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 36757 + }, + { + "epoch": 0.36758, + "grad_norm": 1.077053809744339, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 36758 + }, + { + "epoch": 0.36759, + "grad_norm": 1.0581077238311571, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 36759 + }, + { + "epoch": 0.3676, + "grad_norm": 1.2678609324926904, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 36760 + }, + { + "epoch": 0.36761, + "grad_norm": 0.8431239127998489, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 36761 + }, + { + "epoch": 0.36762, + "grad_norm": 0.8360711313344765, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 36762 + }, + { + "epoch": 0.36763, + "grad_norm": 0.9876783519088881, + "learning_rate": 0.003, + "loss": 4.061, + "step": 36763 + }, + { + "epoch": 0.36764, + "grad_norm": 1.110495046329535, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 36764 + }, + { + "epoch": 0.36765, + "grad_norm": 1.0496379947875984, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 36765 + }, + { + "epoch": 0.36766, + "grad_norm": 0.9982138887751718, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 36766 + }, + { + "epoch": 0.36767, + "grad_norm": 1.045474651960162, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 36767 + }, + { + "epoch": 0.36768, + "grad_norm": 0.9652463522394011, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 36768 + }, + { + "epoch": 0.36769, + "grad_norm": 0.9106940019610725, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 36769 + }, + { + "epoch": 0.3677, + "grad_norm": 0.8215249581170844, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 36770 + }, + { + "epoch": 0.36771, + "grad_norm": 0.8631100021588427, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 36771 + }, + { + "epoch": 0.36772, + "grad_norm": 0.9539575039216687, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 36772 + }, + { + "epoch": 0.36773, + "grad_norm": 1.056706861654047, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 36773 + }, + { + "epoch": 0.36774, + "grad_norm": 0.9188067723674477, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 36774 + }, + { + "epoch": 0.36775, + "grad_norm": 0.8632878967140041, + "learning_rate": 0.003, + "loss": 4.035, + "step": 36775 + }, + { + "epoch": 0.36776, + "grad_norm": 0.7809879831271929, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 36776 + }, + { + "epoch": 0.36777, + "grad_norm": 0.7009031790097955, + "learning_rate": 0.003, + "loss": 4.029, + "step": 36777 + }, + { + "epoch": 0.36778, + "grad_norm": 0.7258661963784062, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 36778 + }, + { + "epoch": 0.36779, + "grad_norm": 0.7438765242560557, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 36779 + }, + { + "epoch": 0.3678, + "grad_norm": 0.7137652387933179, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 36780 + }, + { + "epoch": 0.36781, + "grad_norm": 0.6515030215873558, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 36781 + }, + { + "epoch": 0.36782, + "grad_norm": 0.6210866730362035, + "learning_rate": 0.003, + "loss": 4.066, + "step": 36782 + }, + { + "epoch": 0.36783, + "grad_norm": 0.5522691021824534, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 36783 + }, + { + "epoch": 0.36784, + "grad_norm": 0.5013631475178039, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 36784 + }, + { + "epoch": 0.36785, + "grad_norm": 0.4642262716018065, + "learning_rate": 0.003, + "loss": 3.9985, + "step": 36785 + }, + { + "epoch": 0.36786, + "grad_norm": 0.46693189674125973, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 36786 + }, + { + "epoch": 0.36787, + "grad_norm": 0.42786647758197893, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 36787 + }, + { + "epoch": 0.36788, + "grad_norm": 0.464625862373694, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 36788 + }, + { + "epoch": 0.36789, + "grad_norm": 0.44747730500154453, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 36789 + }, + { + "epoch": 0.3679, + "grad_norm": 0.4105112544645885, + "learning_rate": 0.003, + "loss": 3.9963, + "step": 36790 + }, + { + "epoch": 0.36791, + "grad_norm": 0.4348764231032359, + "learning_rate": 0.003, + "loss": 3.9955, + "step": 36791 + }, + { + "epoch": 0.36792, + "grad_norm": 0.4013679093134044, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 36792 + }, + { + "epoch": 0.36793, + "grad_norm": 0.42153879019487406, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 36793 + }, + { + "epoch": 0.36794, + "grad_norm": 0.4984353831550319, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 36794 + }, + { + "epoch": 0.36795, + "grad_norm": 0.5966266726123957, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 36795 + }, + { + "epoch": 0.36796, + "grad_norm": 0.6732766482060146, + "learning_rate": 0.003, + "loss": 4.0013, + "step": 36796 + }, + { + "epoch": 0.36797, + "grad_norm": 0.7361594054030159, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 36797 + }, + { + "epoch": 0.36798, + "grad_norm": 0.9875664689292271, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 36798 + }, + { + "epoch": 0.36799, + "grad_norm": 1.5626248710187884, + "learning_rate": 0.003, + "loss": 4.051, + "step": 36799 + }, + { + "epoch": 0.368, + "grad_norm": 0.7832522432413992, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 36800 + }, + { + "epoch": 0.36801, + "grad_norm": 0.7950566354877928, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 36801 + }, + { + "epoch": 0.36802, + "grad_norm": 0.8297144515579825, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 36802 + }, + { + "epoch": 0.36803, + "grad_norm": 0.8046201958204579, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 36803 + }, + { + "epoch": 0.36804, + "grad_norm": 0.7748178348871486, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 36804 + }, + { + "epoch": 0.36805, + "grad_norm": 0.7879079775975406, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 36805 + }, + { + "epoch": 0.36806, + "grad_norm": 0.8449325606553624, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 36806 + }, + { + "epoch": 0.36807, + "grad_norm": 0.812514631453781, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 36807 + }, + { + "epoch": 0.36808, + "grad_norm": 0.7534720713284468, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 36808 + }, + { + "epoch": 0.36809, + "grad_norm": 0.8057090848608166, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 36809 + }, + { + "epoch": 0.3681, + "grad_norm": 0.7866375292068923, + "learning_rate": 0.003, + "loss": 4.039, + "step": 36810 + }, + { + "epoch": 0.36811, + "grad_norm": 0.7064524562249626, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 36811 + }, + { + "epoch": 0.36812, + "grad_norm": 0.8297564271093987, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 36812 + }, + { + "epoch": 0.36813, + "grad_norm": 0.9372634800550087, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 36813 + }, + { + "epoch": 0.36814, + "grad_norm": 1.2320674594702614, + "learning_rate": 0.003, + "loss": 3.9932, + "step": 36814 + }, + { + "epoch": 0.36815, + "grad_norm": 1.1366194529461093, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 36815 + }, + { + "epoch": 0.36816, + "grad_norm": 1.2046456250331754, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 36816 + }, + { + "epoch": 0.36817, + "grad_norm": 0.9045192673794165, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 36817 + }, + { + "epoch": 0.36818, + "grad_norm": 0.9095885199518156, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 36818 + }, + { + "epoch": 0.36819, + "grad_norm": 0.8970893828518729, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 36819 + }, + { + "epoch": 0.3682, + "grad_norm": 1.035101983023465, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 36820 + }, + { + "epoch": 0.36821, + "grad_norm": 1.0273561948454395, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 36821 + }, + { + "epoch": 0.36822, + "grad_norm": 1.0650205090402032, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 36822 + }, + { + "epoch": 0.36823, + "grad_norm": 1.0801319960556819, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 36823 + }, + { + "epoch": 0.36824, + "grad_norm": 1.0387742251899987, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 36824 + }, + { + "epoch": 0.36825, + "grad_norm": 1.1811825467192978, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 36825 + }, + { + "epoch": 0.36826, + "grad_norm": 0.9312018069039326, + "learning_rate": 0.003, + "loss": 4.071, + "step": 36826 + }, + { + "epoch": 0.36827, + "grad_norm": 0.9316219123976843, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 36827 + }, + { + "epoch": 0.36828, + "grad_norm": 1.0030001337857866, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 36828 + }, + { + "epoch": 0.36829, + "grad_norm": 1.0723829745229476, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 36829 + }, + { + "epoch": 0.3683, + "grad_norm": 1.0015885134689644, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 36830 + }, + { + "epoch": 0.36831, + "grad_norm": 0.929882940182703, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 36831 + }, + { + "epoch": 0.36832, + "grad_norm": 0.9722263708183494, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 36832 + }, + { + "epoch": 0.36833, + "grad_norm": 1.0223014033910318, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 36833 + }, + { + "epoch": 0.36834, + "grad_norm": 0.9638626445935244, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 36834 + }, + { + "epoch": 0.36835, + "grad_norm": 0.8151179271499461, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 36835 + }, + { + "epoch": 0.36836, + "grad_norm": 0.6861404457290492, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 36836 + }, + { + "epoch": 0.36837, + "grad_norm": 0.7073031302218746, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 36837 + }, + { + "epoch": 0.36838, + "grad_norm": 0.8648175834215271, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 36838 + }, + { + "epoch": 0.36839, + "grad_norm": 0.9929141915107716, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 36839 + }, + { + "epoch": 0.3684, + "grad_norm": 1.1596967443776678, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 36840 + }, + { + "epoch": 0.36841, + "grad_norm": 0.7999718872172199, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 36841 + }, + { + "epoch": 0.36842, + "grad_norm": 0.7989812109359756, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 36842 + }, + { + "epoch": 0.36843, + "grad_norm": 0.9772664516723604, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 36843 + }, + { + "epoch": 0.36844, + "grad_norm": 0.9708822875293135, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 36844 + }, + { + "epoch": 0.36845, + "grad_norm": 0.8297775514438553, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 36845 + }, + { + "epoch": 0.36846, + "grad_norm": 0.7439659212077926, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 36846 + }, + { + "epoch": 0.36847, + "grad_norm": 0.7518364879032494, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 36847 + }, + { + "epoch": 0.36848, + "grad_norm": 0.7253622162221167, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 36848 + }, + { + "epoch": 0.36849, + "grad_norm": 0.7785307338831582, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 36849 + }, + { + "epoch": 0.3685, + "grad_norm": 0.7463158575535527, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 36850 + }, + { + "epoch": 0.36851, + "grad_norm": 0.7124555815521392, + "learning_rate": 0.003, + "loss": 4.055, + "step": 36851 + }, + { + "epoch": 0.36852, + "grad_norm": 0.5849162062527973, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 36852 + }, + { + "epoch": 0.36853, + "grad_norm": 0.5435197965111561, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 36853 + }, + { + "epoch": 0.36854, + "grad_norm": 0.5202151849839306, + "learning_rate": 0.003, + "loss": 4.051, + "step": 36854 + }, + { + "epoch": 0.36855, + "grad_norm": 0.5625339282517187, + "learning_rate": 0.003, + "loss": 4.019, + "step": 36855 + }, + { + "epoch": 0.36856, + "grad_norm": 0.7342288432508026, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 36856 + }, + { + "epoch": 0.36857, + "grad_norm": 0.9824662334565858, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 36857 + }, + { + "epoch": 0.36858, + "grad_norm": 1.3329426145987104, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 36858 + }, + { + "epoch": 0.36859, + "grad_norm": 0.5732273042103745, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 36859 + }, + { + "epoch": 0.3686, + "grad_norm": 0.6800857967564115, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 36860 + }, + { + "epoch": 0.36861, + "grad_norm": 0.7165483348530701, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 36861 + }, + { + "epoch": 0.36862, + "grad_norm": 0.6233855591977709, + "learning_rate": 0.003, + "loss": 3.9977, + "step": 36862 + }, + { + "epoch": 0.36863, + "grad_norm": 0.6231969761481632, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 36863 + }, + { + "epoch": 0.36864, + "grad_norm": 0.6658849426669988, + "learning_rate": 0.003, + "loss": 4.022, + "step": 36864 + }, + { + "epoch": 0.36865, + "grad_norm": 0.8188613726725142, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 36865 + }, + { + "epoch": 0.36866, + "grad_norm": 0.8900920558609763, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 36866 + }, + { + "epoch": 0.36867, + "grad_norm": 1.0017500109443984, + "learning_rate": 0.003, + "loss": 3.9997, + "step": 36867 + }, + { + "epoch": 0.36868, + "grad_norm": 1.0926579854207277, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 36868 + }, + { + "epoch": 0.36869, + "grad_norm": 0.85107789011108, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 36869 + }, + { + "epoch": 0.3687, + "grad_norm": 0.739646208461904, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 36870 + }, + { + "epoch": 0.36871, + "grad_norm": 0.6868050753042905, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 36871 + }, + { + "epoch": 0.36872, + "grad_norm": 0.766737911059407, + "learning_rate": 0.003, + "loss": 3.9811, + "step": 36872 + }, + { + "epoch": 0.36873, + "grad_norm": 0.8403322439721188, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 36873 + }, + { + "epoch": 0.36874, + "grad_norm": 0.8772837654549394, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 36874 + }, + { + "epoch": 0.36875, + "grad_norm": 0.9896877942927997, + "learning_rate": 0.003, + "loss": 4.023, + "step": 36875 + }, + { + "epoch": 0.36876, + "grad_norm": 1.0833306766829651, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 36876 + }, + { + "epoch": 0.36877, + "grad_norm": 0.8071567525158283, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 36877 + }, + { + "epoch": 0.36878, + "grad_norm": 0.7463682140129613, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 36878 + }, + { + "epoch": 0.36879, + "grad_norm": 0.8500793780927153, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 36879 + }, + { + "epoch": 0.3688, + "grad_norm": 0.9485704417925911, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 36880 + }, + { + "epoch": 0.36881, + "grad_norm": 1.098884233327673, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 36881 + }, + { + "epoch": 0.36882, + "grad_norm": 1.0112896092189385, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 36882 + }, + { + "epoch": 0.36883, + "grad_norm": 0.8881302733997845, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 36883 + }, + { + "epoch": 0.36884, + "grad_norm": 0.911083328998895, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 36884 + }, + { + "epoch": 0.36885, + "grad_norm": 0.8851541451016623, + "learning_rate": 0.003, + "loss": 4.058, + "step": 36885 + }, + { + "epoch": 0.36886, + "grad_norm": 0.9258417997770797, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 36886 + }, + { + "epoch": 0.36887, + "grad_norm": 0.903898909721576, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 36887 + }, + { + "epoch": 0.36888, + "grad_norm": 0.7966217619699059, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 36888 + }, + { + "epoch": 0.36889, + "grad_norm": 0.715077350615037, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 36889 + }, + { + "epoch": 0.3689, + "grad_norm": 0.6776803157542045, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 36890 + }, + { + "epoch": 0.36891, + "grad_norm": 0.6173462492812456, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 36891 + }, + { + "epoch": 0.36892, + "grad_norm": 0.5979619951273583, + "learning_rate": 0.003, + "loss": 4.01, + "step": 36892 + }, + { + "epoch": 0.36893, + "grad_norm": 0.6753663325541209, + "learning_rate": 0.003, + "loss": 4.04, + "step": 36893 + }, + { + "epoch": 0.36894, + "grad_norm": 0.6750388795653431, + "learning_rate": 0.003, + "loss": 4.0036, + "step": 36894 + }, + { + "epoch": 0.36895, + "grad_norm": 0.6717973692951722, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 36895 + }, + { + "epoch": 0.36896, + "grad_norm": 0.7097748122683512, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 36896 + }, + { + "epoch": 0.36897, + "grad_norm": 0.673397041747036, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 36897 + }, + { + "epoch": 0.36898, + "grad_norm": 0.6618920637992699, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 36898 + }, + { + "epoch": 0.36899, + "grad_norm": 0.6220844178937872, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 36899 + }, + { + "epoch": 0.369, + "grad_norm": 0.6527921311482231, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 36900 + }, + { + "epoch": 0.36901, + "grad_norm": 0.7599473012224466, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 36901 + }, + { + "epoch": 0.36902, + "grad_norm": 0.8201629431517352, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 36902 + }, + { + "epoch": 0.36903, + "grad_norm": 0.8787204522209385, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 36903 + }, + { + "epoch": 0.36904, + "grad_norm": 0.8788210930882646, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 36904 + }, + { + "epoch": 0.36905, + "grad_norm": 0.9812682894589722, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 36905 + }, + { + "epoch": 0.36906, + "grad_norm": 1.047767893727494, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 36906 + }, + { + "epoch": 0.36907, + "grad_norm": 1.057594971731746, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 36907 + }, + { + "epoch": 0.36908, + "grad_norm": 0.9934725454809783, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 36908 + }, + { + "epoch": 0.36909, + "grad_norm": 1.1311586558783688, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 36909 + }, + { + "epoch": 0.3691, + "grad_norm": 0.9265826286157451, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 36910 + }, + { + "epoch": 0.36911, + "grad_norm": 0.9438191955590695, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 36911 + }, + { + "epoch": 0.36912, + "grad_norm": 0.9194013084607698, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 36912 + }, + { + "epoch": 0.36913, + "grad_norm": 0.8584470170492401, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 36913 + }, + { + "epoch": 0.36914, + "grad_norm": 0.8006438446639215, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 36914 + }, + { + "epoch": 0.36915, + "grad_norm": 0.7056559462530465, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 36915 + }, + { + "epoch": 0.36916, + "grad_norm": 0.7542384219553878, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 36916 + }, + { + "epoch": 0.36917, + "grad_norm": 0.8705344935952783, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 36917 + }, + { + "epoch": 0.36918, + "grad_norm": 0.8761068712713742, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 36918 + }, + { + "epoch": 0.36919, + "grad_norm": 0.8591856473044287, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 36919 + }, + { + "epoch": 0.3692, + "grad_norm": 0.8234412984479295, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 36920 + }, + { + "epoch": 0.36921, + "grad_norm": 0.7430192182988758, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 36921 + }, + { + "epoch": 0.36922, + "grad_norm": 0.7003185688599389, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 36922 + }, + { + "epoch": 0.36923, + "grad_norm": 0.7741886414081712, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 36923 + }, + { + "epoch": 0.36924, + "grad_norm": 0.8340591488352227, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 36924 + }, + { + "epoch": 0.36925, + "grad_norm": 0.9536654406855326, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 36925 + }, + { + "epoch": 0.36926, + "grad_norm": 1.1390073668647107, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 36926 + }, + { + "epoch": 0.36927, + "grad_norm": 1.0099448005018548, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 36927 + }, + { + "epoch": 0.36928, + "grad_norm": 0.9032088760297092, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 36928 + }, + { + "epoch": 0.36929, + "grad_norm": 0.7944678525551279, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 36929 + }, + { + "epoch": 0.3693, + "grad_norm": 0.7554712903193862, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 36930 + }, + { + "epoch": 0.36931, + "grad_norm": 0.7742025160060619, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 36931 + }, + { + "epoch": 0.36932, + "grad_norm": 0.8033851996207793, + "learning_rate": 0.003, + "loss": 4.044, + "step": 36932 + }, + { + "epoch": 0.36933, + "grad_norm": 0.8598021550024495, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 36933 + }, + { + "epoch": 0.36934, + "grad_norm": 0.8733152379651241, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 36934 + }, + { + "epoch": 0.36935, + "grad_norm": 0.7463347036354825, + "learning_rate": 0.003, + "loss": 4.046, + "step": 36935 + }, + { + "epoch": 0.36936, + "grad_norm": 0.6847459564904789, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 36936 + }, + { + "epoch": 0.36937, + "grad_norm": 0.7452931895974972, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 36937 + }, + { + "epoch": 0.36938, + "grad_norm": 0.7274767528596761, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 36938 + }, + { + "epoch": 0.36939, + "grad_norm": 0.8840074607401983, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 36939 + }, + { + "epoch": 0.3694, + "grad_norm": 1.2143448501031506, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 36940 + }, + { + "epoch": 0.36941, + "grad_norm": 0.8783939315100879, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 36941 + }, + { + "epoch": 0.36942, + "grad_norm": 0.6999000887445634, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 36942 + }, + { + "epoch": 0.36943, + "grad_norm": 0.6384243588875573, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 36943 + }, + { + "epoch": 0.36944, + "grad_norm": 0.6821646820694248, + "learning_rate": 0.003, + "loss": 4.042, + "step": 36944 + }, + { + "epoch": 0.36945, + "grad_norm": 0.6181713545794209, + "learning_rate": 0.003, + "loss": 4.034, + "step": 36945 + }, + { + "epoch": 0.36946, + "grad_norm": 0.5695721018357028, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 36946 + }, + { + "epoch": 0.36947, + "grad_norm": 0.6231408389899497, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 36947 + }, + { + "epoch": 0.36948, + "grad_norm": 0.6853080083561321, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 36948 + }, + { + "epoch": 0.36949, + "grad_norm": 0.8209883630669342, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 36949 + }, + { + "epoch": 0.3695, + "grad_norm": 0.9380256573929453, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 36950 + }, + { + "epoch": 0.36951, + "grad_norm": 0.8627978825933115, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 36951 + }, + { + "epoch": 0.36952, + "grad_norm": 0.7280430374070006, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 36952 + }, + { + "epoch": 0.36953, + "grad_norm": 0.6922073368346137, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 36953 + }, + { + "epoch": 0.36954, + "grad_norm": 0.7061423350819331, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 36954 + }, + { + "epoch": 0.36955, + "grad_norm": 0.6627577644267479, + "learning_rate": 0.003, + "loss": 4.006, + "step": 36955 + }, + { + "epoch": 0.36956, + "grad_norm": 0.7751886299718033, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 36956 + }, + { + "epoch": 0.36957, + "grad_norm": 0.9948162503806349, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 36957 + }, + { + "epoch": 0.36958, + "grad_norm": 1.3183464740377988, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 36958 + }, + { + "epoch": 0.36959, + "grad_norm": 0.7311217130300401, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 36959 + }, + { + "epoch": 0.3696, + "grad_norm": 0.694540075048572, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 36960 + }, + { + "epoch": 0.36961, + "grad_norm": 0.815299777920547, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 36961 + }, + { + "epoch": 0.36962, + "grad_norm": 1.0720331815303854, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 36962 + }, + { + "epoch": 0.36963, + "grad_norm": 1.205613992240361, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 36963 + }, + { + "epoch": 0.36964, + "grad_norm": 0.7163258032395008, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 36964 + }, + { + "epoch": 0.36965, + "grad_norm": 0.7270066813331139, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 36965 + }, + { + "epoch": 0.36966, + "grad_norm": 0.8431274813730874, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 36966 + }, + { + "epoch": 0.36967, + "grad_norm": 0.848377742080918, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 36967 + }, + { + "epoch": 0.36968, + "grad_norm": 0.8440911055257112, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 36968 + }, + { + "epoch": 0.36969, + "grad_norm": 0.8009281897331215, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 36969 + }, + { + "epoch": 0.3697, + "grad_norm": 0.7535479452884671, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 36970 + }, + { + "epoch": 0.36971, + "grad_norm": 0.8002726746694381, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 36971 + }, + { + "epoch": 0.36972, + "grad_norm": 0.8901492975549494, + "learning_rate": 0.003, + "loss": 4.0724, + "step": 36972 + }, + { + "epoch": 0.36973, + "grad_norm": 0.8465302766849194, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 36973 + }, + { + "epoch": 0.36974, + "grad_norm": 0.7768798301964677, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 36974 + }, + { + "epoch": 0.36975, + "grad_norm": 0.8523416711170975, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 36975 + }, + { + "epoch": 0.36976, + "grad_norm": 0.9352659978063963, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 36976 + }, + { + "epoch": 0.36977, + "grad_norm": 1.1003999519393426, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 36977 + }, + { + "epoch": 0.36978, + "grad_norm": 0.8602628299455286, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 36978 + }, + { + "epoch": 0.36979, + "grad_norm": 0.8605070163396973, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 36979 + }, + { + "epoch": 0.3698, + "grad_norm": 1.0181705357259196, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 36980 + }, + { + "epoch": 0.36981, + "grad_norm": 0.9199138752749352, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 36981 + }, + { + "epoch": 0.36982, + "grad_norm": 0.7114317247886792, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 36982 + }, + { + "epoch": 0.36983, + "grad_norm": 0.7129364337877688, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 36983 + }, + { + "epoch": 0.36984, + "grad_norm": 0.8074208419497136, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 36984 + }, + { + "epoch": 0.36985, + "grad_norm": 0.8800250595983158, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 36985 + }, + { + "epoch": 0.36986, + "grad_norm": 0.9436874566162339, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 36986 + }, + { + "epoch": 0.36987, + "grad_norm": 1.163940136756619, + "learning_rate": 0.003, + "loss": 4.049, + "step": 36987 + }, + { + "epoch": 0.36988, + "grad_norm": 0.9195412978172871, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 36988 + }, + { + "epoch": 0.36989, + "grad_norm": 0.9077140859137299, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 36989 + }, + { + "epoch": 0.3699, + "grad_norm": 0.8543443265830163, + "learning_rate": 0.003, + "loss": 4.062, + "step": 36990 + }, + { + "epoch": 0.36991, + "grad_norm": 0.872030381230807, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 36991 + }, + { + "epoch": 0.36992, + "grad_norm": 0.8626811801640805, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 36992 + }, + { + "epoch": 0.36993, + "grad_norm": 0.7537194852436806, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 36993 + }, + { + "epoch": 0.36994, + "grad_norm": 0.7132315805838623, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 36994 + }, + { + "epoch": 0.36995, + "grad_norm": 0.9081873789233962, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 36995 + }, + { + "epoch": 0.36996, + "grad_norm": 1.082355135828151, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 36996 + }, + { + "epoch": 0.36997, + "grad_norm": 1.0096185859538467, + "learning_rate": 0.003, + "loss": 4.03, + "step": 36997 + }, + { + "epoch": 0.36998, + "grad_norm": 0.8767125055089195, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 36998 + }, + { + "epoch": 0.36999, + "grad_norm": 0.7743421423368273, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 36999 + }, + { + "epoch": 0.37, + "grad_norm": 0.7560702946356798, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 37000 + }, + { + "epoch": 0.37001, + "grad_norm": 0.8490365599390446, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 37001 + }, + { + "epoch": 0.37002, + "grad_norm": 0.8247176947759712, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 37002 + }, + { + "epoch": 0.37003, + "grad_norm": 0.6646465809703512, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 37003 + }, + { + "epoch": 0.37004, + "grad_norm": 0.6373171143410952, + "learning_rate": 0.003, + "loss": 4.017, + "step": 37004 + }, + { + "epoch": 0.37005, + "grad_norm": 0.5685353721149674, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 37005 + }, + { + "epoch": 0.37006, + "grad_norm": 0.5063295432068481, + "learning_rate": 0.003, + "loss": 4.031, + "step": 37006 + }, + { + "epoch": 0.37007, + "grad_norm": 0.46777549895431686, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 37007 + }, + { + "epoch": 0.37008, + "grad_norm": 0.49874605668829336, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 37008 + }, + { + "epoch": 0.37009, + "grad_norm": 0.5601542904779858, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 37009 + }, + { + "epoch": 0.3701, + "grad_norm": 0.7120982845987182, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 37010 + }, + { + "epoch": 0.37011, + "grad_norm": 1.0059871678177532, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 37011 + }, + { + "epoch": 0.37012, + "grad_norm": 1.4741638844950158, + "learning_rate": 0.003, + "loss": 4.05, + "step": 37012 + }, + { + "epoch": 0.37013, + "grad_norm": 0.5135555395356913, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 37013 + }, + { + "epoch": 0.37014, + "grad_norm": 1.0087300552656564, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 37014 + }, + { + "epoch": 0.37015, + "grad_norm": 1.099648372454082, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 37015 + }, + { + "epoch": 0.37016, + "grad_norm": 0.8743611643180645, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 37016 + }, + { + "epoch": 0.37017, + "grad_norm": 0.8227549811505475, + "learning_rate": 0.003, + "loss": 4.036, + "step": 37017 + }, + { + "epoch": 0.37018, + "grad_norm": 0.7446293221353876, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 37018 + }, + { + "epoch": 0.37019, + "grad_norm": 0.7238534967606962, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 37019 + }, + { + "epoch": 0.3702, + "grad_norm": 0.8016289107274701, + "learning_rate": 0.003, + "loss": 4.033, + "step": 37020 + }, + { + "epoch": 0.37021, + "grad_norm": 0.8199503458929243, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 37021 + }, + { + "epoch": 0.37022, + "grad_norm": 0.8351366017805246, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 37022 + }, + { + "epoch": 0.37023, + "grad_norm": 0.8759251943751651, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 37023 + }, + { + "epoch": 0.37024, + "grad_norm": 0.9560720941679169, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 37024 + }, + { + "epoch": 0.37025, + "grad_norm": 0.9533326909891658, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 37025 + }, + { + "epoch": 0.37026, + "grad_norm": 0.977568769914267, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 37026 + }, + { + "epoch": 0.37027, + "grad_norm": 0.9437735857013684, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 37027 + }, + { + "epoch": 0.37028, + "grad_norm": 0.9195186142497785, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 37028 + }, + { + "epoch": 0.37029, + "grad_norm": 0.9227266385893358, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 37029 + }, + { + "epoch": 0.3703, + "grad_norm": 0.9741225600129273, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 37030 + }, + { + "epoch": 0.37031, + "grad_norm": 1.1510569550908751, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 37031 + }, + { + "epoch": 0.37032, + "grad_norm": 1.0600433961037483, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 37032 + }, + { + "epoch": 0.37033, + "grad_norm": 0.93333506714232, + "learning_rate": 0.003, + "loss": 4.0825, + "step": 37033 + }, + { + "epoch": 0.37034, + "grad_norm": 0.9635052742904271, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 37034 + }, + { + "epoch": 0.37035, + "grad_norm": 0.8578677714163562, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 37035 + }, + { + "epoch": 0.37036, + "grad_norm": 0.9590751049473081, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 37036 + }, + { + "epoch": 0.37037, + "grad_norm": 1.0857659335744285, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 37037 + }, + { + "epoch": 0.37038, + "grad_norm": 0.9807762076831346, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 37038 + }, + { + "epoch": 0.37039, + "grad_norm": 1.0181969592160767, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 37039 + }, + { + "epoch": 0.3704, + "grad_norm": 0.9466926788390325, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 37040 + }, + { + "epoch": 0.37041, + "grad_norm": 0.9834862629796205, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 37041 + }, + { + "epoch": 0.37042, + "grad_norm": 1.0137756353318481, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 37042 + }, + { + "epoch": 0.37043, + "grad_norm": 0.7573758960269338, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 37043 + }, + { + "epoch": 0.37044, + "grad_norm": 0.6516365730306859, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 37044 + }, + { + "epoch": 0.37045, + "grad_norm": 0.5698804230328042, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 37045 + }, + { + "epoch": 0.37046, + "grad_norm": 0.55363074289551, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 37046 + }, + { + "epoch": 0.37047, + "grad_norm": 0.6337434768605827, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 37047 + }, + { + "epoch": 0.37048, + "grad_norm": 0.8189558171648083, + "learning_rate": 0.003, + "loss": 4.047, + "step": 37048 + }, + { + "epoch": 0.37049, + "grad_norm": 0.8429651900436764, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 37049 + }, + { + "epoch": 0.3705, + "grad_norm": 0.7274436290167893, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 37050 + }, + { + "epoch": 0.37051, + "grad_norm": 0.6063318819392971, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 37051 + }, + { + "epoch": 0.37052, + "grad_norm": 0.6232573066023606, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 37052 + }, + { + "epoch": 0.37053, + "grad_norm": 0.6583935410815218, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 37053 + }, + { + "epoch": 0.37054, + "grad_norm": 0.6710440780270126, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 37054 + }, + { + "epoch": 0.37055, + "grad_norm": 0.6808430020637972, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 37055 + }, + { + "epoch": 0.37056, + "grad_norm": 0.7067463350656311, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 37056 + }, + { + "epoch": 0.37057, + "grad_norm": 0.6868554016214244, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 37057 + }, + { + "epoch": 0.37058, + "grad_norm": 0.6330766905354962, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 37058 + }, + { + "epoch": 0.37059, + "grad_norm": 0.5537765757429657, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 37059 + }, + { + "epoch": 0.3706, + "grad_norm": 0.612368219837295, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 37060 + }, + { + "epoch": 0.37061, + "grad_norm": 0.6642049769569627, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 37061 + }, + { + "epoch": 0.37062, + "grad_norm": 0.7181671439805024, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 37062 + }, + { + "epoch": 0.37063, + "grad_norm": 0.8042347315223907, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 37063 + }, + { + "epoch": 0.37064, + "grad_norm": 0.8497457261136104, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 37064 + }, + { + "epoch": 0.37065, + "grad_norm": 1.0882678038652003, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 37065 + }, + { + "epoch": 0.37066, + "grad_norm": 1.2438302760373465, + "learning_rate": 0.003, + "loss": 4.012, + "step": 37066 + }, + { + "epoch": 0.37067, + "grad_norm": 0.7608452029293692, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 37067 + }, + { + "epoch": 0.37068, + "grad_norm": 0.7187687043783911, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 37068 + }, + { + "epoch": 0.37069, + "grad_norm": 0.7259287550376434, + "learning_rate": 0.003, + "loss": 4.02, + "step": 37069 + }, + { + "epoch": 0.3707, + "grad_norm": 0.6578816610333295, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 37070 + }, + { + "epoch": 0.37071, + "grad_norm": 0.5358742108667396, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 37071 + }, + { + "epoch": 0.37072, + "grad_norm": 0.5378842809671124, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 37072 + }, + { + "epoch": 0.37073, + "grad_norm": 0.6233063736105936, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 37073 + }, + { + "epoch": 0.37074, + "grad_norm": 0.6947278209530238, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 37074 + }, + { + "epoch": 0.37075, + "grad_norm": 0.857835726769795, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 37075 + }, + { + "epoch": 0.37076, + "grad_norm": 1.1379551835460142, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 37076 + }, + { + "epoch": 0.37077, + "grad_norm": 0.8668771297361219, + "learning_rate": 0.003, + "loss": 4.041, + "step": 37077 + }, + { + "epoch": 0.37078, + "grad_norm": 0.8052818698132158, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 37078 + }, + { + "epoch": 0.37079, + "grad_norm": 0.7030702979007816, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 37079 + }, + { + "epoch": 0.3708, + "grad_norm": 0.662468641756662, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 37080 + }, + { + "epoch": 0.37081, + "grad_norm": 0.8109843489106954, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 37081 + }, + { + "epoch": 0.37082, + "grad_norm": 1.105912568397747, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 37082 + }, + { + "epoch": 0.37083, + "grad_norm": 1.2812116232266886, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 37083 + }, + { + "epoch": 0.37084, + "grad_norm": 1.02522200771454, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 37084 + }, + { + "epoch": 0.37085, + "grad_norm": 0.8709027515296228, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 37085 + }, + { + "epoch": 0.37086, + "grad_norm": 0.9076750415640986, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 37086 + }, + { + "epoch": 0.37087, + "grad_norm": 0.829473394877613, + "learning_rate": 0.003, + "loss": 4.042, + "step": 37087 + }, + { + "epoch": 0.37088, + "grad_norm": 0.8521030167244472, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 37088 + }, + { + "epoch": 0.37089, + "grad_norm": 0.8505961071303029, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 37089 + }, + { + "epoch": 0.3709, + "grad_norm": 0.8452855604581128, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 37090 + }, + { + "epoch": 0.37091, + "grad_norm": 0.7362858072518589, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 37091 + }, + { + "epoch": 0.37092, + "grad_norm": 0.7208847748483522, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 37092 + }, + { + "epoch": 0.37093, + "grad_norm": 0.7907082139407378, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 37093 + }, + { + "epoch": 0.37094, + "grad_norm": 0.8918116035848707, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 37094 + }, + { + "epoch": 0.37095, + "grad_norm": 1.0210993625849112, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 37095 + }, + { + "epoch": 0.37096, + "grad_norm": 1.0355242189625857, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 37096 + }, + { + "epoch": 0.37097, + "grad_norm": 0.9093715409382125, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 37097 + }, + { + "epoch": 0.37098, + "grad_norm": 0.8337143052773581, + "learning_rate": 0.003, + "loss": 4.0853, + "step": 37098 + }, + { + "epoch": 0.37099, + "grad_norm": 0.9401346487198636, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 37099 + }, + { + "epoch": 0.371, + "grad_norm": 0.9880708413326937, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 37100 + }, + { + "epoch": 0.37101, + "grad_norm": 0.8701511470322699, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 37101 + }, + { + "epoch": 0.37102, + "grad_norm": 0.8740327833372816, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 37102 + }, + { + "epoch": 0.37103, + "grad_norm": 0.7546968185097553, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 37103 + }, + { + "epoch": 0.37104, + "grad_norm": 0.639269844876232, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 37104 + }, + { + "epoch": 0.37105, + "grad_norm": 0.5914207821857252, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 37105 + }, + { + "epoch": 0.37106, + "grad_norm": 0.629650759416398, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 37106 + }, + { + "epoch": 0.37107, + "grad_norm": 0.692847041477718, + "learning_rate": 0.003, + "loss": 4.005, + "step": 37107 + }, + { + "epoch": 0.37108, + "grad_norm": 0.6535292553162078, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 37108 + }, + { + "epoch": 0.37109, + "grad_norm": 0.5742751385195393, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 37109 + }, + { + "epoch": 0.3711, + "grad_norm": 0.604267972494541, + "learning_rate": 0.003, + "loss": 4.03, + "step": 37110 + }, + { + "epoch": 0.37111, + "grad_norm": 0.6854243713501855, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 37111 + }, + { + "epoch": 0.37112, + "grad_norm": 0.8128545125531661, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 37112 + }, + { + "epoch": 0.37113, + "grad_norm": 1.106472324558318, + "learning_rate": 0.003, + "loss": 4.063, + "step": 37113 + }, + { + "epoch": 0.37114, + "grad_norm": 1.3642603237275353, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 37114 + }, + { + "epoch": 0.37115, + "grad_norm": 0.6790899812432601, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 37115 + }, + { + "epoch": 0.37116, + "grad_norm": 0.8217048147007979, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 37116 + }, + { + "epoch": 0.37117, + "grad_norm": 0.9397231129704002, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 37117 + }, + { + "epoch": 0.37118, + "grad_norm": 0.9984607573207672, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 37118 + }, + { + "epoch": 0.37119, + "grad_norm": 0.878538094059242, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 37119 + }, + { + "epoch": 0.3712, + "grad_norm": 0.7796683208816731, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 37120 + }, + { + "epoch": 0.37121, + "grad_norm": 0.7564154862059245, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 37121 + }, + { + "epoch": 0.37122, + "grad_norm": 0.8207593562797464, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 37122 + }, + { + "epoch": 0.37123, + "grad_norm": 0.7499408848328548, + "learning_rate": 0.003, + "loss": 4.015, + "step": 37123 + }, + { + "epoch": 0.37124, + "grad_norm": 0.6786426246049132, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 37124 + }, + { + "epoch": 0.37125, + "grad_norm": 0.6874682952347965, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 37125 + }, + { + "epoch": 0.37126, + "grad_norm": 0.8273675525052782, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 37126 + }, + { + "epoch": 0.37127, + "grad_norm": 1.055879159870275, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 37127 + }, + { + "epoch": 0.37128, + "grad_norm": 1.090257537232577, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 37128 + }, + { + "epoch": 0.37129, + "grad_norm": 0.9005054666837352, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 37129 + }, + { + "epoch": 0.3713, + "grad_norm": 0.8547961920146271, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 37130 + }, + { + "epoch": 0.37131, + "grad_norm": 0.8971488591883995, + "learning_rate": 0.003, + "loss": 4.058, + "step": 37131 + }, + { + "epoch": 0.37132, + "grad_norm": 0.974227511007289, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 37132 + }, + { + "epoch": 0.37133, + "grad_norm": 1.0365986505867941, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 37133 + }, + { + "epoch": 0.37134, + "grad_norm": 0.9385312427090857, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 37134 + }, + { + "epoch": 0.37135, + "grad_norm": 0.9770046418200687, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 37135 + }, + { + "epoch": 0.37136, + "grad_norm": 0.9877828299527347, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 37136 + }, + { + "epoch": 0.37137, + "grad_norm": 0.9077913099293886, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 37137 + }, + { + "epoch": 0.37138, + "grad_norm": 0.8108280283416814, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 37138 + }, + { + "epoch": 0.37139, + "grad_norm": 0.7964596379945323, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 37139 + }, + { + "epoch": 0.3714, + "grad_norm": 0.8638967236406246, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 37140 + }, + { + "epoch": 0.37141, + "grad_norm": 0.8992813278253871, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 37141 + }, + { + "epoch": 0.37142, + "grad_norm": 0.7732053169764347, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 37142 + }, + { + "epoch": 0.37143, + "grad_norm": 0.7440178329870957, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 37143 + }, + { + "epoch": 0.37144, + "grad_norm": 0.7383987566296655, + "learning_rate": 0.003, + "loss": 4.039, + "step": 37144 + }, + { + "epoch": 0.37145, + "grad_norm": 0.7572559078053207, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 37145 + }, + { + "epoch": 0.37146, + "grad_norm": 0.803179825866921, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 37146 + }, + { + "epoch": 0.37147, + "grad_norm": 0.9488927800284385, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 37147 + }, + { + "epoch": 0.37148, + "grad_norm": 1.0708755877739646, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 37148 + }, + { + "epoch": 0.37149, + "grad_norm": 0.8085639962224043, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 37149 + }, + { + "epoch": 0.3715, + "grad_norm": 0.7145119284312325, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 37150 + }, + { + "epoch": 0.37151, + "grad_norm": 0.7031305566517462, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 37151 + }, + { + "epoch": 0.37152, + "grad_norm": 0.8959511800011158, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 37152 + }, + { + "epoch": 0.37153, + "grad_norm": 1.1502213622782127, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 37153 + }, + { + "epoch": 0.37154, + "grad_norm": 0.9061752385543462, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 37154 + }, + { + "epoch": 0.37155, + "grad_norm": 0.7385872446560864, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 37155 + }, + { + "epoch": 0.37156, + "grad_norm": 0.755554420884673, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 37156 + }, + { + "epoch": 0.37157, + "grad_norm": 0.7858625264354155, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 37157 + }, + { + "epoch": 0.37158, + "grad_norm": 0.6971555440905154, + "learning_rate": 0.003, + "loss": 4.041, + "step": 37158 + }, + { + "epoch": 0.37159, + "grad_norm": 0.6681596396844943, + "learning_rate": 0.003, + "loss": 4.024, + "step": 37159 + }, + { + "epoch": 0.3716, + "grad_norm": 0.6837750356896083, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 37160 + }, + { + "epoch": 0.37161, + "grad_norm": 0.7660650762366301, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 37161 + }, + { + "epoch": 0.37162, + "grad_norm": 0.846543509769823, + "learning_rate": 0.003, + "loss": 4.027, + "step": 37162 + }, + { + "epoch": 0.37163, + "grad_norm": 0.8759400486663071, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 37163 + }, + { + "epoch": 0.37164, + "grad_norm": 1.071689182421946, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 37164 + }, + { + "epoch": 0.37165, + "grad_norm": 0.929264527160591, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 37165 + }, + { + "epoch": 0.37166, + "grad_norm": 0.7935014840572291, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 37166 + }, + { + "epoch": 0.37167, + "grad_norm": 0.6440415340967791, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 37167 + }, + { + "epoch": 0.37168, + "grad_norm": 0.6563796390134409, + "learning_rate": 0.003, + "loss": 4.007, + "step": 37168 + }, + { + "epoch": 0.37169, + "grad_norm": 0.6915363217874849, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 37169 + }, + { + "epoch": 0.3717, + "grad_norm": 0.7093961105334079, + "learning_rate": 0.003, + "loss": 4.04, + "step": 37170 + }, + { + "epoch": 0.37171, + "grad_norm": 0.674913564038487, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 37171 + }, + { + "epoch": 0.37172, + "grad_norm": 0.7820618743916685, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 37172 + }, + { + "epoch": 0.37173, + "grad_norm": 0.8642372242688119, + "learning_rate": 0.003, + "loss": 4.012, + "step": 37173 + }, + { + "epoch": 0.37174, + "grad_norm": 0.8112837269348807, + "learning_rate": 0.003, + "loss": 4.007, + "step": 37174 + }, + { + "epoch": 0.37175, + "grad_norm": 0.7492817596748073, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 37175 + }, + { + "epoch": 0.37176, + "grad_norm": 0.836322493013719, + "learning_rate": 0.003, + "loss": 4.027, + "step": 37176 + }, + { + "epoch": 0.37177, + "grad_norm": 0.8439708154150387, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 37177 + }, + { + "epoch": 0.37178, + "grad_norm": 0.8336479604118431, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 37178 + }, + { + "epoch": 0.37179, + "grad_norm": 0.8315455971101943, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 37179 + }, + { + "epoch": 0.3718, + "grad_norm": 0.9338501137982366, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 37180 + }, + { + "epoch": 0.37181, + "grad_norm": 0.9101036824899702, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 37181 + }, + { + "epoch": 0.37182, + "grad_norm": 0.8775923288048372, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 37182 + }, + { + "epoch": 0.37183, + "grad_norm": 0.7474539001449064, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 37183 + }, + { + "epoch": 0.37184, + "grad_norm": 0.6979414116068486, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 37184 + }, + { + "epoch": 0.37185, + "grad_norm": 0.6966310967803946, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 37185 + }, + { + "epoch": 0.37186, + "grad_norm": 0.8063937829843812, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 37186 + }, + { + "epoch": 0.37187, + "grad_norm": 0.9426913409183622, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 37187 + }, + { + "epoch": 0.37188, + "grad_norm": 1.0623464856272915, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 37188 + }, + { + "epoch": 0.37189, + "grad_norm": 0.8214881509086915, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 37189 + }, + { + "epoch": 0.3719, + "grad_norm": 0.7137755027363656, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 37190 + }, + { + "epoch": 0.37191, + "grad_norm": 0.6809271692840132, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 37191 + }, + { + "epoch": 0.37192, + "grad_norm": 0.5679693958901039, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 37192 + }, + { + "epoch": 0.37193, + "grad_norm": 0.6222048591934564, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 37193 + }, + { + "epoch": 0.37194, + "grad_norm": 0.639388725262434, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 37194 + }, + { + "epoch": 0.37195, + "grad_norm": 0.6785008169819589, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 37195 + }, + { + "epoch": 0.37196, + "grad_norm": 0.7169508290306912, + "learning_rate": 0.003, + "loss": 3.9992, + "step": 37196 + }, + { + "epoch": 0.37197, + "grad_norm": 0.7709955220021772, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 37197 + }, + { + "epoch": 0.37198, + "grad_norm": 0.80317468505242, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 37198 + }, + { + "epoch": 0.37199, + "grad_norm": 0.7983018511201572, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 37199 + }, + { + "epoch": 0.372, + "grad_norm": 0.8900743585830748, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 37200 + }, + { + "epoch": 0.37201, + "grad_norm": 0.977175759815707, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 37201 + }, + { + "epoch": 0.37202, + "grad_norm": 1.0471016242331448, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 37202 + }, + { + "epoch": 0.37203, + "grad_norm": 1.1792920517279444, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 37203 + }, + { + "epoch": 0.37204, + "grad_norm": 1.104281950027656, + "learning_rate": 0.003, + "loss": 4.042, + "step": 37204 + }, + { + "epoch": 0.37205, + "grad_norm": 1.028410055084292, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 37205 + }, + { + "epoch": 0.37206, + "grad_norm": 0.9200169797106809, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 37206 + }, + { + "epoch": 0.37207, + "grad_norm": 0.7807173281343259, + "learning_rate": 0.003, + "loss": 4.019, + "step": 37207 + }, + { + "epoch": 0.37208, + "grad_norm": 0.6860729320803782, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 37208 + }, + { + "epoch": 0.37209, + "grad_norm": 0.6318301787989773, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 37209 + }, + { + "epoch": 0.3721, + "grad_norm": 0.696455131471472, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 37210 + }, + { + "epoch": 0.37211, + "grad_norm": 0.7643789555192715, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 37211 + }, + { + "epoch": 0.37212, + "grad_norm": 0.7903491628971998, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 37212 + }, + { + "epoch": 0.37213, + "grad_norm": 0.8019860177603912, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 37213 + }, + { + "epoch": 0.37214, + "grad_norm": 0.8299861399224494, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 37214 + }, + { + "epoch": 0.37215, + "grad_norm": 0.9322110890037677, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 37215 + }, + { + "epoch": 0.37216, + "grad_norm": 1.1555157431654348, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 37216 + }, + { + "epoch": 0.37217, + "grad_norm": 1.073399177810289, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 37217 + }, + { + "epoch": 0.37218, + "grad_norm": 1.1038644154645207, + "learning_rate": 0.003, + "loss": 4.025, + "step": 37218 + }, + { + "epoch": 0.37219, + "grad_norm": 0.8957816283987804, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 37219 + }, + { + "epoch": 0.3722, + "grad_norm": 0.8358854562103764, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 37220 + }, + { + "epoch": 0.37221, + "grad_norm": 0.9369684916830848, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 37221 + }, + { + "epoch": 0.37222, + "grad_norm": 0.9209131501231349, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 37222 + }, + { + "epoch": 0.37223, + "grad_norm": 0.8416903669599163, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 37223 + }, + { + "epoch": 0.37224, + "grad_norm": 0.7783983607143455, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 37224 + }, + { + "epoch": 0.37225, + "grad_norm": 0.7369884558496173, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 37225 + }, + { + "epoch": 0.37226, + "grad_norm": 0.6902378223997903, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 37226 + }, + { + "epoch": 0.37227, + "grad_norm": 0.6572112523922266, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 37227 + }, + { + "epoch": 0.37228, + "grad_norm": 0.7116342550412896, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 37228 + }, + { + "epoch": 0.37229, + "grad_norm": 0.7106242702273571, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 37229 + }, + { + "epoch": 0.3723, + "grad_norm": 0.7047941815556107, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 37230 + }, + { + "epoch": 0.37231, + "grad_norm": 0.6930972884536747, + "learning_rate": 0.003, + "loss": 3.9978, + "step": 37231 + }, + { + "epoch": 0.37232, + "grad_norm": 0.6379720190954703, + "learning_rate": 0.003, + "loss": 4.032, + "step": 37232 + }, + { + "epoch": 0.37233, + "grad_norm": 0.7740442142405185, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 37233 + }, + { + "epoch": 0.37234, + "grad_norm": 0.8680325983411525, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 37234 + }, + { + "epoch": 0.37235, + "grad_norm": 0.9599437208896167, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 37235 + }, + { + "epoch": 0.37236, + "grad_norm": 1.0576437242401917, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 37236 + }, + { + "epoch": 0.37237, + "grad_norm": 0.9651682408840271, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 37237 + }, + { + "epoch": 0.37238, + "grad_norm": 0.8870502349594004, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 37238 + }, + { + "epoch": 0.37239, + "grad_norm": 0.7822837365433802, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 37239 + }, + { + "epoch": 0.3724, + "grad_norm": 0.7540122357072097, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 37240 + }, + { + "epoch": 0.37241, + "grad_norm": 0.8986219329583064, + "learning_rate": 0.003, + "loss": 4.03, + "step": 37241 + }, + { + "epoch": 0.37242, + "grad_norm": 0.9292933241272716, + "learning_rate": 0.003, + "loss": 4.0791, + "step": 37242 + }, + { + "epoch": 0.37243, + "grad_norm": 0.9043065937748248, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 37243 + }, + { + "epoch": 0.37244, + "grad_norm": 0.9961885556956879, + "learning_rate": 0.003, + "loss": 4.042, + "step": 37244 + }, + { + "epoch": 0.37245, + "grad_norm": 0.9330267563489056, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 37245 + }, + { + "epoch": 0.37246, + "grad_norm": 0.9482872409544085, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 37246 + }, + { + "epoch": 0.37247, + "grad_norm": 1.0323944739940454, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 37247 + }, + { + "epoch": 0.37248, + "grad_norm": 1.1929262045903513, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 37248 + }, + { + "epoch": 0.37249, + "grad_norm": 0.934813540111051, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 37249 + }, + { + "epoch": 0.3725, + "grad_norm": 0.9195684547181885, + "learning_rate": 0.003, + "loss": 4.028, + "step": 37250 + }, + { + "epoch": 0.37251, + "grad_norm": 0.890972341878209, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 37251 + }, + { + "epoch": 0.37252, + "grad_norm": 0.9266364482676206, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 37252 + }, + { + "epoch": 0.37253, + "grad_norm": 0.892237772629654, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 37253 + }, + { + "epoch": 0.37254, + "grad_norm": 0.7153526234161444, + "learning_rate": 0.003, + "loss": 3.9918, + "step": 37254 + }, + { + "epoch": 0.37255, + "grad_norm": 0.6867139738031223, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 37255 + }, + { + "epoch": 0.37256, + "grad_norm": 0.6669123398968904, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 37256 + }, + { + "epoch": 0.37257, + "grad_norm": 0.6076754060533263, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 37257 + }, + { + "epoch": 0.37258, + "grad_norm": 0.6057950555126116, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 37258 + }, + { + "epoch": 0.37259, + "grad_norm": 0.6367267060023399, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 37259 + }, + { + "epoch": 0.3726, + "grad_norm": 0.5786746150507799, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 37260 + }, + { + "epoch": 0.37261, + "grad_norm": 0.6606408596725981, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 37261 + }, + { + "epoch": 0.37262, + "grad_norm": 0.8230697458346458, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 37262 + }, + { + "epoch": 0.37263, + "grad_norm": 1.0654406282695636, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 37263 + }, + { + "epoch": 0.37264, + "grad_norm": 1.138879201622088, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 37264 + }, + { + "epoch": 0.37265, + "grad_norm": 0.9093224155955076, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 37265 + }, + { + "epoch": 0.37266, + "grad_norm": 0.962777537273718, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 37266 + }, + { + "epoch": 0.37267, + "grad_norm": 0.9973794737543036, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 37267 + }, + { + "epoch": 0.37268, + "grad_norm": 1.1939006165301196, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 37268 + }, + { + "epoch": 0.37269, + "grad_norm": 0.8311297513959879, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 37269 + }, + { + "epoch": 0.3727, + "grad_norm": 0.7988541481551534, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 37270 + }, + { + "epoch": 0.37271, + "grad_norm": 0.7734029860995066, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 37271 + }, + { + "epoch": 0.37272, + "grad_norm": 0.7636581559231036, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 37272 + }, + { + "epoch": 0.37273, + "grad_norm": 0.7589138736886513, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 37273 + }, + { + "epoch": 0.37274, + "grad_norm": 0.7220768425862575, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 37274 + }, + { + "epoch": 0.37275, + "grad_norm": 0.7826679652371958, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 37275 + }, + { + "epoch": 0.37276, + "grad_norm": 0.816874505821495, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 37276 + }, + { + "epoch": 0.37277, + "grad_norm": 0.8473724656063945, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 37277 + }, + { + "epoch": 0.37278, + "grad_norm": 0.8896096418612459, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 37278 + }, + { + "epoch": 0.37279, + "grad_norm": 0.8648977983366267, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 37279 + }, + { + "epoch": 0.3728, + "grad_norm": 0.8764501368417124, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 37280 + }, + { + "epoch": 0.37281, + "grad_norm": 0.8300791187535558, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 37281 + }, + { + "epoch": 0.37282, + "grad_norm": 0.8070404257391324, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 37282 + }, + { + "epoch": 0.37283, + "grad_norm": 0.848474039917354, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 37283 + }, + { + "epoch": 0.37284, + "grad_norm": 0.7954050919425447, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 37284 + }, + { + "epoch": 0.37285, + "grad_norm": 0.8920309586472228, + "learning_rate": 0.003, + "loss": 4.038, + "step": 37285 + }, + { + "epoch": 0.37286, + "grad_norm": 1.0320903743043233, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 37286 + }, + { + "epoch": 0.37287, + "grad_norm": 1.0921836719079163, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 37287 + }, + { + "epoch": 0.37288, + "grad_norm": 0.9914108865096539, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 37288 + }, + { + "epoch": 0.37289, + "grad_norm": 1.0410092181783615, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 37289 + }, + { + "epoch": 0.3729, + "grad_norm": 1.0416193835251288, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 37290 + }, + { + "epoch": 0.37291, + "grad_norm": 0.9213104981222298, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 37291 + }, + { + "epoch": 0.37292, + "grad_norm": 0.9568719595089025, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 37292 + }, + { + "epoch": 0.37293, + "grad_norm": 0.897629017433314, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 37293 + }, + { + "epoch": 0.37294, + "grad_norm": 0.9283171109811992, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 37294 + }, + { + "epoch": 0.37295, + "grad_norm": 0.9746666600340341, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 37295 + }, + { + "epoch": 0.37296, + "grad_norm": 0.8199620943102864, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 37296 + }, + { + "epoch": 0.37297, + "grad_norm": 0.6705866096888399, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 37297 + }, + { + "epoch": 0.37298, + "grad_norm": 0.5907344992601682, + "learning_rate": 0.003, + "loss": 4.049, + "step": 37298 + }, + { + "epoch": 0.37299, + "grad_norm": 0.590219252188518, + "learning_rate": 0.003, + "loss": 4.059, + "step": 37299 + }, + { + "epoch": 0.373, + "grad_norm": 0.5542380661334451, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 37300 + }, + { + "epoch": 0.37301, + "grad_norm": 0.5877275052845679, + "learning_rate": 0.003, + "loss": 4.06, + "step": 37301 + }, + { + "epoch": 0.37302, + "grad_norm": 0.6091380983686757, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 37302 + }, + { + "epoch": 0.37303, + "grad_norm": 0.7061190691912086, + "learning_rate": 0.003, + "loss": 3.988, + "step": 37303 + }, + { + "epoch": 0.37304, + "grad_norm": 0.8462639122887342, + "learning_rate": 0.003, + "loss": 4.0001, + "step": 37304 + }, + { + "epoch": 0.37305, + "grad_norm": 0.8869171549749675, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 37305 + }, + { + "epoch": 0.37306, + "grad_norm": 0.8854336475375234, + "learning_rate": 0.003, + "loss": 4.039, + "step": 37306 + }, + { + "epoch": 0.37307, + "grad_norm": 0.9867848323831437, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 37307 + }, + { + "epoch": 0.37308, + "grad_norm": 1.0009276164896659, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 37308 + }, + { + "epoch": 0.37309, + "grad_norm": 0.9404286332732845, + "learning_rate": 0.003, + "loss": 4.064, + "step": 37309 + }, + { + "epoch": 0.3731, + "grad_norm": 0.883240088779619, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 37310 + }, + { + "epoch": 0.37311, + "grad_norm": 0.9821132901589463, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 37311 + }, + { + "epoch": 0.37312, + "grad_norm": 1.1192311856985275, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 37312 + }, + { + "epoch": 0.37313, + "grad_norm": 0.824639553852283, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 37313 + }, + { + "epoch": 0.37314, + "grad_norm": 0.9149112410891053, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 37314 + }, + { + "epoch": 0.37315, + "grad_norm": 0.9250382663938143, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 37315 + }, + { + "epoch": 0.37316, + "grad_norm": 0.8218088901792717, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 37316 + }, + { + "epoch": 0.37317, + "grad_norm": 0.6858873097523065, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 37317 + }, + { + "epoch": 0.37318, + "grad_norm": 0.6396305053415559, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 37318 + }, + { + "epoch": 0.37319, + "grad_norm": 0.6491592145376874, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 37319 + }, + { + "epoch": 0.3732, + "grad_norm": 0.6410040775557807, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 37320 + }, + { + "epoch": 0.37321, + "grad_norm": 0.8741840365235778, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 37321 + }, + { + "epoch": 0.37322, + "grad_norm": 1.1533220283175978, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 37322 + }, + { + "epoch": 0.37323, + "grad_norm": 0.891650616287512, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 37323 + }, + { + "epoch": 0.37324, + "grad_norm": 0.8397803669229454, + "learning_rate": 0.003, + "loss": 4.053, + "step": 37324 + }, + { + "epoch": 0.37325, + "grad_norm": 0.799859163769369, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 37325 + }, + { + "epoch": 0.37326, + "grad_norm": 0.6625329691466774, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 37326 + }, + { + "epoch": 0.37327, + "grad_norm": 0.629082254891581, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 37327 + }, + { + "epoch": 0.37328, + "grad_norm": 0.69988459091357, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 37328 + }, + { + "epoch": 0.37329, + "grad_norm": 0.7684094080787037, + "learning_rate": 0.003, + "loss": 4.05, + "step": 37329 + }, + { + "epoch": 0.3733, + "grad_norm": 0.7833428841698753, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 37330 + }, + { + "epoch": 0.37331, + "grad_norm": 0.838382067416353, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 37331 + }, + { + "epoch": 0.37332, + "grad_norm": 0.9532423799583066, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 37332 + }, + { + "epoch": 0.37333, + "grad_norm": 1.0251036298306089, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 37333 + }, + { + "epoch": 0.37334, + "grad_norm": 0.9428727995970947, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 37334 + }, + { + "epoch": 0.37335, + "grad_norm": 0.8914579935421869, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 37335 + }, + { + "epoch": 0.37336, + "grad_norm": 0.9027561633200494, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 37336 + }, + { + "epoch": 0.37337, + "grad_norm": 0.909744198282722, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 37337 + }, + { + "epoch": 0.37338, + "grad_norm": 0.9207585359112589, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 37338 + }, + { + "epoch": 0.37339, + "grad_norm": 0.8404812031146547, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 37339 + }, + { + "epoch": 0.3734, + "grad_norm": 0.8221447143194633, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 37340 + }, + { + "epoch": 0.37341, + "grad_norm": 0.7539742137749685, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 37341 + }, + { + "epoch": 0.37342, + "grad_norm": 0.614466487572327, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 37342 + }, + { + "epoch": 0.37343, + "grad_norm": 0.5826447953491977, + "learning_rate": 0.003, + "loss": 4.033, + "step": 37343 + }, + { + "epoch": 0.37344, + "grad_norm": 0.5677892326444612, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 37344 + }, + { + "epoch": 0.37345, + "grad_norm": 0.5427867221039834, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 37345 + }, + { + "epoch": 0.37346, + "grad_norm": 0.49484241765358494, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 37346 + }, + { + "epoch": 0.37347, + "grad_norm": 0.5106816786778667, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 37347 + }, + { + "epoch": 0.37348, + "grad_norm": 0.5082765565027845, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 37348 + }, + { + "epoch": 0.37349, + "grad_norm": 0.5723653869145743, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 37349 + }, + { + "epoch": 0.3735, + "grad_norm": 0.705745399961748, + "learning_rate": 0.003, + "loss": 4.014, + "step": 37350 + }, + { + "epoch": 0.37351, + "grad_norm": 1.105974658082321, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 37351 + }, + { + "epoch": 0.37352, + "grad_norm": 1.3000123209069814, + "learning_rate": 0.003, + "loss": 4.018, + "step": 37352 + }, + { + "epoch": 0.37353, + "grad_norm": 0.5834502516709862, + "learning_rate": 0.003, + "loss": 4.052, + "step": 37353 + }, + { + "epoch": 0.37354, + "grad_norm": 0.6794615713993719, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 37354 + }, + { + "epoch": 0.37355, + "grad_norm": 0.8149031597716241, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 37355 + }, + { + "epoch": 0.37356, + "grad_norm": 0.8250982938657954, + "learning_rate": 0.003, + "loss": 4.021, + "step": 37356 + }, + { + "epoch": 0.37357, + "grad_norm": 0.7602107962232251, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 37357 + }, + { + "epoch": 0.37358, + "grad_norm": 0.812222462457722, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 37358 + }, + { + "epoch": 0.37359, + "grad_norm": 0.8915222565308769, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 37359 + }, + { + "epoch": 0.3736, + "grad_norm": 1.0594090290014686, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 37360 + }, + { + "epoch": 0.37361, + "grad_norm": 1.0392425907493907, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 37361 + }, + { + "epoch": 0.37362, + "grad_norm": 1.006781113534545, + "learning_rate": 0.003, + "loss": 4.029, + "step": 37362 + }, + { + "epoch": 0.37363, + "grad_norm": 0.8707341508266692, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 37363 + }, + { + "epoch": 0.37364, + "grad_norm": 0.6760418697069269, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 37364 + }, + { + "epoch": 0.37365, + "grad_norm": 0.7115564377652395, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 37365 + }, + { + "epoch": 0.37366, + "grad_norm": 0.6729554842026572, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 37366 + }, + { + "epoch": 0.37367, + "grad_norm": 0.7172155016650612, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 37367 + }, + { + "epoch": 0.37368, + "grad_norm": 0.7928857008424459, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 37368 + }, + { + "epoch": 0.37369, + "grad_norm": 0.9282360578003364, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 37369 + }, + { + "epoch": 0.3737, + "grad_norm": 1.0612855622890331, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 37370 + }, + { + "epoch": 0.37371, + "grad_norm": 0.8769654222821622, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 37371 + }, + { + "epoch": 0.37372, + "grad_norm": 0.8772593014231973, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 37372 + }, + { + "epoch": 0.37373, + "grad_norm": 0.7979111435358638, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 37373 + }, + { + "epoch": 0.37374, + "grad_norm": 0.856728691420882, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 37374 + }, + { + "epoch": 0.37375, + "grad_norm": 0.8633457970401035, + "learning_rate": 0.003, + "loss": 4.057, + "step": 37375 + }, + { + "epoch": 0.37376, + "grad_norm": 0.836754437848753, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 37376 + }, + { + "epoch": 0.37377, + "grad_norm": 0.8500252812276431, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 37377 + }, + { + "epoch": 0.37378, + "grad_norm": 0.8381954540672671, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 37378 + }, + { + "epoch": 0.37379, + "grad_norm": 0.93957028918571, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 37379 + }, + { + "epoch": 0.3738, + "grad_norm": 1.0314919256218769, + "learning_rate": 0.003, + "loss": 4.0812, + "step": 37380 + }, + { + "epoch": 0.37381, + "grad_norm": 1.055841732902265, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 37381 + }, + { + "epoch": 0.37382, + "grad_norm": 0.9924612250591697, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 37382 + }, + { + "epoch": 0.37383, + "grad_norm": 1.0189987748388825, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 37383 + }, + { + "epoch": 0.37384, + "grad_norm": 0.8396009159905143, + "learning_rate": 0.003, + "loss": 4.035, + "step": 37384 + }, + { + "epoch": 0.37385, + "grad_norm": 0.7090500235915402, + "learning_rate": 0.003, + "loss": 4.025, + "step": 37385 + }, + { + "epoch": 0.37386, + "grad_norm": 0.7187384279374334, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 37386 + }, + { + "epoch": 0.37387, + "grad_norm": 0.897484743334476, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 37387 + }, + { + "epoch": 0.37388, + "grad_norm": 0.9593114826990868, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 37388 + }, + { + "epoch": 0.37389, + "grad_norm": 0.8700066399054105, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 37389 + }, + { + "epoch": 0.3739, + "grad_norm": 0.732982820060023, + "learning_rate": 0.003, + "loss": 4.037, + "step": 37390 + }, + { + "epoch": 0.37391, + "grad_norm": 0.713653447357444, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 37391 + }, + { + "epoch": 0.37392, + "grad_norm": 0.6932681448363297, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 37392 + }, + { + "epoch": 0.37393, + "grad_norm": 0.8032039527868667, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 37393 + }, + { + "epoch": 0.37394, + "grad_norm": 0.9110192580899377, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 37394 + }, + { + "epoch": 0.37395, + "grad_norm": 0.9001874479532983, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 37395 + }, + { + "epoch": 0.37396, + "grad_norm": 0.9432578982862065, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 37396 + }, + { + "epoch": 0.37397, + "grad_norm": 0.9105515701372648, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 37397 + }, + { + "epoch": 0.37398, + "grad_norm": 0.854403519311065, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 37398 + }, + { + "epoch": 0.37399, + "grad_norm": 0.8541336386532217, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 37399 + }, + { + "epoch": 0.374, + "grad_norm": 0.8832892884299615, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 37400 + }, + { + "epoch": 0.37401, + "grad_norm": 0.9670206554396424, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 37401 + }, + { + "epoch": 0.37402, + "grad_norm": 0.8892100459357518, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 37402 + }, + { + "epoch": 0.37403, + "grad_norm": 0.8325610300246417, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 37403 + }, + { + "epoch": 0.37404, + "grad_norm": 0.9429350123332383, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 37404 + }, + { + "epoch": 0.37405, + "grad_norm": 0.9052535029365827, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 37405 + }, + { + "epoch": 0.37406, + "grad_norm": 0.7762715816711013, + "learning_rate": 0.003, + "loss": 4.04, + "step": 37406 + }, + { + "epoch": 0.37407, + "grad_norm": 0.7682621002887531, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 37407 + }, + { + "epoch": 0.37408, + "grad_norm": 0.7231166745457597, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 37408 + }, + { + "epoch": 0.37409, + "grad_norm": 0.7143936766331732, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 37409 + }, + { + "epoch": 0.3741, + "grad_norm": 0.7046484028029898, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 37410 + }, + { + "epoch": 0.37411, + "grad_norm": 0.8086352598351025, + "learning_rate": 0.003, + "loss": 4.034, + "step": 37411 + }, + { + "epoch": 0.37412, + "grad_norm": 0.9302475263452856, + "learning_rate": 0.003, + "loss": 4.021, + "step": 37412 + }, + { + "epoch": 0.37413, + "grad_norm": 0.8860242169478997, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 37413 + }, + { + "epoch": 0.37414, + "grad_norm": 0.8808503125308336, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 37414 + }, + { + "epoch": 0.37415, + "grad_norm": 0.8243328352881817, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 37415 + }, + { + "epoch": 0.37416, + "grad_norm": 0.7468863313110524, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 37416 + }, + { + "epoch": 0.37417, + "grad_norm": 0.6754105507567836, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 37417 + }, + { + "epoch": 0.37418, + "grad_norm": 0.6693577968934361, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 37418 + }, + { + "epoch": 0.37419, + "grad_norm": 0.5805132888735003, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 37419 + }, + { + "epoch": 0.3742, + "grad_norm": 0.5695062371919876, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 37420 + }, + { + "epoch": 0.37421, + "grad_norm": 0.543275402358237, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 37421 + }, + { + "epoch": 0.37422, + "grad_norm": 0.6573092465682859, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 37422 + }, + { + "epoch": 0.37423, + "grad_norm": 0.7292255198654456, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 37423 + }, + { + "epoch": 0.37424, + "grad_norm": 1.0073481836095786, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 37424 + }, + { + "epoch": 0.37425, + "grad_norm": 1.2299326255172323, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 37425 + }, + { + "epoch": 0.37426, + "grad_norm": 0.7426369584025264, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 37426 + }, + { + "epoch": 0.37427, + "grad_norm": 0.7002417945910894, + "learning_rate": 0.003, + "loss": 4.0024, + "step": 37427 + }, + { + "epoch": 0.37428, + "grad_norm": 0.7420746176064452, + "learning_rate": 0.003, + "loss": 4.005, + "step": 37428 + }, + { + "epoch": 0.37429, + "grad_norm": 0.7724745617325697, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 37429 + }, + { + "epoch": 0.3743, + "grad_norm": 0.8610091372579207, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 37430 + }, + { + "epoch": 0.37431, + "grad_norm": 0.9919330180000544, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 37431 + }, + { + "epoch": 0.37432, + "grad_norm": 0.9330410899697827, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 37432 + }, + { + "epoch": 0.37433, + "grad_norm": 0.8253716441156383, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 37433 + }, + { + "epoch": 0.37434, + "grad_norm": 0.7250187250618241, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 37434 + }, + { + "epoch": 0.37435, + "grad_norm": 0.8914798469277033, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 37435 + }, + { + "epoch": 0.37436, + "grad_norm": 0.8607559127508151, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 37436 + }, + { + "epoch": 0.37437, + "grad_norm": 0.8949951469751166, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 37437 + }, + { + "epoch": 0.37438, + "grad_norm": 1.0265793991782268, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 37438 + }, + { + "epoch": 0.37439, + "grad_norm": 0.9795514723572359, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 37439 + }, + { + "epoch": 0.3744, + "grad_norm": 0.904904906082824, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 37440 + }, + { + "epoch": 0.37441, + "grad_norm": 0.8494398567056931, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 37441 + }, + { + "epoch": 0.37442, + "grad_norm": 0.8647070544523665, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 37442 + }, + { + "epoch": 0.37443, + "grad_norm": 0.7648109731942588, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 37443 + }, + { + "epoch": 0.37444, + "grad_norm": 0.6960102724968272, + "learning_rate": 0.003, + "loss": 3.9871, + "step": 37444 + }, + { + "epoch": 0.37445, + "grad_norm": 0.7147647127088884, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 37445 + }, + { + "epoch": 0.37446, + "grad_norm": 0.8318423308914575, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 37446 + }, + { + "epoch": 0.37447, + "grad_norm": 0.9235218190397486, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 37447 + }, + { + "epoch": 0.37448, + "grad_norm": 1.1185336832345982, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 37448 + }, + { + "epoch": 0.37449, + "grad_norm": 0.9006731876550081, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 37449 + }, + { + "epoch": 0.3745, + "grad_norm": 0.7271485110149148, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 37450 + }, + { + "epoch": 0.37451, + "grad_norm": 0.7835232801448622, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 37451 + }, + { + "epoch": 0.37452, + "grad_norm": 0.782690967708232, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 37452 + }, + { + "epoch": 0.37453, + "grad_norm": 0.7310986919537943, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 37453 + }, + { + "epoch": 0.37454, + "grad_norm": 0.7049356226943826, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 37454 + }, + { + "epoch": 0.37455, + "grad_norm": 0.7123548165203444, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 37455 + }, + { + "epoch": 0.37456, + "grad_norm": 0.7040187469314381, + "learning_rate": 0.003, + "loss": 4.017, + "step": 37456 + }, + { + "epoch": 0.37457, + "grad_norm": 0.7024186713973382, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 37457 + }, + { + "epoch": 0.37458, + "grad_norm": 0.6985420835349003, + "learning_rate": 0.003, + "loss": 4.036, + "step": 37458 + }, + { + "epoch": 0.37459, + "grad_norm": 0.7818386145247619, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 37459 + }, + { + "epoch": 0.3746, + "grad_norm": 0.7507339267308656, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 37460 + }, + { + "epoch": 0.37461, + "grad_norm": 0.683871965109301, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 37461 + }, + { + "epoch": 0.37462, + "grad_norm": 0.7202742423257792, + "learning_rate": 0.003, + "loss": 3.9981, + "step": 37462 + }, + { + "epoch": 0.37463, + "grad_norm": 0.8594877290348256, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 37463 + }, + { + "epoch": 0.37464, + "grad_norm": 1.188262582945787, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 37464 + }, + { + "epoch": 0.37465, + "grad_norm": 1.1527846191642155, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 37465 + }, + { + "epoch": 0.37466, + "grad_norm": 0.7239679631679342, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 37466 + }, + { + "epoch": 0.37467, + "grad_norm": 0.6590123801775236, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 37467 + }, + { + "epoch": 0.37468, + "grad_norm": 0.7089174319778961, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 37468 + }, + { + "epoch": 0.37469, + "grad_norm": 0.8128909428907152, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 37469 + }, + { + "epoch": 0.3747, + "grad_norm": 0.884466426974962, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 37470 + }, + { + "epoch": 0.37471, + "grad_norm": 0.7479273977872127, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 37471 + }, + { + "epoch": 0.37472, + "grad_norm": 0.7694239697850882, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 37472 + }, + { + "epoch": 0.37473, + "grad_norm": 0.7151931169209578, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 37473 + }, + { + "epoch": 0.37474, + "grad_norm": 0.6423243656917309, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 37474 + }, + { + "epoch": 0.37475, + "grad_norm": 0.6936488102242215, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 37475 + }, + { + "epoch": 0.37476, + "grad_norm": 0.8573681058087189, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 37476 + }, + { + "epoch": 0.37477, + "grad_norm": 1.0337637189346558, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 37477 + }, + { + "epoch": 0.37478, + "grad_norm": 1.0282783223593748, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 37478 + }, + { + "epoch": 0.37479, + "grad_norm": 0.8384668510414325, + "learning_rate": 0.003, + "loss": 4.031, + "step": 37479 + }, + { + "epoch": 0.3748, + "grad_norm": 0.9238717842067391, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 37480 + }, + { + "epoch": 0.37481, + "grad_norm": 1.0827745240833897, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 37481 + }, + { + "epoch": 0.37482, + "grad_norm": 1.0103573224978095, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 37482 + }, + { + "epoch": 0.37483, + "grad_norm": 0.9298209697274913, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 37483 + }, + { + "epoch": 0.37484, + "grad_norm": 0.9149646884467838, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 37484 + }, + { + "epoch": 0.37485, + "grad_norm": 0.9202898916384488, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 37485 + }, + { + "epoch": 0.37486, + "grad_norm": 0.9230751522845801, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 37486 + }, + { + "epoch": 0.37487, + "grad_norm": 1.0508650743959118, + "learning_rate": 0.003, + "loss": 4.09, + "step": 37487 + }, + { + "epoch": 0.37488, + "grad_norm": 1.1752178667808022, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 37488 + }, + { + "epoch": 0.37489, + "grad_norm": 1.0155800180323946, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 37489 + }, + { + "epoch": 0.3749, + "grad_norm": 1.0265226077231162, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 37490 + }, + { + "epoch": 0.37491, + "grad_norm": 1.0409602658131196, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 37491 + }, + { + "epoch": 0.37492, + "grad_norm": 0.905210047289567, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 37492 + }, + { + "epoch": 0.37493, + "grad_norm": 0.7371049206765783, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 37493 + }, + { + "epoch": 0.37494, + "grad_norm": 0.7282244617538497, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 37494 + }, + { + "epoch": 0.37495, + "grad_norm": 0.7194673809814468, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 37495 + }, + { + "epoch": 0.37496, + "grad_norm": 0.7770345626144923, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 37496 + }, + { + "epoch": 0.37497, + "grad_norm": 0.7553748716108316, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 37497 + }, + { + "epoch": 0.37498, + "grad_norm": 0.7746113799115552, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 37498 + }, + { + "epoch": 0.37499, + "grad_norm": 0.7543452532550091, + "learning_rate": 0.003, + "loss": 4.032, + "step": 37499 + }, + { + "epoch": 0.375, + "grad_norm": 0.6571095754313954, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 37500 + }, + { + "epoch": 0.37501, + "grad_norm": 0.6514085121629847, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 37501 + }, + { + "epoch": 0.37502, + "grad_norm": 0.5632994956220885, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 37502 + }, + { + "epoch": 0.37503, + "grad_norm": 0.6105458177270732, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 37503 + }, + { + "epoch": 0.37504, + "grad_norm": 0.624611225429587, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 37504 + }, + { + "epoch": 0.37505, + "grad_norm": 0.6860232852272028, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 37505 + }, + { + "epoch": 0.37506, + "grad_norm": 0.6590729197812425, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 37506 + }, + { + "epoch": 0.37507, + "grad_norm": 0.831175545895381, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 37507 + }, + { + "epoch": 0.37508, + "grad_norm": 1.0304619140698417, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 37508 + }, + { + "epoch": 0.37509, + "grad_norm": 1.2204747421919169, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 37509 + }, + { + "epoch": 0.3751, + "grad_norm": 0.8319782803559228, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 37510 + }, + { + "epoch": 0.37511, + "grad_norm": 0.6949421958742619, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 37511 + }, + { + "epoch": 0.37512, + "grad_norm": 0.756041055757322, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 37512 + }, + { + "epoch": 0.37513, + "grad_norm": 0.7147542501804656, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 37513 + }, + { + "epoch": 0.37514, + "grad_norm": 0.6871650438661071, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 37514 + }, + { + "epoch": 0.37515, + "grad_norm": 0.6877362228904349, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 37515 + }, + { + "epoch": 0.37516, + "grad_norm": 0.6222658131755776, + "learning_rate": 0.003, + "loss": 3.9791, + "step": 37516 + }, + { + "epoch": 0.37517, + "grad_norm": 0.7581187447463089, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 37517 + }, + { + "epoch": 0.37518, + "grad_norm": 0.9705053471469556, + "learning_rate": 0.003, + "loss": 4.042, + "step": 37518 + }, + { + "epoch": 0.37519, + "grad_norm": 1.2001103378896594, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 37519 + }, + { + "epoch": 0.3752, + "grad_norm": 0.7394836333754037, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 37520 + }, + { + "epoch": 0.37521, + "grad_norm": 0.6340189992425552, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 37521 + }, + { + "epoch": 0.37522, + "grad_norm": 0.7196060974660567, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 37522 + }, + { + "epoch": 0.37523, + "grad_norm": 0.7411774120356314, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 37523 + }, + { + "epoch": 0.37524, + "grad_norm": 0.7196520902710362, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 37524 + }, + { + "epoch": 0.37525, + "grad_norm": 0.8006530529832556, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 37525 + }, + { + "epoch": 0.37526, + "grad_norm": 0.9160781653894673, + "learning_rate": 0.003, + "loss": 4.087, + "step": 37526 + }, + { + "epoch": 0.37527, + "grad_norm": 0.9573831843372403, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 37527 + }, + { + "epoch": 0.37528, + "grad_norm": 0.998455058095277, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 37528 + }, + { + "epoch": 0.37529, + "grad_norm": 1.0882591779131316, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 37529 + }, + { + "epoch": 0.3753, + "grad_norm": 0.7897584958450138, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 37530 + }, + { + "epoch": 0.37531, + "grad_norm": 0.691411562437372, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 37531 + }, + { + "epoch": 0.37532, + "grad_norm": 0.6510626165880816, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 37532 + }, + { + "epoch": 0.37533, + "grad_norm": 0.6223898016366122, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 37533 + }, + { + "epoch": 0.37534, + "grad_norm": 0.579815714841355, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 37534 + }, + { + "epoch": 0.37535, + "grad_norm": 0.6439533219795027, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 37535 + }, + { + "epoch": 0.37536, + "grad_norm": 0.8093883476480864, + "learning_rate": 0.003, + "loss": 4.024, + "step": 37536 + }, + { + "epoch": 0.37537, + "grad_norm": 0.884633877674498, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 37537 + }, + { + "epoch": 0.37538, + "grad_norm": 0.9469515187455004, + "learning_rate": 0.003, + "loss": 4.032, + "step": 37538 + }, + { + "epoch": 0.37539, + "grad_norm": 1.0435539120749504, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 37539 + }, + { + "epoch": 0.3754, + "grad_norm": 1.0034751231505146, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 37540 + }, + { + "epoch": 0.37541, + "grad_norm": 0.8905837702498453, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 37541 + }, + { + "epoch": 0.37542, + "grad_norm": 0.8833113874127112, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 37542 + }, + { + "epoch": 0.37543, + "grad_norm": 0.8234130103534916, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 37543 + }, + { + "epoch": 0.37544, + "grad_norm": 0.7552670552536902, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 37544 + }, + { + "epoch": 0.37545, + "grad_norm": 0.9180492619183117, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 37545 + }, + { + "epoch": 0.37546, + "grad_norm": 0.984013283815246, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 37546 + }, + { + "epoch": 0.37547, + "grad_norm": 1.0711513621298876, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 37547 + }, + { + "epoch": 0.37548, + "grad_norm": 0.9083142300174752, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 37548 + }, + { + "epoch": 0.37549, + "grad_norm": 0.9440993804347689, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 37549 + }, + { + "epoch": 0.3755, + "grad_norm": 0.964718158883265, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 37550 + }, + { + "epoch": 0.37551, + "grad_norm": 1.0099692145981096, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 37551 + }, + { + "epoch": 0.37552, + "grad_norm": 0.9401746197355978, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 37552 + }, + { + "epoch": 0.37553, + "grad_norm": 0.8808521604640808, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 37553 + }, + { + "epoch": 0.37554, + "grad_norm": 0.871982525130244, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 37554 + }, + { + "epoch": 0.37555, + "grad_norm": 0.7830571318444631, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 37555 + }, + { + "epoch": 0.37556, + "grad_norm": 0.7529448264354272, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 37556 + }, + { + "epoch": 0.37557, + "grad_norm": 0.8179011190458095, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 37557 + }, + { + "epoch": 0.37558, + "grad_norm": 0.9046297865281512, + "learning_rate": 0.003, + "loss": 4.039, + "step": 37558 + }, + { + "epoch": 0.37559, + "grad_norm": 1.0089290595895164, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 37559 + }, + { + "epoch": 0.3756, + "grad_norm": 1.1314437587390573, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 37560 + }, + { + "epoch": 0.37561, + "grad_norm": 1.0211397065333854, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 37561 + }, + { + "epoch": 0.37562, + "grad_norm": 0.9329492190415246, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 37562 + }, + { + "epoch": 0.37563, + "grad_norm": 0.9122904126058259, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 37563 + }, + { + "epoch": 0.37564, + "grad_norm": 0.8375684647659394, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 37564 + }, + { + "epoch": 0.37565, + "grad_norm": 0.7363774495580871, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 37565 + }, + { + "epoch": 0.37566, + "grad_norm": 0.7484361421135587, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 37566 + }, + { + "epoch": 0.37567, + "grad_norm": 0.6611632623201508, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 37567 + }, + { + "epoch": 0.37568, + "grad_norm": 0.6312207480804349, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 37568 + }, + { + "epoch": 0.37569, + "grad_norm": 0.6387013151296034, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 37569 + }, + { + "epoch": 0.3757, + "grad_norm": 0.72699099629366, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 37570 + }, + { + "epoch": 0.37571, + "grad_norm": 0.7002718580836572, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 37571 + }, + { + "epoch": 0.37572, + "grad_norm": 0.699198737417126, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 37572 + }, + { + "epoch": 0.37573, + "grad_norm": 0.690391610141439, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 37573 + }, + { + "epoch": 0.37574, + "grad_norm": 0.6857592249788078, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 37574 + }, + { + "epoch": 0.37575, + "grad_norm": 0.6713703369138858, + "learning_rate": 0.003, + "loss": 4.028, + "step": 37575 + }, + { + "epoch": 0.37576, + "grad_norm": 0.7392833892704638, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 37576 + }, + { + "epoch": 0.37577, + "grad_norm": 0.7494012428610742, + "learning_rate": 0.003, + "loss": 4.015, + "step": 37577 + }, + { + "epoch": 0.37578, + "grad_norm": 0.6713196304828886, + "learning_rate": 0.003, + "loss": 4.04, + "step": 37578 + }, + { + "epoch": 0.37579, + "grad_norm": 0.6060798737540648, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 37579 + }, + { + "epoch": 0.3758, + "grad_norm": 0.6367911045081753, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 37580 + }, + { + "epoch": 0.37581, + "grad_norm": 0.5955950565872815, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 37581 + }, + { + "epoch": 0.37582, + "grad_norm": 0.6391742541779551, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 37582 + }, + { + "epoch": 0.37583, + "grad_norm": 0.6454409464554238, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 37583 + }, + { + "epoch": 0.37584, + "grad_norm": 0.7347677984984045, + "learning_rate": 0.003, + "loss": 3.9955, + "step": 37584 + }, + { + "epoch": 0.37585, + "grad_norm": 0.8986120340701558, + "learning_rate": 0.003, + "loss": 4.024, + "step": 37585 + }, + { + "epoch": 0.37586, + "grad_norm": 1.1841909267931774, + "learning_rate": 0.003, + "loss": 4.032, + "step": 37586 + }, + { + "epoch": 0.37587, + "grad_norm": 0.9909181524707139, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 37587 + }, + { + "epoch": 0.37588, + "grad_norm": 1.087230278409957, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 37588 + }, + { + "epoch": 0.37589, + "grad_norm": 0.993653547496119, + "learning_rate": 0.003, + "loss": 4.0039, + "step": 37589 + }, + { + "epoch": 0.3759, + "grad_norm": 1.009973252235159, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 37590 + }, + { + "epoch": 0.37591, + "grad_norm": 0.9612499050540169, + "learning_rate": 0.003, + "loss": 4.027, + "step": 37591 + }, + { + "epoch": 0.37592, + "grad_norm": 0.8540801203134153, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 37592 + }, + { + "epoch": 0.37593, + "grad_norm": 0.8269471464323942, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 37593 + }, + { + "epoch": 0.37594, + "grad_norm": 0.8518750661614488, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 37594 + }, + { + "epoch": 0.37595, + "grad_norm": 0.7199701061062878, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 37595 + }, + { + "epoch": 0.37596, + "grad_norm": 0.7875464634466464, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 37596 + }, + { + "epoch": 0.37597, + "grad_norm": 0.7940779714160037, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 37597 + }, + { + "epoch": 0.37598, + "grad_norm": 0.8273428386060693, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 37598 + }, + { + "epoch": 0.37599, + "grad_norm": 0.9034467037347187, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 37599 + }, + { + "epoch": 0.376, + "grad_norm": 1.0563070403068116, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 37600 + }, + { + "epoch": 0.37601, + "grad_norm": 1.0173250547840818, + "learning_rate": 0.003, + "loss": 4.037, + "step": 37601 + }, + { + "epoch": 0.37602, + "grad_norm": 0.8320218124587385, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 37602 + }, + { + "epoch": 0.37603, + "grad_norm": 0.7615880321250837, + "learning_rate": 0.003, + "loss": 4.051, + "step": 37603 + }, + { + "epoch": 0.37604, + "grad_norm": 0.7751133182796713, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 37604 + }, + { + "epoch": 0.37605, + "grad_norm": 0.6787068445519812, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 37605 + }, + { + "epoch": 0.37606, + "grad_norm": 0.6766794767251314, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 37606 + }, + { + "epoch": 0.37607, + "grad_norm": 0.6840177240929115, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 37607 + }, + { + "epoch": 0.37608, + "grad_norm": 0.7148620444456555, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 37608 + }, + { + "epoch": 0.37609, + "grad_norm": 0.7442364003574098, + "learning_rate": 0.003, + "loss": 4.028, + "step": 37609 + }, + { + "epoch": 0.3761, + "grad_norm": 0.7352832237970978, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 37610 + }, + { + "epoch": 0.37611, + "grad_norm": 0.8612933586137825, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 37611 + }, + { + "epoch": 0.37612, + "grad_norm": 1.148326854338885, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 37612 + }, + { + "epoch": 0.37613, + "grad_norm": 0.8798377029466065, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 37613 + }, + { + "epoch": 0.37614, + "grad_norm": 0.8251416073113411, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 37614 + }, + { + "epoch": 0.37615, + "grad_norm": 0.8694067232030369, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 37615 + }, + { + "epoch": 0.37616, + "grad_norm": 0.8583084344075125, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 37616 + }, + { + "epoch": 0.37617, + "grad_norm": 0.8745702490122492, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 37617 + }, + { + "epoch": 0.37618, + "grad_norm": 0.9764774660840724, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 37618 + }, + { + "epoch": 0.37619, + "grad_norm": 1.1072017001223922, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 37619 + }, + { + "epoch": 0.3762, + "grad_norm": 0.8103642441664904, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 37620 + }, + { + "epoch": 0.37621, + "grad_norm": 0.7018187096616972, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 37621 + }, + { + "epoch": 0.37622, + "grad_norm": 0.7492485196265367, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 37622 + }, + { + "epoch": 0.37623, + "grad_norm": 0.7687580994486028, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 37623 + }, + { + "epoch": 0.37624, + "grad_norm": 0.7270891089927314, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 37624 + }, + { + "epoch": 0.37625, + "grad_norm": 0.6764484921698277, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 37625 + }, + { + "epoch": 0.37626, + "grad_norm": 0.678794542800183, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 37626 + }, + { + "epoch": 0.37627, + "grad_norm": 0.6850375027539659, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 37627 + }, + { + "epoch": 0.37628, + "grad_norm": 0.6889274402873606, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 37628 + }, + { + "epoch": 0.37629, + "grad_norm": 0.7482394598119108, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 37629 + }, + { + "epoch": 0.3763, + "grad_norm": 0.9514694192256442, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 37630 + }, + { + "epoch": 0.37631, + "grad_norm": 1.1799242555827858, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 37631 + }, + { + "epoch": 0.37632, + "grad_norm": 0.9908870297281012, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 37632 + }, + { + "epoch": 0.37633, + "grad_norm": 0.9806819887859083, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 37633 + }, + { + "epoch": 0.37634, + "grad_norm": 0.910550564907181, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 37634 + }, + { + "epoch": 0.37635, + "grad_norm": 0.9348030056345659, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 37635 + }, + { + "epoch": 0.37636, + "grad_norm": 0.9710886117555306, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 37636 + }, + { + "epoch": 0.37637, + "grad_norm": 0.9596567438842079, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 37637 + }, + { + "epoch": 0.37638, + "grad_norm": 0.9189916707511182, + "learning_rate": 0.003, + "loss": 4.048, + "step": 37638 + }, + { + "epoch": 0.37639, + "grad_norm": 0.8659245261679109, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 37639 + }, + { + "epoch": 0.3764, + "grad_norm": 0.948783848793536, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 37640 + }, + { + "epoch": 0.37641, + "grad_norm": 0.9854893250736423, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 37641 + }, + { + "epoch": 0.37642, + "grad_norm": 0.9193775055012956, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 37642 + }, + { + "epoch": 0.37643, + "grad_norm": 0.9777898280019939, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 37643 + }, + { + "epoch": 0.37644, + "grad_norm": 0.8847161748467732, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 37644 + }, + { + "epoch": 0.37645, + "grad_norm": 0.9364866729563817, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 37645 + }, + { + "epoch": 0.37646, + "grad_norm": 1.0315022324422916, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 37646 + }, + { + "epoch": 0.37647, + "grad_norm": 1.0150311375763044, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 37647 + }, + { + "epoch": 0.37648, + "grad_norm": 1.012364235834434, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 37648 + }, + { + "epoch": 0.37649, + "grad_norm": 0.9012484116981886, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 37649 + }, + { + "epoch": 0.3765, + "grad_norm": 0.8327076089143673, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 37650 + }, + { + "epoch": 0.37651, + "grad_norm": 0.8314383456883147, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 37651 + }, + { + "epoch": 0.37652, + "grad_norm": 0.8190315106784005, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 37652 + }, + { + "epoch": 0.37653, + "grad_norm": 0.910234806953883, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 37653 + }, + { + "epoch": 0.37654, + "grad_norm": 1.1083146149615841, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 37654 + }, + { + "epoch": 0.37655, + "grad_norm": 1.0104130330156744, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 37655 + }, + { + "epoch": 0.37656, + "grad_norm": 0.8218842052164118, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 37656 + }, + { + "epoch": 0.37657, + "grad_norm": 0.7104982976049554, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 37657 + }, + { + "epoch": 0.37658, + "grad_norm": 0.5740244883866918, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 37658 + }, + { + "epoch": 0.37659, + "grad_norm": 0.636004261416634, + "learning_rate": 0.003, + "loss": 4.043, + "step": 37659 + }, + { + "epoch": 0.3766, + "grad_norm": 0.619022337995547, + "learning_rate": 0.003, + "loss": 3.9998, + "step": 37660 + }, + { + "epoch": 0.37661, + "grad_norm": 0.6318599484585365, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 37661 + }, + { + "epoch": 0.37662, + "grad_norm": 0.6018702935849077, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 37662 + }, + { + "epoch": 0.37663, + "grad_norm": 0.6299347115127599, + "learning_rate": 0.003, + "loss": 4.018, + "step": 37663 + }, + { + "epoch": 0.37664, + "grad_norm": 0.6421261411551767, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 37664 + }, + { + "epoch": 0.37665, + "grad_norm": 0.7417566552839632, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 37665 + }, + { + "epoch": 0.37666, + "grad_norm": 1.0122949909914922, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 37666 + }, + { + "epoch": 0.37667, + "grad_norm": 1.3042545669038021, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 37667 + }, + { + "epoch": 0.37668, + "grad_norm": 0.6400407382331001, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 37668 + }, + { + "epoch": 0.37669, + "grad_norm": 0.6877685317018494, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 37669 + }, + { + "epoch": 0.3767, + "grad_norm": 0.7284162407570458, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 37670 + }, + { + "epoch": 0.37671, + "grad_norm": 0.7200862830360403, + "learning_rate": 0.003, + "loss": 4.018, + "step": 37671 + }, + { + "epoch": 0.37672, + "grad_norm": 0.6722138960187298, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 37672 + }, + { + "epoch": 0.37673, + "grad_norm": 0.6279907327075996, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 37673 + }, + { + "epoch": 0.37674, + "grad_norm": 0.7117220889495574, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 37674 + }, + { + "epoch": 0.37675, + "grad_norm": 0.798753789976739, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 37675 + }, + { + "epoch": 0.37676, + "grad_norm": 0.7916679485930659, + "learning_rate": 0.003, + "loss": 3.9962, + "step": 37676 + }, + { + "epoch": 0.37677, + "grad_norm": 0.6592498981822045, + "learning_rate": 0.003, + "loss": 4.02, + "step": 37677 + }, + { + "epoch": 0.37678, + "grad_norm": 0.6114026698540349, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 37678 + }, + { + "epoch": 0.37679, + "grad_norm": 0.6115708366949691, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 37679 + }, + { + "epoch": 0.3768, + "grad_norm": 0.6115050275278279, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 37680 + }, + { + "epoch": 0.37681, + "grad_norm": 0.712925404594426, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 37681 + }, + { + "epoch": 0.37682, + "grad_norm": 0.8128117941633441, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 37682 + }, + { + "epoch": 0.37683, + "grad_norm": 0.983278440829563, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 37683 + }, + { + "epoch": 0.37684, + "grad_norm": 1.1338239623419075, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 37684 + }, + { + "epoch": 0.37685, + "grad_norm": 0.9131626970080932, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 37685 + }, + { + "epoch": 0.37686, + "grad_norm": 0.91021556817446, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 37686 + }, + { + "epoch": 0.37687, + "grad_norm": 0.9936514497810455, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 37687 + }, + { + "epoch": 0.37688, + "grad_norm": 1.0361387555481782, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 37688 + }, + { + "epoch": 0.37689, + "grad_norm": 1.031045918019694, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 37689 + }, + { + "epoch": 0.3769, + "grad_norm": 0.9713405818848359, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 37690 + }, + { + "epoch": 0.37691, + "grad_norm": 1.0859073634236807, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 37691 + }, + { + "epoch": 0.37692, + "grad_norm": 0.9842322081315643, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 37692 + }, + { + "epoch": 0.37693, + "grad_norm": 1.0075667658975844, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 37693 + }, + { + "epoch": 0.37694, + "grad_norm": 0.8913779261705159, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 37694 + }, + { + "epoch": 0.37695, + "grad_norm": 0.8589953745241203, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 37695 + }, + { + "epoch": 0.37696, + "grad_norm": 0.8558948749004207, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 37696 + }, + { + "epoch": 0.37697, + "grad_norm": 0.7837185983104679, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 37697 + }, + { + "epoch": 0.37698, + "grad_norm": 0.7180383983719246, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 37698 + }, + { + "epoch": 0.37699, + "grad_norm": 0.7169955499148041, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 37699 + }, + { + "epoch": 0.377, + "grad_norm": 0.7062369384805217, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 37700 + }, + { + "epoch": 0.37701, + "grad_norm": 0.7616589454766841, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 37701 + }, + { + "epoch": 0.37702, + "grad_norm": 0.7792529584732423, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 37702 + }, + { + "epoch": 0.37703, + "grad_norm": 0.7356494642316624, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 37703 + }, + { + "epoch": 0.37704, + "grad_norm": 0.7091440805828676, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 37704 + }, + { + "epoch": 0.37705, + "grad_norm": 0.7678503478856169, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 37705 + }, + { + "epoch": 0.37706, + "grad_norm": 0.864596592563754, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 37706 + }, + { + "epoch": 0.37707, + "grad_norm": 0.8977145207313438, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 37707 + }, + { + "epoch": 0.37708, + "grad_norm": 0.8764371117146595, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 37708 + }, + { + "epoch": 0.37709, + "grad_norm": 0.9278859335886231, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 37709 + }, + { + "epoch": 0.3771, + "grad_norm": 0.9845789583660172, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 37710 + }, + { + "epoch": 0.37711, + "grad_norm": 1.1141926445069417, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 37711 + }, + { + "epoch": 0.37712, + "grad_norm": 1.0157203322117996, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 37712 + }, + { + "epoch": 0.37713, + "grad_norm": 1.079167695244436, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 37713 + }, + { + "epoch": 0.37714, + "grad_norm": 0.8624485869988174, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 37714 + }, + { + "epoch": 0.37715, + "grad_norm": 0.9710685140093047, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 37715 + }, + { + "epoch": 0.37716, + "grad_norm": 1.0077198269535126, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 37716 + }, + { + "epoch": 0.37717, + "grad_norm": 0.8422904653214387, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 37717 + }, + { + "epoch": 0.37718, + "grad_norm": 0.8890582510992044, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 37718 + }, + { + "epoch": 0.37719, + "grad_norm": 0.825119985523839, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 37719 + }, + { + "epoch": 0.3772, + "grad_norm": 0.8079966849908495, + "learning_rate": 0.003, + "loss": 4.026, + "step": 37720 + }, + { + "epoch": 0.37721, + "grad_norm": 0.8035822411978809, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 37721 + }, + { + "epoch": 0.37722, + "grad_norm": 0.8092165390406844, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 37722 + }, + { + "epoch": 0.37723, + "grad_norm": 0.8578653544682041, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 37723 + }, + { + "epoch": 0.37724, + "grad_norm": 0.9541049120322085, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 37724 + }, + { + "epoch": 0.37725, + "grad_norm": 0.9967683016862027, + "learning_rate": 0.003, + "loss": 4.039, + "step": 37725 + }, + { + "epoch": 0.37726, + "grad_norm": 0.899741798763165, + "learning_rate": 0.003, + "loss": 4.0038, + "step": 37726 + }, + { + "epoch": 0.37727, + "grad_norm": 0.730817020056801, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 37727 + }, + { + "epoch": 0.37728, + "grad_norm": 0.7639559840163961, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 37728 + }, + { + "epoch": 0.37729, + "grad_norm": 0.825184243756034, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 37729 + }, + { + "epoch": 0.3773, + "grad_norm": 0.9330829661394818, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 37730 + }, + { + "epoch": 0.37731, + "grad_norm": 1.0068399035090771, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 37731 + }, + { + "epoch": 0.37732, + "grad_norm": 1.0636714660742168, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 37732 + }, + { + "epoch": 0.37733, + "grad_norm": 0.7394023151246093, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 37733 + }, + { + "epoch": 0.37734, + "grad_norm": 0.562036376160918, + "learning_rate": 0.003, + "loss": 3.9953, + "step": 37734 + }, + { + "epoch": 0.37735, + "grad_norm": 0.5420889838899315, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 37735 + }, + { + "epoch": 0.37736, + "grad_norm": 0.4972694334181052, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 37736 + }, + { + "epoch": 0.37737, + "grad_norm": 0.5150510979737046, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 37737 + }, + { + "epoch": 0.37738, + "grad_norm": 0.5417765028495114, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 37738 + }, + { + "epoch": 0.37739, + "grad_norm": 0.7111063830849033, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 37739 + }, + { + "epoch": 0.3774, + "grad_norm": 0.905076466924828, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 37740 + }, + { + "epoch": 0.37741, + "grad_norm": 1.0422243281880055, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 37741 + }, + { + "epoch": 0.37742, + "grad_norm": 0.8221081766085205, + "learning_rate": 0.003, + "loss": 4.037, + "step": 37742 + }, + { + "epoch": 0.37743, + "grad_norm": 0.6432772990496624, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 37743 + }, + { + "epoch": 0.37744, + "grad_norm": 0.6772888421239335, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 37744 + }, + { + "epoch": 0.37745, + "grad_norm": 0.7309607309212047, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 37745 + }, + { + "epoch": 0.37746, + "grad_norm": 0.8114517748529685, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 37746 + }, + { + "epoch": 0.37747, + "grad_norm": 0.8452931761639206, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 37747 + }, + { + "epoch": 0.37748, + "grad_norm": 0.8199995478101157, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 37748 + }, + { + "epoch": 0.37749, + "grad_norm": 0.7739537573599824, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 37749 + }, + { + "epoch": 0.3775, + "grad_norm": 0.755335821895632, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 37750 + }, + { + "epoch": 0.37751, + "grad_norm": 0.7908213291113916, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 37751 + }, + { + "epoch": 0.37752, + "grad_norm": 0.7900003418980575, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 37752 + }, + { + "epoch": 0.37753, + "grad_norm": 0.6144736313552209, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 37753 + }, + { + "epoch": 0.37754, + "grad_norm": 0.5852793744718868, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 37754 + }, + { + "epoch": 0.37755, + "grad_norm": 0.6106098643400659, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 37755 + }, + { + "epoch": 0.37756, + "grad_norm": 0.6922330860010059, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 37756 + }, + { + "epoch": 0.37757, + "grad_norm": 0.740125565858685, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 37757 + }, + { + "epoch": 0.37758, + "grad_norm": 0.75096877200921, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 37758 + }, + { + "epoch": 0.37759, + "grad_norm": 0.8881893012678826, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 37759 + }, + { + "epoch": 0.3776, + "grad_norm": 1.0625216660617596, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 37760 + }, + { + "epoch": 0.37761, + "grad_norm": 1.1781634411705668, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 37761 + }, + { + "epoch": 0.37762, + "grad_norm": 0.8147782620007523, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 37762 + }, + { + "epoch": 0.37763, + "grad_norm": 0.730392763912855, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 37763 + }, + { + "epoch": 0.37764, + "grad_norm": 0.7679112077745089, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 37764 + }, + { + "epoch": 0.37765, + "grad_norm": 0.729145601316785, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 37765 + }, + { + "epoch": 0.37766, + "grad_norm": 0.74800822261782, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 37766 + }, + { + "epoch": 0.37767, + "grad_norm": 0.8107669834659945, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 37767 + }, + { + "epoch": 0.37768, + "grad_norm": 0.8944100772908175, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 37768 + }, + { + "epoch": 0.37769, + "grad_norm": 1.0730546414543674, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 37769 + }, + { + "epoch": 0.3777, + "grad_norm": 1.2882394760843672, + "learning_rate": 0.003, + "loss": 4.0842, + "step": 37770 + }, + { + "epoch": 0.37771, + "grad_norm": 0.9574233857475535, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 37771 + }, + { + "epoch": 0.37772, + "grad_norm": 0.9768870611746825, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 37772 + }, + { + "epoch": 0.37773, + "grad_norm": 1.184271521810532, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 37773 + }, + { + "epoch": 0.37774, + "grad_norm": 0.9191078785236543, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 37774 + }, + { + "epoch": 0.37775, + "grad_norm": 0.8413414652927848, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 37775 + }, + { + "epoch": 0.37776, + "grad_norm": 0.7629923084820257, + "learning_rate": 0.003, + "loss": 4.051, + "step": 37776 + }, + { + "epoch": 0.37777, + "grad_norm": 0.6874505491359172, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 37777 + }, + { + "epoch": 0.37778, + "grad_norm": 0.6173659896351688, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 37778 + }, + { + "epoch": 0.37779, + "grad_norm": 0.6744933224259342, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 37779 + }, + { + "epoch": 0.3778, + "grad_norm": 0.7384389628779857, + "learning_rate": 0.003, + "loss": 4.033, + "step": 37780 + }, + { + "epoch": 0.37781, + "grad_norm": 0.9467631805192114, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 37781 + }, + { + "epoch": 0.37782, + "grad_norm": 1.0755569695791596, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 37782 + }, + { + "epoch": 0.37783, + "grad_norm": 0.9056648875433188, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 37783 + }, + { + "epoch": 0.37784, + "grad_norm": 0.7582209983715498, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 37784 + }, + { + "epoch": 0.37785, + "grad_norm": 0.6124027500188279, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 37785 + }, + { + "epoch": 0.37786, + "grad_norm": 0.6050589779213364, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 37786 + }, + { + "epoch": 0.37787, + "grad_norm": 0.5709586509663538, + "learning_rate": 0.003, + "loss": 3.9832, + "step": 37787 + }, + { + "epoch": 0.37788, + "grad_norm": 0.56724026624154, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 37788 + }, + { + "epoch": 0.37789, + "grad_norm": 0.600114539818659, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 37789 + }, + { + "epoch": 0.3779, + "grad_norm": 0.7593385518859307, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 37790 + }, + { + "epoch": 0.37791, + "grad_norm": 1.021234050587808, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 37791 + }, + { + "epoch": 0.37792, + "grad_norm": 1.2166939460349124, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 37792 + }, + { + "epoch": 0.37793, + "grad_norm": 0.7384982429319139, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 37793 + }, + { + "epoch": 0.37794, + "grad_norm": 0.8973342492719393, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 37794 + }, + { + "epoch": 0.37795, + "grad_norm": 1.0142518905128723, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 37795 + }, + { + "epoch": 0.37796, + "grad_norm": 1.0391211104170917, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 37796 + }, + { + "epoch": 0.37797, + "grad_norm": 0.9421693309063429, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 37797 + }, + { + "epoch": 0.37798, + "grad_norm": 0.943141142879319, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 37798 + }, + { + "epoch": 0.37799, + "grad_norm": 0.9699920958753845, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 37799 + }, + { + "epoch": 0.378, + "grad_norm": 0.9772574823755621, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 37800 + }, + { + "epoch": 0.37801, + "grad_norm": 0.9279106000223022, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 37801 + }, + { + "epoch": 0.37802, + "grad_norm": 0.8019937201997274, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 37802 + }, + { + "epoch": 0.37803, + "grad_norm": 0.6782036005184248, + "learning_rate": 0.003, + "loss": 4.05, + "step": 37803 + }, + { + "epoch": 0.37804, + "grad_norm": 0.6943187452250168, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 37804 + }, + { + "epoch": 0.37805, + "grad_norm": 0.6768910310640514, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 37805 + }, + { + "epoch": 0.37806, + "grad_norm": 0.7425601765293703, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 37806 + }, + { + "epoch": 0.37807, + "grad_norm": 0.8611833041213237, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 37807 + }, + { + "epoch": 0.37808, + "grad_norm": 0.9535053983045279, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 37808 + }, + { + "epoch": 0.37809, + "grad_norm": 0.9449041039712404, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 37809 + }, + { + "epoch": 0.3781, + "grad_norm": 0.8179075985320989, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 37810 + }, + { + "epoch": 0.37811, + "grad_norm": 0.9432903637881684, + "learning_rate": 0.003, + "loss": 4.054, + "step": 37811 + }, + { + "epoch": 0.37812, + "grad_norm": 1.0463374043412732, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 37812 + }, + { + "epoch": 0.37813, + "grad_norm": 0.9152299414490683, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 37813 + }, + { + "epoch": 0.37814, + "grad_norm": 0.8051736474634028, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 37814 + }, + { + "epoch": 0.37815, + "grad_norm": 0.9472490658997738, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 37815 + }, + { + "epoch": 0.37816, + "grad_norm": 0.8584510712728743, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 37816 + }, + { + "epoch": 0.37817, + "grad_norm": 0.7755705746941345, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 37817 + }, + { + "epoch": 0.37818, + "grad_norm": 0.7559962586269274, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 37818 + }, + { + "epoch": 0.37819, + "grad_norm": 0.7673962435570226, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 37819 + }, + { + "epoch": 0.3782, + "grad_norm": 0.8115019313925177, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 37820 + }, + { + "epoch": 0.37821, + "grad_norm": 0.8048587150410569, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 37821 + }, + { + "epoch": 0.37822, + "grad_norm": 0.8759328626927918, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 37822 + }, + { + "epoch": 0.37823, + "grad_norm": 0.8552370058608116, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 37823 + }, + { + "epoch": 0.37824, + "grad_norm": 1.0419449197812305, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 37824 + }, + { + "epoch": 0.37825, + "grad_norm": 1.019947613857256, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 37825 + }, + { + "epoch": 0.37826, + "grad_norm": 0.9210141885411367, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 37826 + }, + { + "epoch": 0.37827, + "grad_norm": 0.813762725625581, + "learning_rate": 0.003, + "loss": 4.047, + "step": 37827 + }, + { + "epoch": 0.37828, + "grad_norm": 0.7303595171532996, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 37828 + }, + { + "epoch": 0.37829, + "grad_norm": 0.6791186306094872, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 37829 + }, + { + "epoch": 0.3783, + "grad_norm": 0.7700644699756941, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 37830 + }, + { + "epoch": 0.37831, + "grad_norm": 0.7915424213382575, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 37831 + }, + { + "epoch": 0.37832, + "grad_norm": 0.8224250990603077, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 37832 + }, + { + "epoch": 0.37833, + "grad_norm": 0.8297024122797048, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 37833 + }, + { + "epoch": 0.37834, + "grad_norm": 0.8044633916117644, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 37834 + }, + { + "epoch": 0.37835, + "grad_norm": 1.0060871564720237, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 37835 + }, + { + "epoch": 0.37836, + "grad_norm": 1.2110812013474033, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 37836 + }, + { + "epoch": 0.37837, + "grad_norm": 1.0150063467009658, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 37837 + }, + { + "epoch": 0.37838, + "grad_norm": 0.98737829442125, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 37838 + }, + { + "epoch": 0.37839, + "grad_norm": 0.9305985801368649, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 37839 + }, + { + "epoch": 0.3784, + "grad_norm": 0.8458442474637328, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 37840 + }, + { + "epoch": 0.37841, + "grad_norm": 0.7620657974104771, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 37841 + }, + { + "epoch": 0.37842, + "grad_norm": 0.841592345501659, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 37842 + }, + { + "epoch": 0.37843, + "grad_norm": 0.9224446739722006, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 37843 + }, + { + "epoch": 0.37844, + "grad_norm": 0.9799207662964898, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 37844 + }, + { + "epoch": 0.37845, + "grad_norm": 0.9007917917685929, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 37845 + }, + { + "epoch": 0.37846, + "grad_norm": 0.6324578198795774, + "learning_rate": 0.003, + "loss": 4.032, + "step": 37846 + }, + { + "epoch": 0.37847, + "grad_norm": 0.5266169379612166, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 37847 + }, + { + "epoch": 0.37848, + "grad_norm": 0.5132693730155462, + "learning_rate": 0.003, + "loss": 4.029, + "step": 37848 + }, + { + "epoch": 0.37849, + "grad_norm": 0.5270946231750956, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 37849 + }, + { + "epoch": 0.3785, + "grad_norm": 0.5198280744035592, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 37850 + }, + { + "epoch": 0.37851, + "grad_norm": 0.524552382207101, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 37851 + }, + { + "epoch": 0.37852, + "grad_norm": 0.530302344935297, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 37852 + }, + { + "epoch": 0.37853, + "grad_norm": 0.5249583062210271, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 37853 + }, + { + "epoch": 0.37854, + "grad_norm": 0.5185357608795078, + "learning_rate": 0.003, + "loss": 3.9965, + "step": 37854 + }, + { + "epoch": 0.37855, + "grad_norm": 0.520783409567334, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 37855 + }, + { + "epoch": 0.37856, + "grad_norm": 0.600674677388867, + "learning_rate": 0.003, + "loss": 3.9972, + "step": 37856 + }, + { + "epoch": 0.37857, + "grad_norm": 0.7493315622251875, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 37857 + }, + { + "epoch": 0.37858, + "grad_norm": 0.88000128757248, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 37858 + }, + { + "epoch": 0.37859, + "grad_norm": 1.0007561912095362, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 37859 + }, + { + "epoch": 0.3786, + "grad_norm": 0.9189442839863843, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 37860 + }, + { + "epoch": 0.37861, + "grad_norm": 0.8396326853780633, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 37861 + }, + { + "epoch": 0.37862, + "grad_norm": 0.8434380379268684, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 37862 + }, + { + "epoch": 0.37863, + "grad_norm": 0.8287619800555249, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 37863 + }, + { + "epoch": 0.37864, + "grad_norm": 0.7705897418537342, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 37864 + }, + { + "epoch": 0.37865, + "grad_norm": 0.7998771781732333, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 37865 + }, + { + "epoch": 0.37866, + "grad_norm": 0.8256406695851775, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 37866 + }, + { + "epoch": 0.37867, + "grad_norm": 0.9070819359479616, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 37867 + }, + { + "epoch": 0.37868, + "grad_norm": 0.8706313140995973, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 37868 + }, + { + "epoch": 0.37869, + "grad_norm": 0.9462043601103467, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 37869 + }, + { + "epoch": 0.3787, + "grad_norm": 0.9855131252346617, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 37870 + }, + { + "epoch": 0.37871, + "grad_norm": 0.9258244797379266, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 37871 + }, + { + "epoch": 0.37872, + "grad_norm": 0.9553941341130998, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 37872 + }, + { + "epoch": 0.37873, + "grad_norm": 0.8999636707298843, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 37873 + }, + { + "epoch": 0.37874, + "grad_norm": 0.882886920295795, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 37874 + }, + { + "epoch": 0.37875, + "grad_norm": 0.8186694900254728, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 37875 + }, + { + "epoch": 0.37876, + "grad_norm": 0.7509157235959518, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 37876 + }, + { + "epoch": 0.37877, + "grad_norm": 0.7172852839733488, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 37877 + }, + { + "epoch": 0.37878, + "grad_norm": 0.7611212068644682, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 37878 + }, + { + "epoch": 0.37879, + "grad_norm": 0.9660224215290537, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 37879 + }, + { + "epoch": 0.3788, + "grad_norm": 1.1407124076983453, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 37880 + }, + { + "epoch": 0.37881, + "grad_norm": 0.7220112299657861, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 37881 + }, + { + "epoch": 0.37882, + "grad_norm": 0.6672454294132012, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 37882 + }, + { + "epoch": 0.37883, + "grad_norm": 0.7019729079669768, + "learning_rate": 0.003, + "loss": 4.047, + "step": 37883 + }, + { + "epoch": 0.37884, + "grad_norm": 0.8246627540259504, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 37884 + }, + { + "epoch": 0.37885, + "grad_norm": 0.9179980999170221, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 37885 + }, + { + "epoch": 0.37886, + "grad_norm": 0.8982891033917458, + "learning_rate": 0.003, + "loss": 4.057, + "step": 37886 + }, + { + "epoch": 0.37887, + "grad_norm": 1.108142236422058, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 37887 + }, + { + "epoch": 0.37888, + "grad_norm": 1.1129040059157709, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 37888 + }, + { + "epoch": 0.37889, + "grad_norm": 1.0143793506242949, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 37889 + }, + { + "epoch": 0.3789, + "grad_norm": 1.0692865520180943, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 37890 + }, + { + "epoch": 0.37891, + "grad_norm": 0.9004030320736236, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 37891 + }, + { + "epoch": 0.37892, + "grad_norm": 0.7858955234231492, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 37892 + }, + { + "epoch": 0.37893, + "grad_norm": 0.7978882533277047, + "learning_rate": 0.003, + "loss": 4.035, + "step": 37893 + }, + { + "epoch": 0.37894, + "grad_norm": 0.7198523957161747, + "learning_rate": 0.003, + "loss": 4.045, + "step": 37894 + }, + { + "epoch": 0.37895, + "grad_norm": 0.6555203566908989, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 37895 + }, + { + "epoch": 0.37896, + "grad_norm": 0.6880997871777106, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 37896 + }, + { + "epoch": 0.37897, + "grad_norm": 0.8641058673016213, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 37897 + }, + { + "epoch": 0.37898, + "grad_norm": 0.926608793625859, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 37898 + }, + { + "epoch": 0.37899, + "grad_norm": 1.0166987093814672, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 37899 + }, + { + "epoch": 0.379, + "grad_norm": 1.0301990682185755, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 37900 + }, + { + "epoch": 0.37901, + "grad_norm": 0.8528447848181414, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 37901 + }, + { + "epoch": 0.37902, + "grad_norm": 0.867619462294035, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 37902 + }, + { + "epoch": 0.37903, + "grad_norm": 0.8609860214805394, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 37903 + }, + { + "epoch": 0.37904, + "grad_norm": 0.7773128373373799, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 37904 + }, + { + "epoch": 0.37905, + "grad_norm": 0.6717856132430113, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 37905 + }, + { + "epoch": 0.37906, + "grad_norm": 0.6453158010322619, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 37906 + }, + { + "epoch": 0.37907, + "grad_norm": 0.6908424089297212, + "learning_rate": 0.003, + "loss": 4.052, + "step": 37907 + }, + { + "epoch": 0.37908, + "grad_norm": 0.7657078569393597, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 37908 + }, + { + "epoch": 0.37909, + "grad_norm": 0.8280639166168029, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 37909 + }, + { + "epoch": 0.3791, + "grad_norm": 0.9812866942894802, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 37910 + }, + { + "epoch": 0.37911, + "grad_norm": 1.2441580613377377, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 37911 + }, + { + "epoch": 0.37912, + "grad_norm": 1.027275174198199, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 37912 + }, + { + "epoch": 0.37913, + "grad_norm": 0.9974348423036251, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 37913 + }, + { + "epoch": 0.37914, + "grad_norm": 0.8017140399271049, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 37914 + }, + { + "epoch": 0.37915, + "grad_norm": 0.7872121874207306, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 37915 + }, + { + "epoch": 0.37916, + "grad_norm": 0.74822608221896, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 37916 + }, + { + "epoch": 0.37917, + "grad_norm": 0.748719751612996, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 37917 + }, + { + "epoch": 0.37918, + "grad_norm": 0.7021711928854879, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 37918 + }, + { + "epoch": 0.37919, + "grad_norm": 0.600297337970951, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 37919 + }, + { + "epoch": 0.3792, + "grad_norm": 0.5524014382411252, + "learning_rate": 0.003, + "loss": 3.9949, + "step": 37920 + }, + { + "epoch": 0.37921, + "grad_norm": 0.6725249358359393, + "learning_rate": 0.003, + "loss": 3.995, + "step": 37921 + }, + { + "epoch": 0.37922, + "grad_norm": 0.646954089529615, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 37922 + }, + { + "epoch": 0.37923, + "grad_norm": 0.7013976614626241, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 37923 + }, + { + "epoch": 0.37924, + "grad_norm": 0.7892916089539466, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 37924 + }, + { + "epoch": 0.37925, + "grad_norm": 0.8181666164736521, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 37925 + }, + { + "epoch": 0.37926, + "grad_norm": 0.7757460493569281, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 37926 + }, + { + "epoch": 0.37927, + "grad_norm": 0.7314253581128874, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 37927 + }, + { + "epoch": 0.37928, + "grad_norm": 0.7473773117613608, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 37928 + }, + { + "epoch": 0.37929, + "grad_norm": 0.8569125484654413, + "learning_rate": 0.003, + "loss": 3.9818, + "step": 37929 + }, + { + "epoch": 0.3793, + "grad_norm": 0.8226600212020864, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 37930 + }, + { + "epoch": 0.37931, + "grad_norm": 0.8975970784196584, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 37931 + }, + { + "epoch": 0.37932, + "grad_norm": 0.8785656550078648, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 37932 + }, + { + "epoch": 0.37933, + "grad_norm": 0.7095698685786355, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 37933 + }, + { + "epoch": 0.37934, + "grad_norm": 0.7198081481002753, + "learning_rate": 0.003, + "loss": 4.017, + "step": 37934 + }, + { + "epoch": 0.37935, + "grad_norm": 0.7876596674773649, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 37935 + }, + { + "epoch": 0.37936, + "grad_norm": 0.7366185611147462, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 37936 + }, + { + "epoch": 0.37937, + "grad_norm": 0.7109779004298905, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 37937 + }, + { + "epoch": 0.37938, + "grad_norm": 0.756907485808043, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 37938 + }, + { + "epoch": 0.37939, + "grad_norm": 0.8088549161374166, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 37939 + }, + { + "epoch": 0.3794, + "grad_norm": 0.903179284731745, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 37940 + }, + { + "epoch": 0.37941, + "grad_norm": 0.8399048629253268, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 37941 + }, + { + "epoch": 0.37942, + "grad_norm": 0.923092381283394, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 37942 + }, + { + "epoch": 0.37943, + "grad_norm": 1.0567785867540471, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 37943 + }, + { + "epoch": 0.37944, + "grad_norm": 0.9878709867082754, + "learning_rate": 0.003, + "loss": 4.017, + "step": 37944 + }, + { + "epoch": 0.37945, + "grad_norm": 1.1687658917310313, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 37945 + }, + { + "epoch": 0.37946, + "grad_norm": 1.1165995480279702, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 37946 + }, + { + "epoch": 0.37947, + "grad_norm": 0.8416210317584206, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 37947 + }, + { + "epoch": 0.37948, + "grad_norm": 0.7260156218501111, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 37948 + }, + { + "epoch": 0.37949, + "grad_norm": 0.7274624692357331, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 37949 + }, + { + "epoch": 0.3795, + "grad_norm": 0.831638072152324, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 37950 + }, + { + "epoch": 0.37951, + "grad_norm": 0.8693252134557259, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 37951 + }, + { + "epoch": 0.37952, + "grad_norm": 0.957595255350751, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 37952 + }, + { + "epoch": 0.37953, + "grad_norm": 0.9986681694589681, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 37953 + }, + { + "epoch": 0.37954, + "grad_norm": 1.14670167444976, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 37954 + }, + { + "epoch": 0.37955, + "grad_norm": 0.8345895812333207, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 37955 + }, + { + "epoch": 0.37956, + "grad_norm": 0.6848111156667729, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 37956 + }, + { + "epoch": 0.37957, + "grad_norm": 0.8073726342445503, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 37957 + }, + { + "epoch": 0.37958, + "grad_norm": 0.8893015388864013, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 37958 + }, + { + "epoch": 0.37959, + "grad_norm": 0.8707297305327644, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 37959 + }, + { + "epoch": 0.3796, + "grad_norm": 0.796099725490183, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 37960 + }, + { + "epoch": 0.37961, + "grad_norm": 0.738377654407244, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 37961 + }, + { + "epoch": 0.37962, + "grad_norm": 0.7227782482255434, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 37962 + }, + { + "epoch": 0.37963, + "grad_norm": 0.6482589166641503, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 37963 + }, + { + "epoch": 0.37964, + "grad_norm": 0.671205260182411, + "learning_rate": 0.003, + "loss": 4.028, + "step": 37964 + }, + { + "epoch": 0.37965, + "grad_norm": 0.7102036790480714, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 37965 + }, + { + "epoch": 0.37966, + "grad_norm": 0.6264760079777609, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 37966 + }, + { + "epoch": 0.37967, + "grad_norm": 0.6155808244731666, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 37967 + }, + { + "epoch": 0.37968, + "grad_norm": 0.7008217260570159, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 37968 + }, + { + "epoch": 0.37969, + "grad_norm": 0.7795879497614533, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 37969 + }, + { + "epoch": 0.3797, + "grad_norm": 0.9311560058277254, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 37970 + }, + { + "epoch": 0.37971, + "grad_norm": 1.1619020763766972, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 37971 + }, + { + "epoch": 0.37972, + "grad_norm": 0.8746177125243314, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 37972 + }, + { + "epoch": 0.37973, + "grad_norm": 0.7774746443769637, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 37973 + }, + { + "epoch": 0.37974, + "grad_norm": 0.6602572466327135, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 37974 + }, + { + "epoch": 0.37975, + "grad_norm": 0.7223014039317232, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 37975 + }, + { + "epoch": 0.37976, + "grad_norm": 0.7743102123346235, + "learning_rate": 0.003, + "loss": 4.035, + "step": 37976 + }, + { + "epoch": 0.37977, + "grad_norm": 0.7628915588027909, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 37977 + }, + { + "epoch": 0.37978, + "grad_norm": 0.777198849219127, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 37978 + }, + { + "epoch": 0.37979, + "grad_norm": 0.8374525833471126, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 37979 + }, + { + "epoch": 0.3798, + "grad_norm": 0.980718340783036, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 37980 + }, + { + "epoch": 0.37981, + "grad_norm": 1.1915170585468593, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 37981 + }, + { + "epoch": 0.37982, + "grad_norm": 0.845424785767953, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 37982 + }, + { + "epoch": 0.37983, + "grad_norm": 0.8362707138195129, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 37983 + }, + { + "epoch": 0.37984, + "grad_norm": 0.8470986247116847, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 37984 + }, + { + "epoch": 0.37985, + "grad_norm": 0.7737451611679347, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 37985 + }, + { + "epoch": 0.37986, + "grad_norm": 0.7993987168547607, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 37986 + }, + { + "epoch": 0.37987, + "grad_norm": 0.7427468856600923, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 37987 + }, + { + "epoch": 0.37988, + "grad_norm": 0.8217208937761024, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 37988 + }, + { + "epoch": 0.37989, + "grad_norm": 0.8600545623642734, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 37989 + }, + { + "epoch": 0.3799, + "grad_norm": 0.95105306087866, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 37990 + }, + { + "epoch": 0.37991, + "grad_norm": 1.0428296886461252, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 37991 + }, + { + "epoch": 0.37992, + "grad_norm": 0.8476792598366156, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 37992 + }, + { + "epoch": 0.37993, + "grad_norm": 0.8442414882985252, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 37993 + }, + { + "epoch": 0.37994, + "grad_norm": 1.1114967156934779, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 37994 + }, + { + "epoch": 0.37995, + "grad_norm": 0.9303544417235191, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 37995 + }, + { + "epoch": 0.37996, + "grad_norm": 0.7792932279421183, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 37996 + }, + { + "epoch": 0.37997, + "grad_norm": 0.6858886020568373, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 37997 + }, + { + "epoch": 0.37998, + "grad_norm": 0.7394017092183088, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 37998 + }, + { + "epoch": 0.37999, + "grad_norm": 0.8256216334392498, + "learning_rate": 0.003, + "loss": 3.9997, + "step": 37999 + }, + { + "epoch": 0.38, + "grad_norm": 0.8333939448644541, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 38000 + }, + { + "epoch": 0.38001, + "grad_norm": 0.8953009302780237, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 38001 + }, + { + "epoch": 0.38002, + "grad_norm": 0.9763527657557471, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 38002 + }, + { + "epoch": 0.38003, + "grad_norm": 0.9928473891476166, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 38003 + }, + { + "epoch": 0.38004, + "grad_norm": 1.0147837291029689, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 38004 + }, + { + "epoch": 0.38005, + "grad_norm": 0.9898959763781208, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 38005 + }, + { + "epoch": 0.38006, + "grad_norm": 1.0095609533817962, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 38006 + }, + { + "epoch": 0.38007, + "grad_norm": 0.8335395598753755, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 38007 + }, + { + "epoch": 0.38008, + "grad_norm": 0.6854388612250041, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 38008 + }, + { + "epoch": 0.38009, + "grad_norm": 0.6934766715889513, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 38009 + }, + { + "epoch": 0.3801, + "grad_norm": 0.6314976042984051, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 38010 + }, + { + "epoch": 0.38011, + "grad_norm": 0.7013891125628016, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 38011 + }, + { + "epoch": 0.38012, + "grad_norm": 0.8746679742671459, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 38012 + }, + { + "epoch": 0.38013, + "grad_norm": 1.1007175020290767, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 38013 + }, + { + "epoch": 0.38014, + "grad_norm": 0.9895452413216316, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 38014 + }, + { + "epoch": 0.38015, + "grad_norm": 0.8978116085103468, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 38015 + }, + { + "epoch": 0.38016, + "grad_norm": 0.7182331864724177, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 38016 + }, + { + "epoch": 0.38017, + "grad_norm": 0.6077021714676272, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 38017 + }, + { + "epoch": 0.38018, + "grad_norm": 0.6242562099217163, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 38018 + }, + { + "epoch": 0.38019, + "grad_norm": 0.6215679274980113, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 38019 + }, + { + "epoch": 0.3802, + "grad_norm": 0.7186630713656114, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 38020 + }, + { + "epoch": 0.38021, + "grad_norm": 0.7663177435795043, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 38021 + }, + { + "epoch": 0.38022, + "grad_norm": 0.6632857580524469, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 38022 + }, + { + "epoch": 0.38023, + "grad_norm": 0.6603939116308283, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 38023 + }, + { + "epoch": 0.38024, + "grad_norm": 0.6687997867443858, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 38024 + }, + { + "epoch": 0.38025, + "grad_norm": 0.7119566217306943, + "learning_rate": 0.003, + "loss": 4.035, + "step": 38025 + }, + { + "epoch": 0.38026, + "grad_norm": 0.7721939000313673, + "learning_rate": 0.003, + "loss": 3.9981, + "step": 38026 + }, + { + "epoch": 0.38027, + "grad_norm": 0.9294372110383106, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 38027 + }, + { + "epoch": 0.38028, + "grad_norm": 1.0509353679286377, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 38028 + }, + { + "epoch": 0.38029, + "grad_norm": 1.1101384880406249, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 38029 + }, + { + "epoch": 0.3803, + "grad_norm": 0.9281156744122382, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 38030 + }, + { + "epoch": 0.38031, + "grad_norm": 0.8225194994436442, + "learning_rate": 0.003, + "loss": 3.9991, + "step": 38031 + }, + { + "epoch": 0.38032, + "grad_norm": 0.7701108505459487, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 38032 + }, + { + "epoch": 0.38033, + "grad_norm": 0.8820334925493102, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 38033 + }, + { + "epoch": 0.38034, + "grad_norm": 0.8942314318821706, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 38034 + }, + { + "epoch": 0.38035, + "grad_norm": 1.1179004956727492, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 38035 + }, + { + "epoch": 0.38036, + "grad_norm": 0.9639602578610538, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 38036 + }, + { + "epoch": 0.38037, + "grad_norm": 0.9755106907454453, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 38037 + }, + { + "epoch": 0.38038, + "grad_norm": 0.9834209009495706, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 38038 + }, + { + "epoch": 0.38039, + "grad_norm": 0.918001854905288, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 38039 + }, + { + "epoch": 0.3804, + "grad_norm": 0.7579640784975007, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 38040 + }, + { + "epoch": 0.38041, + "grad_norm": 0.6874678745879964, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 38041 + }, + { + "epoch": 0.38042, + "grad_norm": 0.7065822503144604, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 38042 + }, + { + "epoch": 0.38043, + "grad_norm": 0.6751378550690761, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 38043 + }, + { + "epoch": 0.38044, + "grad_norm": 0.6856763507598632, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 38044 + }, + { + "epoch": 0.38045, + "grad_norm": 0.7833094378331614, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 38045 + }, + { + "epoch": 0.38046, + "grad_norm": 0.8840700537255352, + "learning_rate": 0.003, + "loss": 4.016, + "step": 38046 + }, + { + "epoch": 0.38047, + "grad_norm": 0.9484928803574912, + "learning_rate": 0.003, + "loss": 4.019, + "step": 38047 + }, + { + "epoch": 0.38048, + "grad_norm": 0.9771022007646214, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 38048 + }, + { + "epoch": 0.38049, + "grad_norm": 0.9779483764966423, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 38049 + }, + { + "epoch": 0.3805, + "grad_norm": 0.9080427974327431, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 38050 + }, + { + "epoch": 0.38051, + "grad_norm": 0.8006185083637181, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 38051 + }, + { + "epoch": 0.38052, + "grad_norm": 0.7713322997483543, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 38052 + }, + { + "epoch": 0.38053, + "grad_norm": 0.8408985189964147, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 38053 + }, + { + "epoch": 0.38054, + "grad_norm": 0.8375422621331817, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 38054 + }, + { + "epoch": 0.38055, + "grad_norm": 0.7800595236311234, + "learning_rate": 0.003, + "loss": 4.024, + "step": 38055 + }, + { + "epoch": 0.38056, + "grad_norm": 0.6779958104600039, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 38056 + }, + { + "epoch": 0.38057, + "grad_norm": 0.713361256830058, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 38057 + }, + { + "epoch": 0.38058, + "grad_norm": 0.7660737731645544, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 38058 + }, + { + "epoch": 0.38059, + "grad_norm": 0.8006167222967924, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 38059 + }, + { + "epoch": 0.3806, + "grad_norm": 0.8526537923972892, + "learning_rate": 0.003, + "loss": 4.05, + "step": 38060 + }, + { + "epoch": 0.38061, + "grad_norm": 0.747660955657482, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 38061 + }, + { + "epoch": 0.38062, + "grad_norm": 0.6784078583222053, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 38062 + }, + { + "epoch": 0.38063, + "grad_norm": 0.6804148837869365, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 38063 + }, + { + "epoch": 0.38064, + "grad_norm": 0.6820170793739364, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 38064 + }, + { + "epoch": 0.38065, + "grad_norm": 0.7550632094835972, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 38065 + }, + { + "epoch": 0.38066, + "grad_norm": 0.74322456547571, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 38066 + }, + { + "epoch": 0.38067, + "grad_norm": 0.6417885510601933, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 38067 + }, + { + "epoch": 0.38068, + "grad_norm": 0.6009067245318326, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 38068 + }, + { + "epoch": 0.38069, + "grad_norm": 0.7552806087540272, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 38069 + }, + { + "epoch": 0.3807, + "grad_norm": 1.065422766441588, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 38070 + }, + { + "epoch": 0.38071, + "grad_norm": 1.3379262156042788, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 38071 + }, + { + "epoch": 0.38072, + "grad_norm": 0.6389501140642999, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 38072 + }, + { + "epoch": 0.38073, + "grad_norm": 0.8349729271526691, + "learning_rate": 0.003, + "loss": 3.9971, + "step": 38073 + }, + { + "epoch": 0.38074, + "grad_norm": 0.7491383796159754, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 38074 + }, + { + "epoch": 0.38075, + "grad_norm": 0.7248491169049789, + "learning_rate": 0.003, + "loss": 4.029, + "step": 38075 + }, + { + "epoch": 0.38076, + "grad_norm": 0.7193209762916937, + "learning_rate": 0.003, + "loss": 4.033, + "step": 38076 + }, + { + "epoch": 0.38077, + "grad_norm": 0.6388767911938064, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 38077 + }, + { + "epoch": 0.38078, + "grad_norm": 0.7382270194965593, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 38078 + }, + { + "epoch": 0.38079, + "grad_norm": 0.8733473533666529, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 38079 + }, + { + "epoch": 0.3808, + "grad_norm": 1.0018644295151606, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 38080 + }, + { + "epoch": 0.38081, + "grad_norm": 1.1352535953957597, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 38081 + }, + { + "epoch": 0.38082, + "grad_norm": 0.9195095906882103, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 38082 + }, + { + "epoch": 0.38083, + "grad_norm": 1.0601701627017013, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 38083 + }, + { + "epoch": 0.38084, + "grad_norm": 1.0045603470118933, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 38084 + }, + { + "epoch": 0.38085, + "grad_norm": 0.8206535142964904, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 38085 + }, + { + "epoch": 0.38086, + "grad_norm": 0.7468717646743434, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 38086 + }, + { + "epoch": 0.38087, + "grad_norm": 0.6885804246094234, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 38087 + }, + { + "epoch": 0.38088, + "grad_norm": 0.6702246924934899, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 38088 + }, + { + "epoch": 0.38089, + "grad_norm": 0.7026506898266792, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 38089 + }, + { + "epoch": 0.3809, + "grad_norm": 0.7456543126568986, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 38090 + }, + { + "epoch": 0.38091, + "grad_norm": 0.7373418494967087, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 38091 + }, + { + "epoch": 0.38092, + "grad_norm": 0.7410392805585225, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 38092 + }, + { + "epoch": 0.38093, + "grad_norm": 0.7241086778453847, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 38093 + }, + { + "epoch": 0.38094, + "grad_norm": 0.6011667395277301, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 38094 + }, + { + "epoch": 0.38095, + "grad_norm": 0.6345240082840348, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 38095 + }, + { + "epoch": 0.38096, + "grad_norm": 0.8342638405609026, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 38096 + }, + { + "epoch": 0.38097, + "grad_norm": 1.1610379083451674, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 38097 + }, + { + "epoch": 0.38098, + "grad_norm": 1.1129016272490737, + "learning_rate": 0.003, + "loss": 4.02, + "step": 38098 + }, + { + "epoch": 0.38099, + "grad_norm": 0.8040689935529923, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 38099 + }, + { + "epoch": 0.381, + "grad_norm": 0.7920855609270446, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 38100 + }, + { + "epoch": 0.38101, + "grad_norm": 0.8058558755112945, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 38101 + }, + { + "epoch": 0.38102, + "grad_norm": 0.7733748867105852, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 38102 + }, + { + "epoch": 0.38103, + "grad_norm": 0.7467405084844148, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 38103 + }, + { + "epoch": 0.38104, + "grad_norm": 0.9032926070144481, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 38104 + }, + { + "epoch": 0.38105, + "grad_norm": 1.08533322830696, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 38105 + }, + { + "epoch": 0.38106, + "grad_norm": 0.9817535676093753, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 38106 + }, + { + "epoch": 0.38107, + "grad_norm": 0.9927377406709664, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 38107 + }, + { + "epoch": 0.38108, + "grad_norm": 1.045491223484726, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 38108 + }, + { + "epoch": 0.38109, + "grad_norm": 0.8036805249076374, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 38109 + }, + { + "epoch": 0.3811, + "grad_norm": 0.8530893267036049, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 38110 + }, + { + "epoch": 0.38111, + "grad_norm": 0.8789891854129565, + "learning_rate": 0.003, + "loss": 4.054, + "step": 38111 + }, + { + "epoch": 0.38112, + "grad_norm": 0.897192027065962, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 38112 + }, + { + "epoch": 0.38113, + "grad_norm": 0.9404612245311709, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 38113 + }, + { + "epoch": 0.38114, + "grad_norm": 1.1035113179431244, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 38114 + }, + { + "epoch": 0.38115, + "grad_norm": 1.0456038868966755, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 38115 + }, + { + "epoch": 0.38116, + "grad_norm": 1.00491853028788, + "learning_rate": 0.003, + "loss": 4.038, + "step": 38116 + }, + { + "epoch": 0.38117, + "grad_norm": 0.9970249418795137, + "learning_rate": 0.003, + "loss": 4.08, + "step": 38117 + }, + { + "epoch": 0.38118, + "grad_norm": 0.9845739454537041, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 38118 + }, + { + "epoch": 0.38119, + "grad_norm": 0.8509534056876947, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 38119 + }, + { + "epoch": 0.3812, + "grad_norm": 0.7285381615095929, + "learning_rate": 0.003, + "loss": 4.045, + "step": 38120 + }, + { + "epoch": 0.38121, + "grad_norm": 0.6821746088174604, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 38121 + }, + { + "epoch": 0.38122, + "grad_norm": 0.6340816150424993, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 38122 + }, + { + "epoch": 0.38123, + "grad_norm": 0.5838978187510486, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 38123 + }, + { + "epoch": 0.38124, + "grad_norm": 0.6378387565422664, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 38124 + }, + { + "epoch": 0.38125, + "grad_norm": 0.643610556362872, + "learning_rate": 0.003, + "loss": 4.049, + "step": 38125 + }, + { + "epoch": 0.38126, + "grad_norm": 0.701962969077558, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 38126 + }, + { + "epoch": 0.38127, + "grad_norm": 0.832468003330963, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 38127 + }, + { + "epoch": 0.38128, + "grad_norm": 1.065027716600617, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 38128 + }, + { + "epoch": 0.38129, + "grad_norm": 0.9760071534908009, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 38129 + }, + { + "epoch": 0.3813, + "grad_norm": 0.8268968706511739, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 38130 + }, + { + "epoch": 0.38131, + "grad_norm": 0.7189297300075661, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 38131 + }, + { + "epoch": 0.38132, + "grad_norm": 0.6947767810257015, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 38132 + }, + { + "epoch": 0.38133, + "grad_norm": 0.6561988832975043, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 38133 + }, + { + "epoch": 0.38134, + "grad_norm": 0.6319856147216828, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 38134 + }, + { + "epoch": 0.38135, + "grad_norm": 0.6394544545379184, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 38135 + }, + { + "epoch": 0.38136, + "grad_norm": 0.7107465874286112, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 38136 + }, + { + "epoch": 0.38137, + "grad_norm": 0.8062776391076589, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 38137 + }, + { + "epoch": 0.38138, + "grad_norm": 0.9617629845197929, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 38138 + }, + { + "epoch": 0.38139, + "grad_norm": 1.1207687370806272, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 38139 + }, + { + "epoch": 0.3814, + "grad_norm": 0.9155255653186286, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 38140 + }, + { + "epoch": 0.38141, + "grad_norm": 0.7859574904310275, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 38141 + }, + { + "epoch": 0.38142, + "grad_norm": 0.740427680717545, + "learning_rate": 0.003, + "loss": 4.032, + "step": 38142 + }, + { + "epoch": 0.38143, + "grad_norm": 0.8420079662291743, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 38143 + }, + { + "epoch": 0.38144, + "grad_norm": 0.9673296110259596, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 38144 + }, + { + "epoch": 0.38145, + "grad_norm": 0.8706380725392048, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 38145 + }, + { + "epoch": 0.38146, + "grad_norm": 0.8031121797512694, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 38146 + }, + { + "epoch": 0.38147, + "grad_norm": 0.6634705177819772, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 38147 + }, + { + "epoch": 0.38148, + "grad_norm": 0.6626605251240142, + "learning_rate": 0.003, + "loss": 4.015, + "step": 38148 + }, + { + "epoch": 0.38149, + "grad_norm": 0.7588711705370503, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 38149 + }, + { + "epoch": 0.3815, + "grad_norm": 0.8263261341851263, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 38150 + }, + { + "epoch": 0.38151, + "grad_norm": 0.8847496697923477, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 38151 + }, + { + "epoch": 0.38152, + "grad_norm": 1.0548954318539066, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 38152 + }, + { + "epoch": 0.38153, + "grad_norm": 0.8316184427180402, + "learning_rate": 0.003, + "loss": 3.9891, + "step": 38153 + }, + { + "epoch": 0.38154, + "grad_norm": 0.6976142610431898, + "learning_rate": 0.003, + "loss": 4.059, + "step": 38154 + }, + { + "epoch": 0.38155, + "grad_norm": 0.761493824465131, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 38155 + }, + { + "epoch": 0.38156, + "grad_norm": 0.6875361153876849, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 38156 + }, + { + "epoch": 0.38157, + "grad_norm": 0.7062646006054155, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 38157 + }, + { + "epoch": 0.38158, + "grad_norm": 0.7653484192451224, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 38158 + }, + { + "epoch": 0.38159, + "grad_norm": 1.022728848458359, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 38159 + }, + { + "epoch": 0.3816, + "grad_norm": 1.1062371374780393, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 38160 + }, + { + "epoch": 0.38161, + "grad_norm": 0.8703608009021198, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 38161 + }, + { + "epoch": 0.38162, + "grad_norm": 0.805468723169911, + "learning_rate": 0.003, + "loss": 4.031, + "step": 38162 + }, + { + "epoch": 0.38163, + "grad_norm": 0.8277375406941057, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 38163 + }, + { + "epoch": 0.38164, + "grad_norm": 0.8733400453217404, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 38164 + }, + { + "epoch": 0.38165, + "grad_norm": 0.9809331237339552, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 38165 + }, + { + "epoch": 0.38166, + "grad_norm": 0.800448342115483, + "learning_rate": 0.003, + "loss": 4.08, + "step": 38166 + }, + { + "epoch": 0.38167, + "grad_norm": 0.856678816835041, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 38167 + }, + { + "epoch": 0.38168, + "grad_norm": 0.7750195099195075, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 38168 + }, + { + "epoch": 0.38169, + "grad_norm": 0.7732192462505268, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 38169 + }, + { + "epoch": 0.3817, + "grad_norm": 0.8231293365550609, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 38170 + }, + { + "epoch": 0.38171, + "grad_norm": 0.763616741547376, + "learning_rate": 0.003, + "loss": 4.049, + "step": 38171 + }, + { + "epoch": 0.38172, + "grad_norm": 0.7163299403984057, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 38172 + }, + { + "epoch": 0.38173, + "grad_norm": 0.7414314576337566, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 38173 + }, + { + "epoch": 0.38174, + "grad_norm": 0.8605067317434468, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 38174 + }, + { + "epoch": 0.38175, + "grad_norm": 1.314907551400158, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 38175 + }, + { + "epoch": 0.38176, + "grad_norm": 0.9602504448725443, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 38176 + }, + { + "epoch": 0.38177, + "grad_norm": 0.9536503640601574, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 38177 + }, + { + "epoch": 0.38178, + "grad_norm": 0.9665003756597079, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 38178 + }, + { + "epoch": 0.38179, + "grad_norm": 0.8974082887009657, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 38179 + }, + { + "epoch": 0.3818, + "grad_norm": 0.8859534727322472, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 38180 + }, + { + "epoch": 0.38181, + "grad_norm": 0.8431998500933936, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 38181 + }, + { + "epoch": 0.38182, + "grad_norm": 0.8388422844474259, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 38182 + }, + { + "epoch": 0.38183, + "grad_norm": 0.8261328566974073, + "learning_rate": 0.003, + "loss": 4.046, + "step": 38183 + }, + { + "epoch": 0.38184, + "grad_norm": 0.9003564088245252, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 38184 + }, + { + "epoch": 0.38185, + "grad_norm": 1.003965389431221, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 38185 + }, + { + "epoch": 0.38186, + "grad_norm": 1.170519644531306, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 38186 + }, + { + "epoch": 0.38187, + "grad_norm": 0.6687993841290404, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 38187 + }, + { + "epoch": 0.38188, + "grad_norm": 0.6476366007296755, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 38188 + }, + { + "epoch": 0.38189, + "grad_norm": 0.6132537143394041, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 38189 + }, + { + "epoch": 0.3819, + "grad_norm": 0.6507584520515515, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 38190 + }, + { + "epoch": 0.38191, + "grad_norm": 0.6449260105268226, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 38191 + }, + { + "epoch": 0.38192, + "grad_norm": 0.6379325401825436, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 38192 + }, + { + "epoch": 0.38193, + "grad_norm": 0.6277269511774636, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 38193 + }, + { + "epoch": 0.38194, + "grad_norm": 0.7150882690640369, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 38194 + }, + { + "epoch": 0.38195, + "grad_norm": 0.9440905893907634, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 38195 + }, + { + "epoch": 0.38196, + "grad_norm": 1.1129658789234709, + "learning_rate": 0.003, + "loss": 4.047, + "step": 38196 + }, + { + "epoch": 0.38197, + "grad_norm": 0.9015291265912871, + "learning_rate": 0.003, + "loss": 3.9903, + "step": 38197 + }, + { + "epoch": 0.38198, + "grad_norm": 0.7691333685113845, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 38198 + }, + { + "epoch": 0.38199, + "grad_norm": 0.7118548279454094, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 38199 + }, + { + "epoch": 0.382, + "grad_norm": 0.78517755917031, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 38200 + }, + { + "epoch": 0.38201, + "grad_norm": 0.8175909581180227, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 38201 + }, + { + "epoch": 0.38202, + "grad_norm": 0.8088230115355497, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 38202 + }, + { + "epoch": 0.38203, + "grad_norm": 0.8160943943387331, + "learning_rate": 0.003, + "loss": 3.9999, + "step": 38203 + }, + { + "epoch": 0.38204, + "grad_norm": 0.8562910890220424, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 38204 + }, + { + "epoch": 0.38205, + "grad_norm": 0.7588551773442546, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 38205 + }, + { + "epoch": 0.38206, + "grad_norm": 0.7361264368694087, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 38206 + }, + { + "epoch": 0.38207, + "grad_norm": 0.9606733512799424, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 38207 + }, + { + "epoch": 0.38208, + "grad_norm": 1.1622713370665512, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 38208 + }, + { + "epoch": 0.38209, + "grad_norm": 1.0087326490571946, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 38209 + }, + { + "epoch": 0.3821, + "grad_norm": 0.9281286629665868, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 38210 + }, + { + "epoch": 0.38211, + "grad_norm": 0.9101830434973647, + "learning_rate": 0.003, + "loss": 4.0713, + "step": 38211 + }, + { + "epoch": 0.38212, + "grad_norm": 0.9677771421167225, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 38212 + }, + { + "epoch": 0.38213, + "grad_norm": 0.9543214848103911, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 38213 + }, + { + "epoch": 0.38214, + "grad_norm": 0.8877246036600248, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 38214 + }, + { + "epoch": 0.38215, + "grad_norm": 0.8291365415602149, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 38215 + }, + { + "epoch": 0.38216, + "grad_norm": 0.7649374104975208, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 38216 + }, + { + "epoch": 0.38217, + "grad_norm": 0.7806903710554224, + "learning_rate": 0.003, + "loss": 4.035, + "step": 38217 + }, + { + "epoch": 0.38218, + "grad_norm": 0.6990499791425989, + "learning_rate": 0.003, + "loss": 4.002, + "step": 38218 + }, + { + "epoch": 0.38219, + "grad_norm": 0.6025004003308435, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 38219 + }, + { + "epoch": 0.3822, + "grad_norm": 0.5994460612321049, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 38220 + }, + { + "epoch": 0.38221, + "grad_norm": 0.6154776249101559, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 38221 + }, + { + "epoch": 0.38222, + "grad_norm": 0.5576724717754409, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 38222 + }, + { + "epoch": 0.38223, + "grad_norm": 0.5654164905992014, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 38223 + }, + { + "epoch": 0.38224, + "grad_norm": 0.6263624729185215, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 38224 + }, + { + "epoch": 0.38225, + "grad_norm": 0.5988096580870717, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 38225 + }, + { + "epoch": 0.38226, + "grad_norm": 0.7004853786211319, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 38226 + }, + { + "epoch": 0.38227, + "grad_norm": 0.878930915715756, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 38227 + }, + { + "epoch": 0.38228, + "grad_norm": 1.1633267143571222, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 38228 + }, + { + "epoch": 0.38229, + "grad_norm": 0.9334669159686126, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 38229 + }, + { + "epoch": 0.3823, + "grad_norm": 0.883004902702577, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 38230 + }, + { + "epoch": 0.38231, + "grad_norm": 0.859039781438227, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 38231 + }, + { + "epoch": 0.38232, + "grad_norm": 0.872710898157416, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 38232 + }, + { + "epoch": 0.38233, + "grad_norm": 0.9177408572196086, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 38233 + }, + { + "epoch": 0.38234, + "grad_norm": 0.8626695745066131, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 38234 + }, + { + "epoch": 0.38235, + "grad_norm": 0.7497009620750489, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 38235 + }, + { + "epoch": 0.38236, + "grad_norm": 0.8384866182934206, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 38236 + }, + { + "epoch": 0.38237, + "grad_norm": 0.8706915876037862, + "learning_rate": 0.003, + "loss": 4.017, + "step": 38237 + }, + { + "epoch": 0.38238, + "grad_norm": 0.8450130642598656, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 38238 + }, + { + "epoch": 0.38239, + "grad_norm": 0.7214434935861455, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 38239 + }, + { + "epoch": 0.3824, + "grad_norm": 0.6984162819151052, + "learning_rate": 0.003, + "loss": 3.9853, + "step": 38240 + }, + { + "epoch": 0.38241, + "grad_norm": 0.7638310904007192, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 38241 + }, + { + "epoch": 0.38242, + "grad_norm": 0.9145569173529132, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 38242 + }, + { + "epoch": 0.38243, + "grad_norm": 1.2537281327639183, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 38243 + }, + { + "epoch": 0.38244, + "grad_norm": 0.902046625787553, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 38244 + }, + { + "epoch": 0.38245, + "grad_norm": 0.7687433202283129, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 38245 + }, + { + "epoch": 0.38246, + "grad_norm": 0.6853916062756973, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 38246 + }, + { + "epoch": 0.38247, + "grad_norm": 0.7072487332544143, + "learning_rate": 0.003, + "loss": 4.026, + "step": 38247 + }, + { + "epoch": 0.38248, + "grad_norm": 0.6826328869699361, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 38248 + }, + { + "epoch": 0.38249, + "grad_norm": 0.6236758619849694, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 38249 + }, + { + "epoch": 0.3825, + "grad_norm": 0.5667219827668587, + "learning_rate": 0.003, + "loss": 4.046, + "step": 38250 + }, + { + "epoch": 0.38251, + "grad_norm": 0.5633365706339813, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 38251 + }, + { + "epoch": 0.38252, + "grad_norm": 0.574154861058642, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 38252 + }, + { + "epoch": 0.38253, + "grad_norm": 0.6326818959584596, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 38253 + }, + { + "epoch": 0.38254, + "grad_norm": 0.7633988019314266, + "learning_rate": 0.003, + "loss": 4.035, + "step": 38254 + }, + { + "epoch": 0.38255, + "grad_norm": 0.9228703443545998, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 38255 + }, + { + "epoch": 0.38256, + "grad_norm": 1.0183061813645695, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 38256 + }, + { + "epoch": 0.38257, + "grad_norm": 0.8158289938074268, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 38257 + }, + { + "epoch": 0.38258, + "grad_norm": 0.652155199691517, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 38258 + }, + { + "epoch": 0.38259, + "grad_norm": 0.5996172347692865, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 38259 + }, + { + "epoch": 0.3826, + "grad_norm": 0.6264222841012916, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 38260 + }, + { + "epoch": 0.38261, + "grad_norm": 0.6215455899518391, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 38261 + }, + { + "epoch": 0.38262, + "grad_norm": 0.613267327418831, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 38262 + }, + { + "epoch": 0.38263, + "grad_norm": 0.7143219836700865, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 38263 + }, + { + "epoch": 0.38264, + "grad_norm": 0.7073467390322806, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 38264 + }, + { + "epoch": 0.38265, + "grad_norm": 0.6629084289308758, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 38265 + }, + { + "epoch": 0.38266, + "grad_norm": 0.7467953906869944, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 38266 + }, + { + "epoch": 0.38267, + "grad_norm": 0.8639840016807362, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 38267 + }, + { + "epoch": 0.38268, + "grad_norm": 1.0584664263784687, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 38268 + }, + { + "epoch": 0.38269, + "grad_norm": 1.1562626328805585, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 38269 + }, + { + "epoch": 0.3827, + "grad_norm": 0.9948381764033294, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 38270 + }, + { + "epoch": 0.38271, + "grad_norm": 1.0045957399582301, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 38271 + }, + { + "epoch": 0.38272, + "grad_norm": 1.0901298829063626, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 38272 + }, + { + "epoch": 0.38273, + "grad_norm": 1.0380470599019243, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 38273 + }, + { + "epoch": 0.38274, + "grad_norm": 0.8712928460451816, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 38274 + }, + { + "epoch": 0.38275, + "grad_norm": 0.8967985044589531, + "learning_rate": 0.003, + "loss": 4.042, + "step": 38275 + }, + { + "epoch": 0.38276, + "grad_norm": 0.936726672679494, + "learning_rate": 0.003, + "loss": 3.9998, + "step": 38276 + }, + { + "epoch": 0.38277, + "grad_norm": 1.0543752005987141, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 38277 + }, + { + "epoch": 0.38278, + "grad_norm": 1.038468972649628, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 38278 + }, + { + "epoch": 0.38279, + "grad_norm": 1.0447909721108448, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 38279 + }, + { + "epoch": 0.3828, + "grad_norm": 0.9836604220572613, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 38280 + }, + { + "epoch": 0.38281, + "grad_norm": 0.907735063716453, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 38281 + }, + { + "epoch": 0.38282, + "grad_norm": 0.8908357644925228, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 38282 + }, + { + "epoch": 0.38283, + "grad_norm": 0.8179045557856334, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 38283 + }, + { + "epoch": 0.38284, + "grad_norm": 0.8021178606617618, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 38284 + }, + { + "epoch": 0.38285, + "grad_norm": 0.8339261064923779, + "learning_rate": 0.003, + "loss": 4.058, + "step": 38285 + }, + { + "epoch": 0.38286, + "grad_norm": 0.9018473459281032, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 38286 + }, + { + "epoch": 0.38287, + "grad_norm": 1.1418216355343147, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 38287 + }, + { + "epoch": 0.38288, + "grad_norm": 1.0580929791429916, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 38288 + }, + { + "epoch": 0.38289, + "grad_norm": 0.9139096933223557, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 38289 + }, + { + "epoch": 0.3829, + "grad_norm": 0.8430118077206638, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 38290 + }, + { + "epoch": 0.38291, + "grad_norm": 0.7300175152190269, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 38291 + }, + { + "epoch": 0.38292, + "grad_norm": 0.7324412471448691, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 38292 + }, + { + "epoch": 0.38293, + "grad_norm": 0.8390342263569496, + "learning_rate": 0.003, + "loss": 4.044, + "step": 38293 + }, + { + "epoch": 0.38294, + "grad_norm": 0.9193062657279968, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 38294 + }, + { + "epoch": 0.38295, + "grad_norm": 0.9807984634007391, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 38295 + }, + { + "epoch": 0.38296, + "grad_norm": 1.0136233735332427, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 38296 + }, + { + "epoch": 0.38297, + "grad_norm": 0.8747192852197898, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 38297 + }, + { + "epoch": 0.38298, + "grad_norm": 0.8208925569448884, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 38298 + }, + { + "epoch": 0.38299, + "grad_norm": 0.732128156988727, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 38299 + }, + { + "epoch": 0.383, + "grad_norm": 0.8486562600489408, + "learning_rate": 0.003, + "loss": 4.013, + "step": 38300 + }, + { + "epoch": 0.38301, + "grad_norm": 0.9600953890700886, + "learning_rate": 0.003, + "loss": 4.036, + "step": 38301 + }, + { + "epoch": 0.38302, + "grad_norm": 1.0274513495914694, + "learning_rate": 0.003, + "loss": 4.021, + "step": 38302 + }, + { + "epoch": 0.38303, + "grad_norm": 0.8731304108908561, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 38303 + }, + { + "epoch": 0.38304, + "grad_norm": 0.7004242087667257, + "learning_rate": 0.003, + "loss": 4.066, + "step": 38304 + }, + { + "epoch": 0.38305, + "grad_norm": 0.7318336096461876, + "learning_rate": 0.003, + "loss": 3.9977, + "step": 38305 + }, + { + "epoch": 0.38306, + "grad_norm": 0.7076118906848037, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 38306 + }, + { + "epoch": 0.38307, + "grad_norm": 0.5873059372288422, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 38307 + }, + { + "epoch": 0.38308, + "grad_norm": 0.6742141318681139, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 38308 + }, + { + "epoch": 0.38309, + "grad_norm": 0.7169958739587303, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 38309 + }, + { + "epoch": 0.3831, + "grad_norm": 0.6978776472976926, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 38310 + }, + { + "epoch": 0.38311, + "grad_norm": 0.7476510540816808, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 38311 + }, + { + "epoch": 0.38312, + "grad_norm": 0.88238566644499, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 38312 + }, + { + "epoch": 0.38313, + "grad_norm": 1.020566672201757, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 38313 + }, + { + "epoch": 0.38314, + "grad_norm": 1.069116600020821, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 38314 + }, + { + "epoch": 0.38315, + "grad_norm": 0.9114718962433317, + "learning_rate": 0.003, + "loss": 4.049, + "step": 38315 + }, + { + "epoch": 0.38316, + "grad_norm": 0.8308704841719874, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 38316 + }, + { + "epoch": 0.38317, + "grad_norm": 0.7343669983599562, + "learning_rate": 0.003, + "loss": 4.006, + "step": 38317 + }, + { + "epoch": 0.38318, + "grad_norm": 0.7221398472586807, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 38318 + }, + { + "epoch": 0.38319, + "grad_norm": 0.7793273916051927, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 38319 + }, + { + "epoch": 0.3832, + "grad_norm": 0.8192746829187295, + "learning_rate": 0.003, + "loss": 4.034, + "step": 38320 + }, + { + "epoch": 0.38321, + "grad_norm": 0.8716827283170723, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 38321 + }, + { + "epoch": 0.38322, + "grad_norm": 0.9823526253479445, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 38322 + }, + { + "epoch": 0.38323, + "grad_norm": 1.0009987501530366, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 38323 + }, + { + "epoch": 0.38324, + "grad_norm": 1.0584404515247603, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 38324 + }, + { + "epoch": 0.38325, + "grad_norm": 0.9904545912907484, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 38325 + }, + { + "epoch": 0.38326, + "grad_norm": 0.9596538359474809, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 38326 + }, + { + "epoch": 0.38327, + "grad_norm": 1.0140286378551824, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 38327 + }, + { + "epoch": 0.38328, + "grad_norm": 0.9856310696806114, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 38328 + }, + { + "epoch": 0.38329, + "grad_norm": 0.9096030715935448, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 38329 + }, + { + "epoch": 0.3833, + "grad_norm": 0.8876948734848753, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 38330 + }, + { + "epoch": 0.38331, + "grad_norm": 0.8727920342191605, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 38331 + }, + { + "epoch": 0.38332, + "grad_norm": 0.8345956786499664, + "learning_rate": 0.003, + "loss": 4.041, + "step": 38332 + }, + { + "epoch": 0.38333, + "grad_norm": 0.7007944660770661, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 38333 + }, + { + "epoch": 0.38334, + "grad_norm": 0.701481433039187, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 38334 + }, + { + "epoch": 0.38335, + "grad_norm": 0.6649045613690813, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 38335 + }, + { + "epoch": 0.38336, + "grad_norm": 0.7698826154147883, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 38336 + }, + { + "epoch": 0.38337, + "grad_norm": 0.9832759543771118, + "learning_rate": 0.003, + "loss": 4.043, + "step": 38337 + }, + { + "epoch": 0.38338, + "grad_norm": 1.0339977401315303, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 38338 + }, + { + "epoch": 0.38339, + "grad_norm": 0.8305627035395318, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 38339 + }, + { + "epoch": 0.3834, + "grad_norm": 0.764188469113583, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 38340 + }, + { + "epoch": 0.38341, + "grad_norm": 0.831442180802809, + "learning_rate": 0.003, + "loss": 4.051, + "step": 38341 + }, + { + "epoch": 0.38342, + "grad_norm": 0.9065237522166487, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 38342 + }, + { + "epoch": 0.38343, + "grad_norm": 0.9323884992743262, + "learning_rate": 0.003, + "loss": 4.051, + "step": 38343 + }, + { + "epoch": 0.38344, + "grad_norm": 1.0867226605281097, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 38344 + }, + { + "epoch": 0.38345, + "grad_norm": 0.8949984525895092, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 38345 + }, + { + "epoch": 0.38346, + "grad_norm": 0.7689918505170829, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 38346 + }, + { + "epoch": 0.38347, + "grad_norm": 0.6622399582343097, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 38347 + }, + { + "epoch": 0.38348, + "grad_norm": 0.7285565789817915, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 38348 + }, + { + "epoch": 0.38349, + "grad_norm": 0.7623716582678902, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 38349 + }, + { + "epoch": 0.3835, + "grad_norm": 0.7420710275539159, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 38350 + }, + { + "epoch": 0.38351, + "grad_norm": 0.5527449792242131, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 38351 + }, + { + "epoch": 0.38352, + "grad_norm": 0.5079480114939919, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 38352 + }, + { + "epoch": 0.38353, + "grad_norm": 0.544444168421759, + "learning_rate": 0.003, + "loss": 4.015, + "step": 38353 + }, + { + "epoch": 0.38354, + "grad_norm": 0.5707117403951282, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 38354 + }, + { + "epoch": 0.38355, + "grad_norm": 0.6495337812400409, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 38355 + }, + { + "epoch": 0.38356, + "grad_norm": 0.7124335014302726, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 38356 + }, + { + "epoch": 0.38357, + "grad_norm": 0.7017462129141766, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 38357 + }, + { + "epoch": 0.38358, + "grad_norm": 0.7574510220695566, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 38358 + }, + { + "epoch": 0.38359, + "grad_norm": 0.8734043827742205, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 38359 + }, + { + "epoch": 0.3836, + "grad_norm": 0.9682576596548667, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 38360 + }, + { + "epoch": 0.38361, + "grad_norm": 1.0586802357967222, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 38361 + }, + { + "epoch": 0.38362, + "grad_norm": 0.9317684273695579, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 38362 + }, + { + "epoch": 0.38363, + "grad_norm": 0.8699939216679229, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 38363 + }, + { + "epoch": 0.38364, + "grad_norm": 1.0500445934900249, + "learning_rate": 0.003, + "loss": 4.0897, + "step": 38364 + }, + { + "epoch": 0.38365, + "grad_norm": 1.0856211754629281, + "learning_rate": 0.003, + "loss": 4.018, + "step": 38365 + }, + { + "epoch": 0.38366, + "grad_norm": 0.9435100782931793, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 38366 + }, + { + "epoch": 0.38367, + "grad_norm": 0.9593406099354385, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 38367 + }, + { + "epoch": 0.38368, + "grad_norm": 0.9440784718207625, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 38368 + }, + { + "epoch": 0.38369, + "grad_norm": 0.8686360073456583, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 38369 + }, + { + "epoch": 0.3837, + "grad_norm": 0.8804186442461308, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 38370 + }, + { + "epoch": 0.38371, + "grad_norm": 0.8279674089038523, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 38371 + }, + { + "epoch": 0.38372, + "grad_norm": 0.9480611649875347, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 38372 + }, + { + "epoch": 0.38373, + "grad_norm": 0.971484109622593, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 38373 + }, + { + "epoch": 0.38374, + "grad_norm": 1.0171956415539414, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 38374 + }, + { + "epoch": 0.38375, + "grad_norm": 1.0715426498991882, + "learning_rate": 0.003, + "loss": 4.072, + "step": 38375 + }, + { + "epoch": 0.38376, + "grad_norm": 1.0091126413841105, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 38376 + }, + { + "epoch": 0.38377, + "grad_norm": 0.9918600457411877, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 38377 + }, + { + "epoch": 0.38378, + "grad_norm": 0.9570186985681394, + "learning_rate": 0.003, + "loss": 4.0918, + "step": 38378 + }, + { + "epoch": 0.38379, + "grad_norm": 0.8904836908345013, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 38379 + }, + { + "epoch": 0.3838, + "grad_norm": 0.8141916412620356, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 38380 + }, + { + "epoch": 0.38381, + "grad_norm": 0.7766883666731607, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 38381 + }, + { + "epoch": 0.38382, + "grad_norm": 0.8105599029609607, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 38382 + }, + { + "epoch": 0.38383, + "grad_norm": 0.8324466183698255, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 38383 + }, + { + "epoch": 0.38384, + "grad_norm": 0.7889439863228936, + "learning_rate": 0.003, + "loss": 4.055, + "step": 38384 + }, + { + "epoch": 0.38385, + "grad_norm": 0.7280428111104094, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 38385 + }, + { + "epoch": 0.38386, + "grad_norm": 0.752829965128567, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 38386 + }, + { + "epoch": 0.38387, + "grad_norm": 0.8989711947263774, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 38387 + }, + { + "epoch": 0.38388, + "grad_norm": 1.007983334883119, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 38388 + }, + { + "epoch": 0.38389, + "grad_norm": 0.9512202483764856, + "learning_rate": 0.003, + "loss": 4.01, + "step": 38389 + }, + { + "epoch": 0.3839, + "grad_norm": 0.953140243682501, + "learning_rate": 0.003, + "loss": 4.038, + "step": 38390 + }, + { + "epoch": 0.38391, + "grad_norm": 1.099275839602197, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 38391 + }, + { + "epoch": 0.38392, + "grad_norm": 0.9666098457759531, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 38392 + }, + { + "epoch": 0.38393, + "grad_norm": 0.7210155497663595, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 38393 + }, + { + "epoch": 0.38394, + "grad_norm": 0.5902911393841234, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 38394 + }, + { + "epoch": 0.38395, + "grad_norm": 0.5912869837485113, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 38395 + }, + { + "epoch": 0.38396, + "grad_norm": 0.5383328103700031, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 38396 + }, + { + "epoch": 0.38397, + "grad_norm": 0.5736136425971055, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 38397 + }, + { + "epoch": 0.38398, + "grad_norm": 0.6203003505616085, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 38398 + }, + { + "epoch": 0.38399, + "grad_norm": 0.7245257676102668, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 38399 + }, + { + "epoch": 0.384, + "grad_norm": 0.7679025904576539, + "learning_rate": 0.003, + "loss": 4.049, + "step": 38400 + }, + { + "epoch": 0.38401, + "grad_norm": 0.6959086507007487, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 38401 + }, + { + "epoch": 0.38402, + "grad_norm": 0.5261778313818017, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 38402 + }, + { + "epoch": 0.38403, + "grad_norm": 0.47984450454797856, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 38403 + }, + { + "epoch": 0.38404, + "grad_norm": 0.5518306746245791, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 38404 + }, + { + "epoch": 0.38405, + "grad_norm": 0.6400318920445696, + "learning_rate": 0.003, + "loss": 4.012, + "step": 38405 + }, + { + "epoch": 0.38406, + "grad_norm": 0.8049885108788755, + "learning_rate": 0.003, + "loss": 3.996, + "step": 38406 + }, + { + "epoch": 0.38407, + "grad_norm": 0.952720594336879, + "learning_rate": 0.003, + "loss": 3.9819, + "step": 38407 + }, + { + "epoch": 0.38408, + "grad_norm": 1.071890766935744, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 38408 + }, + { + "epoch": 0.38409, + "grad_norm": 0.9137349728508855, + "learning_rate": 0.003, + "loss": 3.994, + "step": 38409 + }, + { + "epoch": 0.3841, + "grad_norm": 0.9350431141498825, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 38410 + }, + { + "epoch": 0.38411, + "grad_norm": 1.0196316282239697, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 38411 + }, + { + "epoch": 0.38412, + "grad_norm": 0.9066864334056985, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 38412 + }, + { + "epoch": 0.38413, + "grad_norm": 1.0634781387141354, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 38413 + }, + { + "epoch": 0.38414, + "grad_norm": 0.8314247581756048, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 38414 + }, + { + "epoch": 0.38415, + "grad_norm": 0.818590029226095, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 38415 + }, + { + "epoch": 0.38416, + "grad_norm": 0.6921250150279099, + "learning_rate": 0.003, + "loss": 4.038, + "step": 38416 + }, + { + "epoch": 0.38417, + "grad_norm": 0.6492419502478116, + "learning_rate": 0.003, + "loss": 4.032, + "step": 38417 + }, + { + "epoch": 0.38418, + "grad_norm": 0.683526938377807, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 38418 + }, + { + "epoch": 0.38419, + "grad_norm": 0.693464346548832, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 38419 + }, + { + "epoch": 0.3842, + "grad_norm": 0.7495670718499495, + "learning_rate": 0.003, + "loss": 3.9999, + "step": 38420 + }, + { + "epoch": 0.38421, + "grad_norm": 0.8363713293796691, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 38421 + }, + { + "epoch": 0.38422, + "grad_norm": 1.0663975927417466, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 38422 + }, + { + "epoch": 0.38423, + "grad_norm": 1.0285955203939983, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 38423 + }, + { + "epoch": 0.38424, + "grad_norm": 0.890497384902093, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 38424 + }, + { + "epoch": 0.38425, + "grad_norm": 0.9168890524867803, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 38425 + }, + { + "epoch": 0.38426, + "grad_norm": 0.9979100163098978, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 38426 + }, + { + "epoch": 0.38427, + "grad_norm": 1.0290014126358218, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 38427 + }, + { + "epoch": 0.38428, + "grad_norm": 0.9047711893889405, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 38428 + }, + { + "epoch": 0.38429, + "grad_norm": 0.808223840677729, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 38429 + }, + { + "epoch": 0.3843, + "grad_norm": 0.804025395018948, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 38430 + }, + { + "epoch": 0.38431, + "grad_norm": 0.8796226014195162, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 38431 + }, + { + "epoch": 0.38432, + "grad_norm": 0.9315412533879778, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 38432 + }, + { + "epoch": 0.38433, + "grad_norm": 1.0981642176962814, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 38433 + }, + { + "epoch": 0.38434, + "grad_norm": 1.0609758361071964, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 38434 + }, + { + "epoch": 0.38435, + "grad_norm": 1.0212743963502444, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 38435 + }, + { + "epoch": 0.38436, + "grad_norm": 0.8459002708761771, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 38436 + }, + { + "epoch": 0.38437, + "grad_norm": 0.8463110149367146, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 38437 + }, + { + "epoch": 0.38438, + "grad_norm": 0.8440341616065391, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 38438 + }, + { + "epoch": 0.38439, + "grad_norm": 0.8391981315466991, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 38439 + }, + { + "epoch": 0.3844, + "grad_norm": 0.8641895480912863, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 38440 + }, + { + "epoch": 0.38441, + "grad_norm": 0.8615555221035336, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 38441 + }, + { + "epoch": 0.38442, + "grad_norm": 0.7933978642382148, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 38442 + }, + { + "epoch": 0.38443, + "grad_norm": 0.7242960242376147, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 38443 + }, + { + "epoch": 0.38444, + "grad_norm": 0.7114413811196463, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 38444 + }, + { + "epoch": 0.38445, + "grad_norm": 0.8156888912077056, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 38445 + }, + { + "epoch": 0.38446, + "grad_norm": 0.8425157438753897, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 38446 + }, + { + "epoch": 0.38447, + "grad_norm": 0.8130134922854695, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 38447 + }, + { + "epoch": 0.38448, + "grad_norm": 0.9389073735085602, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 38448 + }, + { + "epoch": 0.38449, + "grad_norm": 0.8927318458258628, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 38449 + }, + { + "epoch": 0.3845, + "grad_norm": 0.6964680420170354, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 38450 + }, + { + "epoch": 0.38451, + "grad_norm": 0.6809980470915487, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 38451 + }, + { + "epoch": 0.38452, + "grad_norm": 0.5852908759894189, + "learning_rate": 0.003, + "loss": 4.025, + "step": 38452 + }, + { + "epoch": 0.38453, + "grad_norm": 0.5996907020191601, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 38453 + }, + { + "epoch": 0.38454, + "grad_norm": 0.6997323472297519, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 38454 + }, + { + "epoch": 0.38455, + "grad_norm": 0.9456797849340214, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 38455 + }, + { + "epoch": 0.38456, + "grad_norm": 1.2444712396243065, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 38456 + }, + { + "epoch": 0.38457, + "grad_norm": 0.7433660675189298, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 38457 + }, + { + "epoch": 0.38458, + "grad_norm": 0.6908733734182065, + "learning_rate": 0.003, + "loss": 4.042, + "step": 38458 + }, + { + "epoch": 0.38459, + "grad_norm": 0.7296039638865555, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 38459 + }, + { + "epoch": 0.3846, + "grad_norm": 0.7412318559484918, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 38460 + }, + { + "epoch": 0.38461, + "grad_norm": 0.8125292869136224, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 38461 + }, + { + "epoch": 0.38462, + "grad_norm": 0.8721080436957482, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 38462 + }, + { + "epoch": 0.38463, + "grad_norm": 0.850480239455861, + "learning_rate": 0.003, + "loss": 4.016, + "step": 38463 + }, + { + "epoch": 0.38464, + "grad_norm": 0.6961271426127764, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 38464 + }, + { + "epoch": 0.38465, + "grad_norm": 0.6973762679367502, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 38465 + }, + { + "epoch": 0.38466, + "grad_norm": 0.6992262087097497, + "learning_rate": 0.003, + "loss": 4.011, + "step": 38466 + }, + { + "epoch": 0.38467, + "grad_norm": 0.8849768317044727, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 38467 + }, + { + "epoch": 0.38468, + "grad_norm": 1.1432984419668881, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 38468 + }, + { + "epoch": 0.38469, + "grad_norm": 0.8323537472326215, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 38469 + }, + { + "epoch": 0.3847, + "grad_norm": 0.7804119817610037, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 38470 + }, + { + "epoch": 0.38471, + "grad_norm": 0.7279496314486317, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 38471 + }, + { + "epoch": 0.38472, + "grad_norm": 0.6747909580706382, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 38472 + }, + { + "epoch": 0.38473, + "grad_norm": 0.6131330320251754, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 38473 + }, + { + "epoch": 0.38474, + "grad_norm": 0.5596362325965908, + "learning_rate": 0.003, + "loss": 4.022, + "step": 38474 + }, + { + "epoch": 0.38475, + "grad_norm": 0.5957683591895472, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 38475 + }, + { + "epoch": 0.38476, + "grad_norm": 0.5766654543419216, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 38476 + }, + { + "epoch": 0.38477, + "grad_norm": 0.5962845322750212, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 38477 + }, + { + "epoch": 0.38478, + "grad_norm": 0.665108305878101, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 38478 + }, + { + "epoch": 0.38479, + "grad_norm": 0.780025953644987, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 38479 + }, + { + "epoch": 0.3848, + "grad_norm": 0.9404923612328443, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 38480 + }, + { + "epoch": 0.38481, + "grad_norm": 1.060499237138171, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 38481 + }, + { + "epoch": 0.38482, + "grad_norm": 0.8422753838574888, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 38482 + }, + { + "epoch": 0.38483, + "grad_norm": 0.8156091768093142, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 38483 + }, + { + "epoch": 0.38484, + "grad_norm": 0.8780399787390039, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 38484 + }, + { + "epoch": 0.38485, + "grad_norm": 0.9748216424243997, + "learning_rate": 0.003, + "loss": 4.019, + "step": 38485 + }, + { + "epoch": 0.38486, + "grad_norm": 1.0747911403158215, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 38486 + }, + { + "epoch": 0.38487, + "grad_norm": 0.7707756657153543, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 38487 + }, + { + "epoch": 0.38488, + "grad_norm": 0.7386435282428715, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 38488 + }, + { + "epoch": 0.38489, + "grad_norm": 0.846188544154581, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 38489 + }, + { + "epoch": 0.3849, + "grad_norm": 0.7925212882008189, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 38490 + }, + { + "epoch": 0.38491, + "grad_norm": 0.9207690451532322, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 38491 + }, + { + "epoch": 0.38492, + "grad_norm": 1.0128494646170636, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 38492 + }, + { + "epoch": 0.38493, + "grad_norm": 1.122731444784906, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 38493 + }, + { + "epoch": 0.38494, + "grad_norm": 1.1252700659453696, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 38494 + }, + { + "epoch": 0.38495, + "grad_norm": 0.912687395553478, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 38495 + }, + { + "epoch": 0.38496, + "grad_norm": 0.8402284614757158, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 38496 + }, + { + "epoch": 0.38497, + "grad_norm": 0.8042597396578828, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 38497 + }, + { + "epoch": 0.38498, + "grad_norm": 0.9072147916600264, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 38498 + }, + { + "epoch": 0.38499, + "grad_norm": 1.0446291563011294, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 38499 + }, + { + "epoch": 0.385, + "grad_norm": 0.981594659212102, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 38500 + }, + { + "epoch": 0.38501, + "grad_norm": 1.0543773669012202, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 38501 + }, + { + "epoch": 0.38502, + "grad_norm": 0.9832699911647329, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 38502 + }, + { + "epoch": 0.38503, + "grad_norm": 0.9173750895521752, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 38503 + }, + { + "epoch": 0.38504, + "grad_norm": 0.9056368641633824, + "learning_rate": 0.003, + "loss": 4.014, + "step": 38504 + }, + { + "epoch": 0.38505, + "grad_norm": 0.9280334911376429, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 38505 + }, + { + "epoch": 0.38506, + "grad_norm": 0.8023711602745903, + "learning_rate": 0.003, + "loss": 3.9894, + "step": 38506 + }, + { + "epoch": 0.38507, + "grad_norm": 0.7303340386954768, + "learning_rate": 0.003, + "loss": 4.039, + "step": 38507 + }, + { + "epoch": 0.38508, + "grad_norm": 0.7151346443336082, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 38508 + }, + { + "epoch": 0.38509, + "grad_norm": 0.6993840933926821, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 38509 + }, + { + "epoch": 0.3851, + "grad_norm": 0.6328148637794426, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 38510 + }, + { + "epoch": 0.38511, + "grad_norm": 0.6043054766309005, + "learning_rate": 0.003, + "loss": 4.021, + "step": 38511 + }, + { + "epoch": 0.38512, + "grad_norm": 0.644644491952634, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 38512 + }, + { + "epoch": 0.38513, + "grad_norm": 0.7659096528671718, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 38513 + }, + { + "epoch": 0.38514, + "grad_norm": 0.7934667374437026, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 38514 + }, + { + "epoch": 0.38515, + "grad_norm": 0.7768003707949046, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 38515 + }, + { + "epoch": 0.38516, + "grad_norm": 0.7382483345407349, + "learning_rate": 0.003, + "loss": 4.036, + "step": 38516 + }, + { + "epoch": 0.38517, + "grad_norm": 0.6174416675227244, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 38517 + }, + { + "epoch": 0.38518, + "grad_norm": 0.65416950789888, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 38518 + }, + { + "epoch": 0.38519, + "grad_norm": 0.7849111999645018, + "learning_rate": 0.003, + "loss": 4.032, + "step": 38519 + }, + { + "epoch": 0.3852, + "grad_norm": 0.8581360126062351, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 38520 + }, + { + "epoch": 0.38521, + "grad_norm": 0.9950146161401462, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 38521 + }, + { + "epoch": 0.38522, + "grad_norm": 0.9819636029114348, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 38522 + }, + { + "epoch": 0.38523, + "grad_norm": 0.9837008477988654, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 38523 + }, + { + "epoch": 0.38524, + "grad_norm": 1.0320546754792046, + "learning_rate": 0.003, + "loss": 4.058, + "step": 38524 + }, + { + "epoch": 0.38525, + "grad_norm": 1.1691738309246351, + "learning_rate": 0.003, + "loss": 4.023, + "step": 38525 + }, + { + "epoch": 0.38526, + "grad_norm": 0.9267223278819766, + "learning_rate": 0.003, + "loss": 4.064, + "step": 38526 + }, + { + "epoch": 0.38527, + "grad_norm": 0.8810908730684174, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 38527 + }, + { + "epoch": 0.38528, + "grad_norm": 0.9377856923176395, + "learning_rate": 0.003, + "loss": 4.029, + "step": 38528 + }, + { + "epoch": 0.38529, + "grad_norm": 1.0501386702970363, + "learning_rate": 0.003, + "loss": 4.045, + "step": 38529 + }, + { + "epoch": 0.3853, + "grad_norm": 0.9394573092251713, + "learning_rate": 0.003, + "loss": 4.067, + "step": 38530 + }, + { + "epoch": 0.38531, + "grad_norm": 0.9111757387077319, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 38531 + }, + { + "epoch": 0.38532, + "grad_norm": 0.9636497271104937, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 38532 + }, + { + "epoch": 0.38533, + "grad_norm": 0.9471422730947957, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 38533 + }, + { + "epoch": 0.38534, + "grad_norm": 0.9069997661676339, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 38534 + }, + { + "epoch": 0.38535, + "grad_norm": 0.9211790508711268, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 38535 + }, + { + "epoch": 0.38536, + "grad_norm": 0.9041381463643217, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 38536 + }, + { + "epoch": 0.38537, + "grad_norm": 1.0191232477629812, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 38537 + }, + { + "epoch": 0.38538, + "grad_norm": 0.9737252817418087, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 38538 + }, + { + "epoch": 0.38539, + "grad_norm": 0.8730604094537794, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 38539 + }, + { + "epoch": 0.3854, + "grad_norm": 0.9343366075492773, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 38540 + }, + { + "epoch": 0.38541, + "grad_norm": 0.985106514620539, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 38541 + }, + { + "epoch": 0.38542, + "grad_norm": 1.1079450311537336, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 38542 + }, + { + "epoch": 0.38543, + "grad_norm": 0.9831826593969551, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 38543 + }, + { + "epoch": 0.38544, + "grad_norm": 0.915130390666594, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 38544 + }, + { + "epoch": 0.38545, + "grad_norm": 0.7478846899294815, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 38545 + }, + { + "epoch": 0.38546, + "grad_norm": 0.6849469854750732, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 38546 + }, + { + "epoch": 0.38547, + "grad_norm": 0.6447933712427969, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 38547 + }, + { + "epoch": 0.38548, + "grad_norm": 0.7133848159700936, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 38548 + }, + { + "epoch": 0.38549, + "grad_norm": 0.880872757702583, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 38549 + }, + { + "epoch": 0.3855, + "grad_norm": 0.9907721663909942, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 38550 + }, + { + "epoch": 0.38551, + "grad_norm": 1.038954879006838, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 38551 + }, + { + "epoch": 0.38552, + "grad_norm": 0.9524883048684212, + "learning_rate": 0.003, + "loss": 4.051, + "step": 38552 + }, + { + "epoch": 0.38553, + "grad_norm": 0.8062499668479594, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 38553 + }, + { + "epoch": 0.38554, + "grad_norm": 0.7520193489189723, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 38554 + }, + { + "epoch": 0.38555, + "grad_norm": 0.8378902195591581, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 38555 + }, + { + "epoch": 0.38556, + "grad_norm": 0.8333114068496017, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 38556 + }, + { + "epoch": 0.38557, + "grad_norm": 0.7303523596778515, + "learning_rate": 0.003, + "loss": 4.016, + "step": 38557 + }, + { + "epoch": 0.38558, + "grad_norm": 0.6805423929935032, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 38558 + }, + { + "epoch": 0.38559, + "grad_norm": 0.679212304893731, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 38559 + }, + { + "epoch": 0.3856, + "grad_norm": 0.6172671650131032, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 38560 + }, + { + "epoch": 0.38561, + "grad_norm": 0.5327007948954607, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 38561 + }, + { + "epoch": 0.38562, + "grad_norm": 0.4601477867521996, + "learning_rate": 0.003, + "loss": 4.034, + "step": 38562 + }, + { + "epoch": 0.38563, + "grad_norm": 0.46276518340301626, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 38563 + }, + { + "epoch": 0.38564, + "grad_norm": 0.42933829739308693, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 38564 + }, + { + "epoch": 0.38565, + "grad_norm": 0.49535804964057106, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 38565 + }, + { + "epoch": 0.38566, + "grad_norm": 0.6215418026205526, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 38566 + }, + { + "epoch": 0.38567, + "grad_norm": 0.803537645849172, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 38567 + }, + { + "epoch": 0.38568, + "grad_norm": 0.855431500320925, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 38568 + }, + { + "epoch": 0.38569, + "grad_norm": 0.916004856108913, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 38569 + }, + { + "epoch": 0.3857, + "grad_norm": 0.8243415740191272, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 38570 + }, + { + "epoch": 0.38571, + "grad_norm": 0.6313860724677136, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 38571 + }, + { + "epoch": 0.38572, + "grad_norm": 0.6000446541149322, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 38572 + }, + { + "epoch": 0.38573, + "grad_norm": 0.7072600158585183, + "learning_rate": 0.003, + "loss": 4.027, + "step": 38573 + }, + { + "epoch": 0.38574, + "grad_norm": 0.7869986546645801, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 38574 + }, + { + "epoch": 0.38575, + "grad_norm": 0.7848014644066057, + "learning_rate": 0.003, + "loss": 3.9955, + "step": 38575 + }, + { + "epoch": 0.38576, + "grad_norm": 0.7301149737260982, + "learning_rate": 0.003, + "loss": 3.9805, + "step": 38576 + }, + { + "epoch": 0.38577, + "grad_norm": 0.6670490551254721, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 38577 + }, + { + "epoch": 0.38578, + "grad_norm": 0.6664330707872401, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 38578 + }, + { + "epoch": 0.38579, + "grad_norm": 0.6383907616538848, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 38579 + }, + { + "epoch": 0.3858, + "grad_norm": 0.6827059412662635, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 38580 + }, + { + "epoch": 0.38581, + "grad_norm": 0.6441470658128243, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 38581 + }, + { + "epoch": 0.38582, + "grad_norm": 0.654062107974507, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 38582 + }, + { + "epoch": 0.38583, + "grad_norm": 0.6858413923959238, + "learning_rate": 0.003, + "loss": 4.029, + "step": 38583 + }, + { + "epoch": 0.38584, + "grad_norm": 0.7397030431837326, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 38584 + }, + { + "epoch": 0.38585, + "grad_norm": 0.7821959440159575, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 38585 + }, + { + "epoch": 0.38586, + "grad_norm": 0.9988989038410818, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 38586 + }, + { + "epoch": 0.38587, + "grad_norm": 1.0081439288855516, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 38587 + }, + { + "epoch": 0.38588, + "grad_norm": 1.0903105631876193, + "learning_rate": 0.003, + "loss": 4.0817, + "step": 38588 + }, + { + "epoch": 0.38589, + "grad_norm": 0.8788729766158484, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 38589 + }, + { + "epoch": 0.3859, + "grad_norm": 0.989697532621475, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 38590 + }, + { + "epoch": 0.38591, + "grad_norm": 1.0392007619608292, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 38591 + }, + { + "epoch": 0.38592, + "grad_norm": 1.0063734095320154, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 38592 + }, + { + "epoch": 0.38593, + "grad_norm": 1.0204800831414291, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 38593 + }, + { + "epoch": 0.38594, + "grad_norm": 1.0340613460345005, + "learning_rate": 0.003, + "loss": 4.0858, + "step": 38594 + }, + { + "epoch": 0.38595, + "grad_norm": 1.2039954277090155, + "learning_rate": 0.003, + "loss": 4.0809, + "step": 38595 + }, + { + "epoch": 0.38596, + "grad_norm": 0.8635947055646812, + "learning_rate": 0.003, + "loss": 4.044, + "step": 38596 + }, + { + "epoch": 0.38597, + "grad_norm": 0.9679734456866925, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 38597 + }, + { + "epoch": 0.38598, + "grad_norm": 1.041590079965243, + "learning_rate": 0.003, + "loss": 4.055, + "step": 38598 + }, + { + "epoch": 0.38599, + "grad_norm": 1.128442892435875, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 38599 + }, + { + "epoch": 0.386, + "grad_norm": 1.0133536056370909, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 38600 + }, + { + "epoch": 0.38601, + "grad_norm": 0.8768136172246547, + "learning_rate": 0.003, + "loss": 4.0709, + "step": 38601 + }, + { + "epoch": 0.38602, + "grad_norm": 0.7568408740242858, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 38602 + }, + { + "epoch": 0.38603, + "grad_norm": 0.7888085959585408, + "learning_rate": 0.003, + "loss": 4.046, + "step": 38603 + }, + { + "epoch": 0.38604, + "grad_norm": 0.8497415981028341, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 38604 + }, + { + "epoch": 0.38605, + "grad_norm": 0.8576661494187056, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 38605 + }, + { + "epoch": 0.38606, + "grad_norm": 0.7284325945407247, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 38606 + }, + { + "epoch": 0.38607, + "grad_norm": 0.7386277984066054, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 38607 + }, + { + "epoch": 0.38608, + "grad_norm": 0.8769383092974253, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 38608 + }, + { + "epoch": 0.38609, + "grad_norm": 0.9320086130106819, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 38609 + }, + { + "epoch": 0.3861, + "grad_norm": 0.8969472717046812, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 38610 + }, + { + "epoch": 0.38611, + "grad_norm": 0.7714627641662256, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 38611 + }, + { + "epoch": 0.38612, + "grad_norm": 0.5968833081154682, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 38612 + }, + { + "epoch": 0.38613, + "grad_norm": 0.5559804732117781, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 38613 + }, + { + "epoch": 0.38614, + "grad_norm": 0.48475097506749487, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 38614 + }, + { + "epoch": 0.38615, + "grad_norm": 0.5040824111285533, + "learning_rate": 0.003, + "loss": 4.008, + "step": 38615 + }, + { + "epoch": 0.38616, + "grad_norm": 0.6605613005382841, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 38616 + }, + { + "epoch": 0.38617, + "grad_norm": 0.8405739507862828, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 38617 + }, + { + "epoch": 0.38618, + "grad_norm": 1.085795840309311, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 38618 + }, + { + "epoch": 0.38619, + "grad_norm": 0.8565069031539776, + "learning_rate": 0.003, + "loss": 3.9922, + "step": 38619 + }, + { + "epoch": 0.3862, + "grad_norm": 0.5946395035238811, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 38620 + }, + { + "epoch": 0.38621, + "grad_norm": 0.6646943465973881, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 38621 + }, + { + "epoch": 0.38622, + "grad_norm": 0.8435727159083463, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 38622 + }, + { + "epoch": 0.38623, + "grad_norm": 0.9576906961172382, + "learning_rate": 0.003, + "loss": 4.021, + "step": 38623 + }, + { + "epoch": 0.38624, + "grad_norm": 0.9822334397616271, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 38624 + }, + { + "epoch": 0.38625, + "grad_norm": 0.9357439128399935, + "learning_rate": 0.003, + "loss": 3.994, + "step": 38625 + }, + { + "epoch": 0.38626, + "grad_norm": 0.7680817526942815, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 38626 + }, + { + "epoch": 0.38627, + "grad_norm": 0.6901555889186552, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 38627 + }, + { + "epoch": 0.38628, + "grad_norm": 0.6897430662339139, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 38628 + }, + { + "epoch": 0.38629, + "grad_norm": 0.71294421253421, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 38629 + }, + { + "epoch": 0.3863, + "grad_norm": 0.7164710616373653, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 38630 + }, + { + "epoch": 0.38631, + "grad_norm": 0.676173944793977, + "learning_rate": 0.003, + "loss": 4.0024, + "step": 38631 + }, + { + "epoch": 0.38632, + "grad_norm": 0.6751742330934711, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 38632 + }, + { + "epoch": 0.38633, + "grad_norm": 0.6097423536411619, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 38633 + }, + { + "epoch": 0.38634, + "grad_norm": 0.8005911881622565, + "learning_rate": 0.003, + "loss": 3.9996, + "step": 38634 + }, + { + "epoch": 0.38635, + "grad_norm": 0.9266247786409381, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 38635 + }, + { + "epoch": 0.38636, + "grad_norm": 0.9676039465138799, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 38636 + }, + { + "epoch": 0.38637, + "grad_norm": 0.9178563646770437, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 38637 + }, + { + "epoch": 0.38638, + "grad_norm": 0.7620482266158438, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 38638 + }, + { + "epoch": 0.38639, + "grad_norm": 0.7443175923286365, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 38639 + }, + { + "epoch": 0.3864, + "grad_norm": 0.7926528812055846, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 38640 + }, + { + "epoch": 0.38641, + "grad_norm": 0.8341867239924328, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 38641 + }, + { + "epoch": 0.38642, + "grad_norm": 0.9171414636213439, + "learning_rate": 0.003, + "loss": 4.025, + "step": 38642 + }, + { + "epoch": 0.38643, + "grad_norm": 1.116630815597077, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 38643 + }, + { + "epoch": 0.38644, + "grad_norm": 1.0231364341661102, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 38644 + }, + { + "epoch": 0.38645, + "grad_norm": 1.059827435474559, + "learning_rate": 0.003, + "loss": 4.057, + "step": 38645 + }, + { + "epoch": 0.38646, + "grad_norm": 0.9020721674139157, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 38646 + }, + { + "epoch": 0.38647, + "grad_norm": 0.9312006857210884, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 38647 + }, + { + "epoch": 0.38648, + "grad_norm": 0.8328658087644785, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 38648 + }, + { + "epoch": 0.38649, + "grad_norm": 0.6627360996556314, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 38649 + }, + { + "epoch": 0.3865, + "grad_norm": 0.6142359485438406, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 38650 + }, + { + "epoch": 0.38651, + "grad_norm": 0.5974215262914429, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 38651 + }, + { + "epoch": 0.38652, + "grad_norm": 0.626268590989505, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 38652 + }, + { + "epoch": 0.38653, + "grad_norm": 0.6860822789654675, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 38653 + }, + { + "epoch": 0.38654, + "grad_norm": 0.7904239279141149, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 38654 + }, + { + "epoch": 0.38655, + "grad_norm": 0.8578101598921916, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 38655 + }, + { + "epoch": 0.38656, + "grad_norm": 0.9666071674897531, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 38656 + }, + { + "epoch": 0.38657, + "grad_norm": 1.0624244943566805, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 38657 + }, + { + "epoch": 0.38658, + "grad_norm": 0.9411787581209088, + "learning_rate": 0.003, + "loss": 4.031, + "step": 38658 + }, + { + "epoch": 0.38659, + "grad_norm": 0.9191807534128373, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 38659 + }, + { + "epoch": 0.3866, + "grad_norm": 1.020684117915154, + "learning_rate": 0.003, + "loss": 4.032, + "step": 38660 + }, + { + "epoch": 0.38661, + "grad_norm": 1.1902991988454863, + "learning_rate": 0.003, + "loss": 4.048, + "step": 38661 + }, + { + "epoch": 0.38662, + "grad_norm": 0.883604649867097, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 38662 + }, + { + "epoch": 0.38663, + "grad_norm": 0.7252787339536347, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 38663 + }, + { + "epoch": 0.38664, + "grad_norm": 0.6635936561197294, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 38664 + }, + { + "epoch": 0.38665, + "grad_norm": 0.6618758826228899, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 38665 + }, + { + "epoch": 0.38666, + "grad_norm": 0.6604945573253588, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 38666 + }, + { + "epoch": 0.38667, + "grad_norm": 0.5974486968769233, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 38667 + }, + { + "epoch": 0.38668, + "grad_norm": 0.5740677034883688, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 38668 + }, + { + "epoch": 0.38669, + "grad_norm": 0.602493796135321, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 38669 + }, + { + "epoch": 0.3867, + "grad_norm": 0.5905824355572905, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 38670 + }, + { + "epoch": 0.38671, + "grad_norm": 0.5846383275610086, + "learning_rate": 0.003, + "loss": 4.039, + "step": 38671 + }, + { + "epoch": 0.38672, + "grad_norm": 0.7101028041911418, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 38672 + }, + { + "epoch": 0.38673, + "grad_norm": 0.9063670902473244, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 38673 + }, + { + "epoch": 0.38674, + "grad_norm": 1.2588688254148332, + "learning_rate": 0.003, + "loss": 4.019, + "step": 38674 + }, + { + "epoch": 0.38675, + "grad_norm": 0.7943364494312544, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 38675 + }, + { + "epoch": 0.38676, + "grad_norm": 0.7071334783423584, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 38676 + }, + { + "epoch": 0.38677, + "grad_norm": 0.703276444140072, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 38677 + }, + { + "epoch": 0.38678, + "grad_norm": 0.7089157303679449, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 38678 + }, + { + "epoch": 0.38679, + "grad_norm": 0.798234032186939, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 38679 + }, + { + "epoch": 0.3868, + "grad_norm": 0.8910456977821848, + "learning_rate": 0.003, + "loss": 3.9919, + "step": 38680 + }, + { + "epoch": 0.38681, + "grad_norm": 0.8902901762285752, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 38681 + }, + { + "epoch": 0.38682, + "grad_norm": 1.0403622244733046, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 38682 + }, + { + "epoch": 0.38683, + "grad_norm": 1.0446148269147293, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 38683 + }, + { + "epoch": 0.38684, + "grad_norm": 0.9660005118162569, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 38684 + }, + { + "epoch": 0.38685, + "grad_norm": 0.9791677110510582, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 38685 + }, + { + "epoch": 0.38686, + "grad_norm": 0.9452109041289011, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 38686 + }, + { + "epoch": 0.38687, + "grad_norm": 0.878099441362969, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 38687 + }, + { + "epoch": 0.38688, + "grad_norm": 0.8811680120507416, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 38688 + }, + { + "epoch": 0.38689, + "grad_norm": 0.9401921457323398, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 38689 + }, + { + "epoch": 0.3869, + "grad_norm": 1.1415800120142867, + "learning_rate": 0.003, + "loss": 4.0701, + "step": 38690 + }, + { + "epoch": 0.38691, + "grad_norm": 1.132269517104631, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 38691 + }, + { + "epoch": 0.38692, + "grad_norm": 1.0466187803950728, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 38692 + }, + { + "epoch": 0.38693, + "grad_norm": 0.8814529042948382, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 38693 + }, + { + "epoch": 0.38694, + "grad_norm": 0.8119988904640713, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 38694 + }, + { + "epoch": 0.38695, + "grad_norm": 0.7426369382551237, + "learning_rate": 0.003, + "loss": 4.042, + "step": 38695 + }, + { + "epoch": 0.38696, + "grad_norm": 0.7561640507336973, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 38696 + }, + { + "epoch": 0.38697, + "grad_norm": 0.7968428768832325, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 38697 + }, + { + "epoch": 0.38698, + "grad_norm": 0.8039993244939544, + "learning_rate": 0.003, + "loss": 4.033, + "step": 38698 + }, + { + "epoch": 0.38699, + "grad_norm": 0.8594366718250979, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 38699 + }, + { + "epoch": 0.387, + "grad_norm": 0.918160780760975, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 38700 + }, + { + "epoch": 0.38701, + "grad_norm": 0.9063897687497507, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 38701 + }, + { + "epoch": 0.38702, + "grad_norm": 0.8384097455092195, + "learning_rate": 0.003, + "loss": 3.9918, + "step": 38702 + }, + { + "epoch": 0.38703, + "grad_norm": 0.7078344366962823, + "learning_rate": 0.003, + "loss": 3.986, + "step": 38703 + }, + { + "epoch": 0.38704, + "grad_norm": 0.6193479255676454, + "learning_rate": 0.003, + "loss": 4.04, + "step": 38704 + }, + { + "epoch": 0.38705, + "grad_norm": 0.6322065306467397, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 38705 + }, + { + "epoch": 0.38706, + "grad_norm": 0.6072280018893921, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 38706 + }, + { + "epoch": 0.38707, + "grad_norm": 0.7183864756807341, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 38707 + }, + { + "epoch": 0.38708, + "grad_norm": 0.8281254764078789, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 38708 + }, + { + "epoch": 0.38709, + "grad_norm": 0.9291654605270411, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 38709 + }, + { + "epoch": 0.3871, + "grad_norm": 0.858887250305021, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 38710 + }, + { + "epoch": 0.38711, + "grad_norm": 0.9135793048310575, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 38711 + }, + { + "epoch": 0.38712, + "grad_norm": 0.9456705507545765, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 38712 + }, + { + "epoch": 0.38713, + "grad_norm": 0.6806295698282154, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 38713 + }, + { + "epoch": 0.38714, + "grad_norm": 0.7021894269772626, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 38714 + }, + { + "epoch": 0.38715, + "grad_norm": 0.794091015055748, + "learning_rate": 0.003, + "loss": 4.037, + "step": 38715 + }, + { + "epoch": 0.38716, + "grad_norm": 0.9055393935315006, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 38716 + }, + { + "epoch": 0.38717, + "grad_norm": 0.8825773643187306, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 38717 + }, + { + "epoch": 0.38718, + "grad_norm": 0.8116339879007304, + "learning_rate": 0.003, + "loss": 4.026, + "step": 38718 + }, + { + "epoch": 0.38719, + "grad_norm": 0.793459588121486, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 38719 + }, + { + "epoch": 0.3872, + "grad_norm": 0.8643817361700938, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 38720 + }, + { + "epoch": 0.38721, + "grad_norm": 1.0309354062320704, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 38721 + }, + { + "epoch": 0.38722, + "grad_norm": 1.1234035590489593, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 38722 + }, + { + "epoch": 0.38723, + "grad_norm": 0.9773726581867122, + "learning_rate": 0.003, + "loss": 4.049, + "step": 38723 + }, + { + "epoch": 0.38724, + "grad_norm": 1.149039293331859, + "learning_rate": 0.003, + "loss": 4.05, + "step": 38724 + }, + { + "epoch": 0.38725, + "grad_norm": 1.0932523720976883, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 38725 + }, + { + "epoch": 0.38726, + "grad_norm": 0.8469911898728546, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 38726 + }, + { + "epoch": 0.38727, + "grad_norm": 0.673524848718887, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 38727 + }, + { + "epoch": 0.38728, + "grad_norm": 0.7008049423107547, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 38728 + }, + { + "epoch": 0.38729, + "grad_norm": 0.7179163385286665, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 38729 + }, + { + "epoch": 0.3873, + "grad_norm": 0.7294010635346988, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 38730 + }, + { + "epoch": 0.38731, + "grad_norm": 0.7925765083319943, + "learning_rate": 0.003, + "loss": 4.067, + "step": 38731 + }, + { + "epoch": 0.38732, + "grad_norm": 0.9281526025387395, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 38732 + }, + { + "epoch": 0.38733, + "grad_norm": 0.9072048111997109, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 38733 + }, + { + "epoch": 0.38734, + "grad_norm": 0.9086541399989644, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 38734 + }, + { + "epoch": 0.38735, + "grad_norm": 0.9924437987521345, + "learning_rate": 0.003, + "loss": 4.0726, + "step": 38735 + }, + { + "epoch": 0.38736, + "grad_norm": 1.0144834594354653, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 38736 + }, + { + "epoch": 0.38737, + "grad_norm": 0.8938145776637586, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 38737 + }, + { + "epoch": 0.38738, + "grad_norm": 0.780511799277081, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 38738 + }, + { + "epoch": 0.38739, + "grad_norm": 0.8906539233371042, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 38739 + }, + { + "epoch": 0.3874, + "grad_norm": 0.8728304040067413, + "learning_rate": 0.003, + "loss": 4.046, + "step": 38740 + }, + { + "epoch": 0.38741, + "grad_norm": 0.7665215194650611, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 38741 + }, + { + "epoch": 0.38742, + "grad_norm": 0.8240599986424338, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 38742 + }, + { + "epoch": 0.38743, + "grad_norm": 1.0972338346880208, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 38743 + }, + { + "epoch": 0.38744, + "grad_norm": 1.1596842934700404, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 38744 + }, + { + "epoch": 0.38745, + "grad_norm": 0.6520403991367214, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 38745 + }, + { + "epoch": 0.38746, + "grad_norm": 0.5916890180179258, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 38746 + }, + { + "epoch": 0.38747, + "grad_norm": 0.6005513701064658, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 38747 + }, + { + "epoch": 0.38748, + "grad_norm": 0.650083832229813, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 38748 + }, + { + "epoch": 0.38749, + "grad_norm": 0.7885210572645265, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 38749 + }, + { + "epoch": 0.3875, + "grad_norm": 0.8859007569599856, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 38750 + }, + { + "epoch": 0.38751, + "grad_norm": 0.9698415664894792, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 38751 + }, + { + "epoch": 0.38752, + "grad_norm": 0.9085220678643049, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 38752 + }, + { + "epoch": 0.38753, + "grad_norm": 0.7921367681770242, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 38753 + }, + { + "epoch": 0.38754, + "grad_norm": 0.7645735911250109, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 38754 + }, + { + "epoch": 0.38755, + "grad_norm": 0.7155507171579202, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 38755 + }, + { + "epoch": 0.38756, + "grad_norm": 0.6957848612839949, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 38756 + }, + { + "epoch": 0.38757, + "grad_norm": 0.767819916395646, + "learning_rate": 0.003, + "loss": 4.055, + "step": 38757 + }, + { + "epoch": 0.38758, + "grad_norm": 0.8932461059539477, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 38758 + }, + { + "epoch": 0.38759, + "grad_norm": 1.0277355528457635, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 38759 + }, + { + "epoch": 0.3876, + "grad_norm": 0.993802051369648, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 38760 + }, + { + "epoch": 0.38761, + "grad_norm": 1.0026377950056506, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 38761 + }, + { + "epoch": 0.38762, + "grad_norm": 1.017261655531139, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 38762 + }, + { + "epoch": 0.38763, + "grad_norm": 0.918018034087445, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 38763 + }, + { + "epoch": 0.38764, + "grad_norm": 0.7235253341910148, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 38764 + }, + { + "epoch": 0.38765, + "grad_norm": 0.7164155255062978, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 38765 + }, + { + "epoch": 0.38766, + "grad_norm": 0.6992737830509747, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 38766 + }, + { + "epoch": 0.38767, + "grad_norm": 0.6872148823969672, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 38767 + }, + { + "epoch": 0.38768, + "grad_norm": 0.6658166765457662, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 38768 + }, + { + "epoch": 0.38769, + "grad_norm": 0.6615952069715874, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 38769 + }, + { + "epoch": 0.3877, + "grad_norm": 0.6700575417251037, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 38770 + }, + { + "epoch": 0.38771, + "grad_norm": 0.6185448899391608, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 38771 + }, + { + "epoch": 0.38772, + "grad_norm": 0.6955647764326439, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 38772 + }, + { + "epoch": 0.38773, + "grad_norm": 0.8037109300428518, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 38773 + }, + { + "epoch": 0.38774, + "grad_norm": 0.8400210108486711, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 38774 + }, + { + "epoch": 0.38775, + "grad_norm": 0.8830139015215968, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 38775 + }, + { + "epoch": 0.38776, + "grad_norm": 1.0617028933183017, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 38776 + }, + { + "epoch": 0.38777, + "grad_norm": 0.9944172466111262, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 38777 + }, + { + "epoch": 0.38778, + "grad_norm": 1.0572814690666266, + "learning_rate": 0.003, + "loss": 4.038, + "step": 38778 + }, + { + "epoch": 0.38779, + "grad_norm": 0.8241808007587661, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 38779 + }, + { + "epoch": 0.3878, + "grad_norm": 0.7342213805548473, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 38780 + }, + { + "epoch": 0.38781, + "grad_norm": 0.7474044670786602, + "learning_rate": 0.003, + "loss": 4.0016, + "step": 38781 + }, + { + "epoch": 0.38782, + "grad_norm": 0.6809445088618278, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 38782 + }, + { + "epoch": 0.38783, + "grad_norm": 0.7090891364290571, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 38783 + }, + { + "epoch": 0.38784, + "grad_norm": 0.7325057759706495, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 38784 + }, + { + "epoch": 0.38785, + "grad_norm": 0.8082541791198126, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 38785 + }, + { + "epoch": 0.38786, + "grad_norm": 0.8464188429113675, + "learning_rate": 0.003, + "loss": 4.059, + "step": 38786 + }, + { + "epoch": 0.38787, + "grad_norm": 0.8953173969151452, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 38787 + }, + { + "epoch": 0.38788, + "grad_norm": 0.886954155279283, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 38788 + }, + { + "epoch": 0.38789, + "grad_norm": 0.8255688276365657, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 38789 + }, + { + "epoch": 0.3879, + "grad_norm": 0.8056608998934448, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 38790 + }, + { + "epoch": 0.38791, + "grad_norm": 0.8013958385401401, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 38791 + }, + { + "epoch": 0.38792, + "grad_norm": 0.9810190238176275, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 38792 + }, + { + "epoch": 0.38793, + "grad_norm": 1.0873647471097694, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 38793 + }, + { + "epoch": 0.38794, + "grad_norm": 0.8931998537049883, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 38794 + }, + { + "epoch": 0.38795, + "grad_norm": 0.93802389537284, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 38795 + }, + { + "epoch": 0.38796, + "grad_norm": 0.9110414726844671, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 38796 + }, + { + "epoch": 0.38797, + "grad_norm": 0.9764211319199182, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 38797 + }, + { + "epoch": 0.38798, + "grad_norm": 1.1236833397140664, + "learning_rate": 0.003, + "loss": 4.045, + "step": 38798 + }, + { + "epoch": 0.38799, + "grad_norm": 0.8723820896668953, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 38799 + }, + { + "epoch": 0.388, + "grad_norm": 0.7445083523969214, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 38800 + }, + { + "epoch": 0.38801, + "grad_norm": 0.7826517245682794, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 38801 + }, + { + "epoch": 0.38802, + "grad_norm": 0.929228264484275, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 38802 + }, + { + "epoch": 0.38803, + "grad_norm": 0.9745090952910564, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 38803 + }, + { + "epoch": 0.38804, + "grad_norm": 1.092029127447523, + "learning_rate": 0.003, + "loss": 4.087, + "step": 38804 + }, + { + "epoch": 0.38805, + "grad_norm": 0.884209091166197, + "learning_rate": 0.003, + "loss": 3.9998, + "step": 38805 + }, + { + "epoch": 0.38806, + "grad_norm": 0.7741202144009707, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 38806 + }, + { + "epoch": 0.38807, + "grad_norm": 0.7424979978936036, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 38807 + }, + { + "epoch": 0.38808, + "grad_norm": 0.7984054031112232, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 38808 + }, + { + "epoch": 0.38809, + "grad_norm": 0.6973851915821029, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 38809 + }, + { + "epoch": 0.3881, + "grad_norm": 0.6917795641875923, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 38810 + }, + { + "epoch": 0.38811, + "grad_norm": 0.7131589352671903, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 38811 + }, + { + "epoch": 0.38812, + "grad_norm": 0.6881773470686708, + "learning_rate": 0.003, + "loss": 3.9989, + "step": 38812 + }, + { + "epoch": 0.38813, + "grad_norm": 0.7007229136584241, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 38813 + }, + { + "epoch": 0.38814, + "grad_norm": 0.726106449274915, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 38814 + }, + { + "epoch": 0.38815, + "grad_norm": 0.7140240922045591, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 38815 + }, + { + "epoch": 0.38816, + "grad_norm": 0.7393519881804133, + "learning_rate": 0.003, + "loss": 4.061, + "step": 38816 + }, + { + "epoch": 0.38817, + "grad_norm": 0.6615490982869946, + "learning_rate": 0.003, + "loss": 4.036, + "step": 38817 + }, + { + "epoch": 0.38818, + "grad_norm": 0.5288050519555644, + "learning_rate": 0.003, + "loss": 4.03, + "step": 38818 + }, + { + "epoch": 0.38819, + "grad_norm": 0.5885860754321083, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 38819 + }, + { + "epoch": 0.3882, + "grad_norm": 0.5895713011476201, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 38820 + }, + { + "epoch": 0.38821, + "grad_norm": 0.7413880465808881, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 38821 + }, + { + "epoch": 0.38822, + "grad_norm": 0.9750506325545932, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 38822 + }, + { + "epoch": 0.38823, + "grad_norm": 1.2139161479808454, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 38823 + }, + { + "epoch": 0.38824, + "grad_norm": 0.7886819348720684, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 38824 + }, + { + "epoch": 0.38825, + "grad_norm": 0.6676736765456752, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 38825 + }, + { + "epoch": 0.38826, + "grad_norm": 0.7054472415580418, + "learning_rate": 0.003, + "loss": 4.015, + "step": 38826 + }, + { + "epoch": 0.38827, + "grad_norm": 0.7387763147265617, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 38827 + }, + { + "epoch": 0.38828, + "grad_norm": 0.7105443679175205, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 38828 + }, + { + "epoch": 0.38829, + "grad_norm": 0.7394311311988541, + "learning_rate": 0.003, + "loss": 4.015, + "step": 38829 + }, + { + "epoch": 0.3883, + "grad_norm": 0.7424867031308011, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 38830 + }, + { + "epoch": 0.38831, + "grad_norm": 0.7184114973174371, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 38831 + }, + { + "epoch": 0.38832, + "grad_norm": 0.762020651166056, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 38832 + }, + { + "epoch": 0.38833, + "grad_norm": 0.8794983976672027, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 38833 + }, + { + "epoch": 0.38834, + "grad_norm": 1.031588275116529, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 38834 + }, + { + "epoch": 0.38835, + "grad_norm": 0.9554156370488993, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 38835 + }, + { + "epoch": 0.38836, + "grad_norm": 1.0846645292378072, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 38836 + }, + { + "epoch": 0.38837, + "grad_norm": 1.1689614783236506, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 38837 + }, + { + "epoch": 0.38838, + "grad_norm": 0.8904727087312934, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 38838 + }, + { + "epoch": 0.38839, + "grad_norm": 0.8405078702142063, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 38839 + }, + { + "epoch": 0.3884, + "grad_norm": 0.8879475123678611, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 38840 + }, + { + "epoch": 0.38841, + "grad_norm": 0.9650349603938438, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 38841 + }, + { + "epoch": 0.38842, + "grad_norm": 0.7704356716856862, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 38842 + }, + { + "epoch": 0.38843, + "grad_norm": 0.7129145091834721, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 38843 + }, + { + "epoch": 0.38844, + "grad_norm": 0.739876632236984, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 38844 + }, + { + "epoch": 0.38845, + "grad_norm": 0.8708493429239457, + "learning_rate": 0.003, + "loss": 4.0039, + "step": 38845 + }, + { + "epoch": 0.38846, + "grad_norm": 0.9991632658663147, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 38846 + }, + { + "epoch": 0.38847, + "grad_norm": 0.9990111853861054, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 38847 + }, + { + "epoch": 0.38848, + "grad_norm": 0.9803878604047449, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 38848 + }, + { + "epoch": 0.38849, + "grad_norm": 0.9494346769403634, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 38849 + }, + { + "epoch": 0.3885, + "grad_norm": 0.8841432863629712, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 38850 + }, + { + "epoch": 0.38851, + "grad_norm": 0.8403736930833733, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 38851 + }, + { + "epoch": 0.38852, + "grad_norm": 0.9143557473228322, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 38852 + }, + { + "epoch": 0.38853, + "grad_norm": 1.0461919143837948, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 38853 + }, + { + "epoch": 0.38854, + "grad_norm": 0.9405096887967528, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 38854 + }, + { + "epoch": 0.38855, + "grad_norm": 0.7643452593824263, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 38855 + }, + { + "epoch": 0.38856, + "grad_norm": 0.5889429680071775, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 38856 + }, + { + "epoch": 0.38857, + "grad_norm": 0.5366453404239271, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 38857 + }, + { + "epoch": 0.38858, + "grad_norm": 0.4921191610104327, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 38858 + }, + { + "epoch": 0.38859, + "grad_norm": 0.5468116503383323, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 38859 + }, + { + "epoch": 0.3886, + "grad_norm": 0.5392675336786404, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 38860 + }, + { + "epoch": 0.38861, + "grad_norm": 0.5886073501150086, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 38861 + }, + { + "epoch": 0.38862, + "grad_norm": 0.7201623230633344, + "learning_rate": 0.003, + "loss": 4.042, + "step": 38862 + }, + { + "epoch": 0.38863, + "grad_norm": 1.0461346376011476, + "learning_rate": 0.003, + "loss": 4.017, + "step": 38863 + }, + { + "epoch": 0.38864, + "grad_norm": 1.3443418046964795, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 38864 + }, + { + "epoch": 0.38865, + "grad_norm": 0.6860456503273306, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 38865 + }, + { + "epoch": 0.38866, + "grad_norm": 0.6560592132844741, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 38866 + }, + { + "epoch": 0.38867, + "grad_norm": 0.7614212054102282, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 38867 + }, + { + "epoch": 0.38868, + "grad_norm": 0.7376117341288431, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 38868 + }, + { + "epoch": 0.38869, + "grad_norm": 0.7021641408768662, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 38869 + }, + { + "epoch": 0.3887, + "grad_norm": 0.7658816181622817, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 38870 + }, + { + "epoch": 0.38871, + "grad_norm": 0.7467105727457862, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 38871 + }, + { + "epoch": 0.38872, + "grad_norm": 0.802382408578719, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 38872 + }, + { + "epoch": 0.38873, + "grad_norm": 0.8437441938474158, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 38873 + }, + { + "epoch": 0.38874, + "grad_norm": 0.866562250077537, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 38874 + }, + { + "epoch": 0.38875, + "grad_norm": 0.8896982236307648, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 38875 + }, + { + "epoch": 0.38876, + "grad_norm": 0.9245164156154686, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 38876 + }, + { + "epoch": 0.38877, + "grad_norm": 1.0055188720577737, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 38877 + }, + { + "epoch": 0.38878, + "grad_norm": 1.043229347542332, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 38878 + }, + { + "epoch": 0.38879, + "grad_norm": 0.8735707042919002, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 38879 + }, + { + "epoch": 0.3888, + "grad_norm": 0.8272029474796789, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 38880 + }, + { + "epoch": 0.38881, + "grad_norm": 0.791893251752632, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 38881 + }, + { + "epoch": 0.38882, + "grad_norm": 0.8552987529191952, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 38882 + }, + { + "epoch": 0.38883, + "grad_norm": 0.8831763322828671, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 38883 + }, + { + "epoch": 0.38884, + "grad_norm": 0.8171166204769043, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 38884 + }, + { + "epoch": 0.38885, + "grad_norm": 0.7823027177030938, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 38885 + }, + { + "epoch": 0.38886, + "grad_norm": 0.8360816139240257, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 38886 + }, + { + "epoch": 0.38887, + "grad_norm": 0.8141543009585094, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 38887 + }, + { + "epoch": 0.38888, + "grad_norm": 0.7528923574591511, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 38888 + }, + { + "epoch": 0.38889, + "grad_norm": 0.7442194474515561, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 38889 + }, + { + "epoch": 0.3889, + "grad_norm": 0.72136939209497, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 38890 + }, + { + "epoch": 0.38891, + "grad_norm": 0.8438400297688057, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 38891 + }, + { + "epoch": 0.38892, + "grad_norm": 0.9191798917574244, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 38892 + }, + { + "epoch": 0.38893, + "grad_norm": 0.9613853062179062, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 38893 + }, + { + "epoch": 0.38894, + "grad_norm": 1.0089860173309275, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 38894 + }, + { + "epoch": 0.38895, + "grad_norm": 1.0570936771475727, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 38895 + }, + { + "epoch": 0.38896, + "grad_norm": 1.0552139853518523, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 38896 + }, + { + "epoch": 0.38897, + "grad_norm": 0.9455680386364133, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 38897 + }, + { + "epoch": 0.38898, + "grad_norm": 0.8492668244543198, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 38898 + }, + { + "epoch": 0.38899, + "grad_norm": 0.7524702029028567, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 38899 + }, + { + "epoch": 0.389, + "grad_norm": 0.8120458198729998, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 38900 + }, + { + "epoch": 0.38901, + "grad_norm": 0.7947706074835125, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 38901 + }, + { + "epoch": 0.38902, + "grad_norm": 0.7933560573426388, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 38902 + }, + { + "epoch": 0.38903, + "grad_norm": 0.8496379156724773, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 38903 + }, + { + "epoch": 0.38904, + "grad_norm": 0.8774219795195742, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 38904 + }, + { + "epoch": 0.38905, + "grad_norm": 0.9664971768488186, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 38905 + }, + { + "epoch": 0.38906, + "grad_norm": 1.0899555755492965, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 38906 + }, + { + "epoch": 0.38907, + "grad_norm": 0.849017529672495, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 38907 + }, + { + "epoch": 0.38908, + "grad_norm": 0.7199400527412286, + "learning_rate": 0.003, + "loss": 4.006, + "step": 38908 + }, + { + "epoch": 0.38909, + "grad_norm": 0.67924964328141, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 38909 + }, + { + "epoch": 0.3891, + "grad_norm": 0.6940250997500222, + "learning_rate": 0.003, + "loss": 4.04, + "step": 38910 + }, + { + "epoch": 0.38911, + "grad_norm": 0.5559738020524451, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 38911 + }, + { + "epoch": 0.38912, + "grad_norm": 0.5921073519425198, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 38912 + }, + { + "epoch": 0.38913, + "grad_norm": 0.6230562356755613, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 38913 + }, + { + "epoch": 0.38914, + "grad_norm": 0.6895741538928168, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 38914 + }, + { + "epoch": 0.38915, + "grad_norm": 0.7860879551141758, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 38915 + }, + { + "epoch": 0.38916, + "grad_norm": 0.8682962825595825, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 38916 + }, + { + "epoch": 0.38917, + "grad_norm": 0.975198671473647, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 38917 + }, + { + "epoch": 0.38918, + "grad_norm": 1.0732940007613685, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 38918 + }, + { + "epoch": 0.38919, + "grad_norm": 0.8411871383405861, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 38919 + }, + { + "epoch": 0.3892, + "grad_norm": 0.8313365817412937, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 38920 + }, + { + "epoch": 0.38921, + "grad_norm": 0.8827515473172604, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 38921 + }, + { + "epoch": 0.38922, + "grad_norm": 0.9596854211226428, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 38922 + }, + { + "epoch": 0.38923, + "grad_norm": 0.9464803285360043, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 38923 + }, + { + "epoch": 0.38924, + "grad_norm": 0.8522489742292542, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 38924 + }, + { + "epoch": 0.38925, + "grad_norm": 0.840231550887451, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 38925 + }, + { + "epoch": 0.38926, + "grad_norm": 1.0388275617542007, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 38926 + }, + { + "epoch": 0.38927, + "grad_norm": 1.0690710640658656, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 38927 + }, + { + "epoch": 0.38928, + "grad_norm": 0.8637132525634666, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 38928 + }, + { + "epoch": 0.38929, + "grad_norm": 0.8171513271154603, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 38929 + }, + { + "epoch": 0.3893, + "grad_norm": 0.9014062239064397, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 38930 + }, + { + "epoch": 0.38931, + "grad_norm": 0.8144631533830814, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 38931 + }, + { + "epoch": 0.38932, + "grad_norm": 0.8174336872743337, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 38932 + }, + { + "epoch": 0.38933, + "grad_norm": 0.7795168997780574, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 38933 + }, + { + "epoch": 0.38934, + "grad_norm": 0.9281143058797, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 38934 + }, + { + "epoch": 0.38935, + "grad_norm": 0.8957178213201187, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 38935 + }, + { + "epoch": 0.38936, + "grad_norm": 0.9164262158645974, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 38936 + }, + { + "epoch": 0.38937, + "grad_norm": 1.0291050643485076, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 38937 + }, + { + "epoch": 0.38938, + "grad_norm": 1.0957936607863676, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 38938 + }, + { + "epoch": 0.38939, + "grad_norm": 1.098660513747761, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 38939 + }, + { + "epoch": 0.3894, + "grad_norm": 0.7621377477820444, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 38940 + }, + { + "epoch": 0.38941, + "grad_norm": 0.6790827357894071, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 38941 + }, + { + "epoch": 0.38942, + "grad_norm": 0.5382310134274575, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 38942 + }, + { + "epoch": 0.38943, + "grad_norm": 0.5583205839425206, + "learning_rate": 0.003, + "loss": 3.9851, + "step": 38943 + }, + { + "epoch": 0.38944, + "grad_norm": 0.580692629471832, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 38944 + }, + { + "epoch": 0.38945, + "grad_norm": 0.5733318489656112, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 38945 + }, + { + "epoch": 0.38946, + "grad_norm": 0.5894294656917289, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 38946 + }, + { + "epoch": 0.38947, + "grad_norm": 0.6312906814797671, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 38947 + }, + { + "epoch": 0.38948, + "grad_norm": 0.7049903930285736, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 38948 + }, + { + "epoch": 0.38949, + "grad_norm": 0.7431493035843727, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 38949 + }, + { + "epoch": 0.3895, + "grad_norm": 0.7866662324480733, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 38950 + }, + { + "epoch": 0.38951, + "grad_norm": 0.9014127844945743, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 38951 + }, + { + "epoch": 0.38952, + "grad_norm": 0.9091008560639542, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 38952 + }, + { + "epoch": 0.38953, + "grad_norm": 0.9223619840039616, + "learning_rate": 0.003, + "loss": 3.9972, + "step": 38953 + }, + { + "epoch": 0.38954, + "grad_norm": 1.006563178998871, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 38954 + }, + { + "epoch": 0.38955, + "grad_norm": 1.0868419556022058, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 38955 + }, + { + "epoch": 0.38956, + "grad_norm": 0.8943488562574647, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 38956 + }, + { + "epoch": 0.38957, + "grad_norm": 0.6955443014698706, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 38957 + }, + { + "epoch": 0.38958, + "grad_norm": 0.7116984686914618, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 38958 + }, + { + "epoch": 0.38959, + "grad_norm": 0.7171066022732264, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 38959 + }, + { + "epoch": 0.3896, + "grad_norm": 0.6576161758171452, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 38960 + }, + { + "epoch": 0.38961, + "grad_norm": 0.7610570473283674, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 38961 + }, + { + "epoch": 0.38962, + "grad_norm": 0.9168683743420454, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 38962 + }, + { + "epoch": 0.38963, + "grad_norm": 0.9344990487173336, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 38963 + }, + { + "epoch": 0.38964, + "grad_norm": 0.8803188591142238, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 38964 + }, + { + "epoch": 0.38965, + "grad_norm": 0.8339102740827433, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 38965 + }, + { + "epoch": 0.38966, + "grad_norm": 0.8136553126348813, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 38966 + }, + { + "epoch": 0.38967, + "grad_norm": 0.8532324411042475, + "learning_rate": 0.003, + "loss": 4.037, + "step": 38967 + }, + { + "epoch": 0.38968, + "grad_norm": 0.9666876587222399, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 38968 + }, + { + "epoch": 0.38969, + "grad_norm": 1.1123325443586711, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 38969 + }, + { + "epoch": 0.3897, + "grad_norm": 1.088252325615684, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 38970 + }, + { + "epoch": 0.38971, + "grad_norm": 0.9640628590776612, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 38971 + }, + { + "epoch": 0.38972, + "grad_norm": 0.8642308316591061, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 38972 + }, + { + "epoch": 0.38973, + "grad_norm": 0.8355882329239119, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 38973 + }, + { + "epoch": 0.38974, + "grad_norm": 0.8349768605499327, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 38974 + }, + { + "epoch": 0.38975, + "grad_norm": 0.8359524407821309, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 38975 + }, + { + "epoch": 0.38976, + "grad_norm": 0.7379956707158815, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 38976 + }, + { + "epoch": 0.38977, + "grad_norm": 0.7252847145469319, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 38977 + }, + { + "epoch": 0.38978, + "grad_norm": 0.7906610660158844, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 38978 + }, + { + "epoch": 0.38979, + "grad_norm": 0.6877822267203617, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 38979 + }, + { + "epoch": 0.3898, + "grad_norm": 0.6263863183054964, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 38980 + }, + { + "epoch": 0.38981, + "grad_norm": 0.7448636873713926, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 38981 + }, + { + "epoch": 0.38982, + "grad_norm": 0.8148338921753411, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 38982 + }, + { + "epoch": 0.38983, + "grad_norm": 0.9325320562349716, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 38983 + }, + { + "epoch": 0.38984, + "grad_norm": 1.050256232983453, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 38984 + }, + { + "epoch": 0.38985, + "grad_norm": 1.0241462610129308, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 38985 + }, + { + "epoch": 0.38986, + "grad_norm": 1.0374398966908258, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 38986 + }, + { + "epoch": 0.38987, + "grad_norm": 0.9200840474352151, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 38987 + }, + { + "epoch": 0.38988, + "grad_norm": 0.8633091329418429, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 38988 + }, + { + "epoch": 0.38989, + "grad_norm": 0.8410832476061626, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 38989 + }, + { + "epoch": 0.3899, + "grad_norm": 0.8446467442385374, + "learning_rate": 0.003, + "loss": 3.9929, + "step": 38990 + }, + { + "epoch": 0.38991, + "grad_norm": 0.9147136725833791, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 38991 + }, + { + "epoch": 0.38992, + "grad_norm": 0.9121203015491869, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 38992 + }, + { + "epoch": 0.38993, + "grad_norm": 0.7552315227822552, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 38993 + }, + { + "epoch": 0.38994, + "grad_norm": 0.734094664199908, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 38994 + }, + { + "epoch": 0.38995, + "grad_norm": 0.6794853266731388, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 38995 + }, + { + "epoch": 0.38996, + "grad_norm": 0.8050589064337504, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 38996 + }, + { + "epoch": 0.38997, + "grad_norm": 0.9706926309101267, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 38997 + }, + { + "epoch": 0.38998, + "grad_norm": 1.181155929781573, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 38998 + }, + { + "epoch": 0.38999, + "grad_norm": 0.7904184459350899, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 38999 + }, + { + "epoch": 0.39, + "grad_norm": 0.5930299198736347, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 39000 + }, + { + "epoch": 0.39001, + "grad_norm": 0.6186870154988988, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 39001 + }, + { + "epoch": 0.39002, + "grad_norm": 0.7473532379716887, + "learning_rate": 0.003, + "loss": 3.9881, + "step": 39002 + }, + { + "epoch": 0.39003, + "grad_norm": 0.8818727228743183, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 39003 + }, + { + "epoch": 0.39004, + "grad_norm": 0.805603682692471, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 39004 + }, + { + "epoch": 0.39005, + "grad_norm": 0.6190587763397818, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 39005 + }, + { + "epoch": 0.39006, + "grad_norm": 0.5436488695345838, + "learning_rate": 0.003, + "loss": 4.025, + "step": 39006 + }, + { + "epoch": 0.39007, + "grad_norm": 0.5149292274198279, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 39007 + }, + { + "epoch": 0.39008, + "grad_norm": 0.5547521749224712, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 39008 + }, + { + "epoch": 0.39009, + "grad_norm": 0.6312343539087266, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 39009 + }, + { + "epoch": 0.3901, + "grad_norm": 0.7797757625505974, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 39010 + }, + { + "epoch": 0.39011, + "grad_norm": 0.9313813846826744, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 39011 + }, + { + "epoch": 0.39012, + "grad_norm": 1.0634102633726241, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 39012 + }, + { + "epoch": 0.39013, + "grad_norm": 0.781883728540922, + "learning_rate": 0.003, + "loss": 4.012, + "step": 39013 + }, + { + "epoch": 0.39014, + "grad_norm": 0.7379892977304804, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 39014 + }, + { + "epoch": 0.39015, + "grad_norm": 0.8201390572191543, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 39015 + }, + { + "epoch": 0.39016, + "grad_norm": 0.8098030872920576, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 39016 + }, + { + "epoch": 0.39017, + "grad_norm": 0.7598209188856165, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 39017 + }, + { + "epoch": 0.39018, + "grad_norm": 0.7412118914894952, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 39018 + }, + { + "epoch": 0.39019, + "grad_norm": 0.718323461178245, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 39019 + }, + { + "epoch": 0.3902, + "grad_norm": 0.6883329701648876, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 39020 + }, + { + "epoch": 0.39021, + "grad_norm": 0.7385570481268774, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 39021 + }, + { + "epoch": 0.39022, + "grad_norm": 0.7909187157723366, + "learning_rate": 0.003, + "loss": 4.017, + "step": 39022 + }, + { + "epoch": 0.39023, + "grad_norm": 0.807423563408142, + "learning_rate": 0.003, + "loss": 4.0, + "step": 39023 + }, + { + "epoch": 0.39024, + "grad_norm": 0.9209797286727841, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 39024 + }, + { + "epoch": 0.39025, + "grad_norm": 1.0546052173098763, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 39025 + }, + { + "epoch": 0.39026, + "grad_norm": 1.034520788275599, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 39026 + }, + { + "epoch": 0.39027, + "grad_norm": 0.865482330395809, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 39027 + }, + { + "epoch": 0.39028, + "grad_norm": 0.8532870966696903, + "learning_rate": 0.003, + "loss": 4.052, + "step": 39028 + }, + { + "epoch": 0.39029, + "grad_norm": 0.8273736011518318, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 39029 + }, + { + "epoch": 0.3903, + "grad_norm": 0.7592301759482895, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 39030 + }, + { + "epoch": 0.39031, + "grad_norm": 0.6723497074667578, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 39031 + }, + { + "epoch": 0.39032, + "grad_norm": 0.6155583017027578, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 39032 + }, + { + "epoch": 0.39033, + "grad_norm": 0.6052675709376193, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 39033 + }, + { + "epoch": 0.39034, + "grad_norm": 0.5948435171037059, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 39034 + }, + { + "epoch": 0.39035, + "grad_norm": 0.6439500685309086, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 39035 + }, + { + "epoch": 0.39036, + "grad_norm": 0.8151781135669158, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 39036 + }, + { + "epoch": 0.39037, + "grad_norm": 0.9688666741380939, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 39037 + }, + { + "epoch": 0.39038, + "grad_norm": 1.1935179003796808, + "learning_rate": 0.003, + "loss": 4.033, + "step": 39038 + }, + { + "epoch": 0.39039, + "grad_norm": 0.7292052745242955, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 39039 + }, + { + "epoch": 0.3904, + "grad_norm": 0.7531042125933944, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 39040 + }, + { + "epoch": 0.39041, + "grad_norm": 1.0141415219005792, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 39041 + }, + { + "epoch": 0.39042, + "grad_norm": 1.1386303700168978, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 39042 + }, + { + "epoch": 0.39043, + "grad_norm": 0.823033554370875, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 39043 + }, + { + "epoch": 0.39044, + "grad_norm": 0.8409542909149446, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 39044 + }, + { + "epoch": 0.39045, + "grad_norm": 0.8362177091273493, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 39045 + }, + { + "epoch": 0.39046, + "grad_norm": 0.9152444780548338, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 39046 + }, + { + "epoch": 0.39047, + "grad_norm": 1.092514952438449, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 39047 + }, + { + "epoch": 0.39048, + "grad_norm": 1.0590417238730638, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 39048 + }, + { + "epoch": 0.39049, + "grad_norm": 1.1103259219064612, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 39049 + }, + { + "epoch": 0.3905, + "grad_norm": 0.8626227138873207, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 39050 + }, + { + "epoch": 0.39051, + "grad_norm": 0.9611983737957676, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 39051 + }, + { + "epoch": 0.39052, + "grad_norm": 1.0628279161739942, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 39052 + }, + { + "epoch": 0.39053, + "grad_norm": 1.0260922451044823, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 39053 + }, + { + "epoch": 0.39054, + "grad_norm": 1.0394948541671032, + "learning_rate": 0.003, + "loss": 4.051, + "step": 39054 + }, + { + "epoch": 0.39055, + "grad_norm": 0.8646047880241559, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 39055 + }, + { + "epoch": 0.39056, + "grad_norm": 0.7441383548119915, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 39056 + }, + { + "epoch": 0.39057, + "grad_norm": 0.7293936417853237, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 39057 + }, + { + "epoch": 0.39058, + "grad_norm": 0.6945999922724599, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 39058 + }, + { + "epoch": 0.39059, + "grad_norm": 0.7429887946430981, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 39059 + }, + { + "epoch": 0.3906, + "grad_norm": 0.670487270930614, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 39060 + }, + { + "epoch": 0.39061, + "grad_norm": 0.5395580035551854, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 39061 + }, + { + "epoch": 0.39062, + "grad_norm": 0.5490094323476469, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 39062 + }, + { + "epoch": 0.39063, + "grad_norm": 0.5753079008362326, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 39063 + }, + { + "epoch": 0.39064, + "grad_norm": 0.715007954582314, + "learning_rate": 0.003, + "loss": 3.9967, + "step": 39064 + }, + { + "epoch": 0.39065, + "grad_norm": 0.7666306551651113, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 39065 + }, + { + "epoch": 0.39066, + "grad_norm": 0.7646146504676241, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 39066 + }, + { + "epoch": 0.39067, + "grad_norm": 0.8146670307774407, + "learning_rate": 0.003, + "loss": 3.9781, + "step": 39067 + }, + { + "epoch": 0.39068, + "grad_norm": 0.970755667401487, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 39068 + }, + { + "epoch": 0.39069, + "grad_norm": 1.2884001552992577, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 39069 + }, + { + "epoch": 0.3907, + "grad_norm": 0.7590498380683756, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 39070 + }, + { + "epoch": 0.39071, + "grad_norm": 0.6989054549598689, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 39071 + }, + { + "epoch": 0.39072, + "grad_norm": 0.8018970126419395, + "learning_rate": 0.003, + "loss": 3.9913, + "step": 39072 + }, + { + "epoch": 0.39073, + "grad_norm": 0.7572439944844384, + "learning_rate": 0.003, + "loss": 3.9885, + "step": 39073 + }, + { + "epoch": 0.39074, + "grad_norm": 0.7087942428071533, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 39074 + }, + { + "epoch": 0.39075, + "grad_norm": 0.7340673413110724, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 39075 + }, + { + "epoch": 0.39076, + "grad_norm": 0.7664551332444007, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 39076 + }, + { + "epoch": 0.39077, + "grad_norm": 0.6825573437304558, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 39077 + }, + { + "epoch": 0.39078, + "grad_norm": 0.6765893599924173, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 39078 + }, + { + "epoch": 0.39079, + "grad_norm": 0.6597317465230786, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 39079 + }, + { + "epoch": 0.3908, + "grad_norm": 0.6230104103087698, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 39080 + }, + { + "epoch": 0.39081, + "grad_norm": 0.6680611657305693, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 39081 + }, + { + "epoch": 0.39082, + "grad_norm": 0.7810623199002388, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 39082 + }, + { + "epoch": 0.39083, + "grad_norm": 0.7661572672282719, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 39083 + }, + { + "epoch": 0.39084, + "grad_norm": 0.8457696068966085, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 39084 + }, + { + "epoch": 0.39085, + "grad_norm": 1.1167313974837558, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 39085 + }, + { + "epoch": 0.39086, + "grad_norm": 0.8951446734527246, + "learning_rate": 0.003, + "loss": 4.055, + "step": 39086 + }, + { + "epoch": 0.39087, + "grad_norm": 0.810913042046443, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 39087 + }, + { + "epoch": 0.39088, + "grad_norm": 1.0102564474742426, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 39088 + }, + { + "epoch": 0.39089, + "grad_norm": 1.042320406970806, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 39089 + }, + { + "epoch": 0.3909, + "grad_norm": 1.119946141671696, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 39090 + }, + { + "epoch": 0.39091, + "grad_norm": 0.8648334896000849, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 39091 + }, + { + "epoch": 0.39092, + "grad_norm": 0.7459348819096437, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 39092 + }, + { + "epoch": 0.39093, + "grad_norm": 0.6485998644103419, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 39093 + }, + { + "epoch": 0.39094, + "grad_norm": 0.7101076934208466, + "learning_rate": 0.003, + "loss": 4.0021, + "step": 39094 + }, + { + "epoch": 0.39095, + "grad_norm": 0.7722428852785114, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 39095 + }, + { + "epoch": 0.39096, + "grad_norm": 0.678579739145256, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 39096 + }, + { + "epoch": 0.39097, + "grad_norm": 0.7935463738849089, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 39097 + }, + { + "epoch": 0.39098, + "grad_norm": 0.9067087956951159, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 39098 + }, + { + "epoch": 0.39099, + "grad_norm": 1.065898099607121, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 39099 + }, + { + "epoch": 0.391, + "grad_norm": 0.8672468148212513, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 39100 + }, + { + "epoch": 0.39101, + "grad_norm": 0.7379129932952632, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 39101 + }, + { + "epoch": 0.39102, + "grad_norm": 0.6591863617054038, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 39102 + }, + { + "epoch": 0.39103, + "grad_norm": 0.62426663867065, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 39103 + }, + { + "epoch": 0.39104, + "grad_norm": 0.6230506821806564, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 39104 + }, + { + "epoch": 0.39105, + "grad_norm": 0.7128543199225079, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 39105 + }, + { + "epoch": 0.39106, + "grad_norm": 0.8076648397340168, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 39106 + }, + { + "epoch": 0.39107, + "grad_norm": 0.8502337706539554, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 39107 + }, + { + "epoch": 0.39108, + "grad_norm": 0.8364200815755978, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 39108 + }, + { + "epoch": 0.39109, + "grad_norm": 0.8409030125382458, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 39109 + }, + { + "epoch": 0.3911, + "grad_norm": 0.8111703202968038, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 39110 + }, + { + "epoch": 0.39111, + "grad_norm": 1.058239558649576, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 39111 + }, + { + "epoch": 0.39112, + "grad_norm": 1.134845888210395, + "learning_rate": 0.003, + "loss": 4.028, + "step": 39112 + }, + { + "epoch": 0.39113, + "grad_norm": 1.1421267254010425, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 39113 + }, + { + "epoch": 0.39114, + "grad_norm": 1.0619022302925285, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 39114 + }, + { + "epoch": 0.39115, + "grad_norm": 0.9100691483077936, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 39115 + }, + { + "epoch": 0.39116, + "grad_norm": 0.8916725090090295, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 39116 + }, + { + "epoch": 0.39117, + "grad_norm": 0.7742151324389536, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 39117 + }, + { + "epoch": 0.39118, + "grad_norm": 0.7312962315672108, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 39118 + }, + { + "epoch": 0.39119, + "grad_norm": 0.6550006263443916, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 39119 + }, + { + "epoch": 0.3912, + "grad_norm": 0.5405007084207282, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 39120 + }, + { + "epoch": 0.39121, + "grad_norm": 0.542016444460779, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 39121 + }, + { + "epoch": 0.39122, + "grad_norm": 0.6147718477231362, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 39122 + }, + { + "epoch": 0.39123, + "grad_norm": 0.7322773675441783, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 39123 + }, + { + "epoch": 0.39124, + "grad_norm": 0.9462150727499833, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 39124 + }, + { + "epoch": 0.39125, + "grad_norm": 1.2542806562753868, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 39125 + }, + { + "epoch": 0.39126, + "grad_norm": 0.9394940708322731, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 39126 + }, + { + "epoch": 0.39127, + "grad_norm": 1.0994460767400747, + "learning_rate": 0.003, + "loss": 4.035, + "step": 39127 + }, + { + "epoch": 0.39128, + "grad_norm": 0.9617860888676376, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 39128 + }, + { + "epoch": 0.39129, + "grad_norm": 0.9047188816109052, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 39129 + }, + { + "epoch": 0.3913, + "grad_norm": 0.8827174911386031, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 39130 + }, + { + "epoch": 0.39131, + "grad_norm": 0.8119238691381943, + "learning_rate": 0.003, + "loss": 4.0688, + "step": 39131 + }, + { + "epoch": 0.39132, + "grad_norm": 0.7564593583873209, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 39132 + }, + { + "epoch": 0.39133, + "grad_norm": 0.8360525915268799, + "learning_rate": 0.003, + "loss": 3.9915, + "step": 39133 + }, + { + "epoch": 0.39134, + "grad_norm": 0.907630519728082, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 39134 + }, + { + "epoch": 0.39135, + "grad_norm": 0.9874644276696573, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 39135 + }, + { + "epoch": 0.39136, + "grad_norm": 1.0175786629972918, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 39136 + }, + { + "epoch": 0.39137, + "grad_norm": 1.09515926991001, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 39137 + }, + { + "epoch": 0.39138, + "grad_norm": 1.1035986536489992, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 39138 + }, + { + "epoch": 0.39139, + "grad_norm": 0.9717009150948287, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 39139 + }, + { + "epoch": 0.3914, + "grad_norm": 0.991816437854977, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 39140 + }, + { + "epoch": 0.39141, + "grad_norm": 1.0507327152302461, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 39141 + }, + { + "epoch": 0.39142, + "grad_norm": 0.8752065263736463, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 39142 + }, + { + "epoch": 0.39143, + "grad_norm": 0.7422867330543961, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 39143 + }, + { + "epoch": 0.39144, + "grad_norm": 0.6886539915218373, + "learning_rate": 0.003, + "loss": 4.064, + "step": 39144 + }, + { + "epoch": 0.39145, + "grad_norm": 0.66960040505332, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 39145 + }, + { + "epoch": 0.39146, + "grad_norm": 0.7071860537749026, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 39146 + }, + { + "epoch": 0.39147, + "grad_norm": 0.7569258681196903, + "learning_rate": 0.003, + "loss": 4.047, + "step": 39147 + }, + { + "epoch": 0.39148, + "grad_norm": 0.6907810126536352, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 39148 + }, + { + "epoch": 0.39149, + "grad_norm": 0.6552271194516468, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 39149 + }, + { + "epoch": 0.3915, + "grad_norm": 0.7816907756452668, + "learning_rate": 0.003, + "loss": 4.054, + "step": 39150 + }, + { + "epoch": 0.39151, + "grad_norm": 0.8386439894285723, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 39151 + }, + { + "epoch": 0.39152, + "grad_norm": 0.8171315624755412, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 39152 + }, + { + "epoch": 0.39153, + "grad_norm": 0.853747185541811, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 39153 + }, + { + "epoch": 0.39154, + "grad_norm": 0.9605083144814628, + "learning_rate": 0.003, + "loss": 4.07, + "step": 39154 + }, + { + "epoch": 0.39155, + "grad_norm": 1.2381372791512897, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 39155 + }, + { + "epoch": 0.39156, + "grad_norm": 0.9478790356670367, + "learning_rate": 0.003, + "loss": 4.011, + "step": 39156 + }, + { + "epoch": 0.39157, + "grad_norm": 0.8590800922167017, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 39157 + }, + { + "epoch": 0.39158, + "grad_norm": 0.749655052662787, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 39158 + }, + { + "epoch": 0.39159, + "grad_norm": 0.6901234196831307, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 39159 + }, + { + "epoch": 0.3916, + "grad_norm": 0.7439698335622922, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 39160 + }, + { + "epoch": 0.39161, + "grad_norm": 0.7070418220766892, + "learning_rate": 0.003, + "loss": 4.033, + "step": 39161 + }, + { + "epoch": 0.39162, + "grad_norm": 0.6190260471622396, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 39162 + }, + { + "epoch": 0.39163, + "grad_norm": 0.6394745670572777, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 39163 + }, + { + "epoch": 0.39164, + "grad_norm": 0.6472982730350298, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 39164 + }, + { + "epoch": 0.39165, + "grad_norm": 0.6097871082081026, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 39165 + }, + { + "epoch": 0.39166, + "grad_norm": 0.6374839114281045, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 39166 + }, + { + "epoch": 0.39167, + "grad_norm": 0.5778677589894321, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 39167 + }, + { + "epoch": 0.39168, + "grad_norm": 0.5798901994986596, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 39168 + }, + { + "epoch": 0.39169, + "grad_norm": 0.5687253130619117, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 39169 + }, + { + "epoch": 0.3917, + "grad_norm": 0.5902816500879364, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 39170 + }, + { + "epoch": 0.39171, + "grad_norm": 0.6410600931299834, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 39171 + }, + { + "epoch": 0.39172, + "grad_norm": 0.7516672631800212, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 39172 + }, + { + "epoch": 0.39173, + "grad_norm": 0.9917869704591344, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 39173 + }, + { + "epoch": 0.39174, + "grad_norm": 1.3194410921130513, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 39174 + }, + { + "epoch": 0.39175, + "grad_norm": 0.644083819857115, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 39175 + }, + { + "epoch": 0.39176, + "grad_norm": 0.7447531302470304, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 39176 + }, + { + "epoch": 0.39177, + "grad_norm": 0.8940569501511737, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 39177 + }, + { + "epoch": 0.39178, + "grad_norm": 1.088532865633208, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 39178 + }, + { + "epoch": 0.39179, + "grad_norm": 1.002795948255031, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 39179 + }, + { + "epoch": 0.3918, + "grad_norm": 0.9128416913521716, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 39180 + }, + { + "epoch": 0.39181, + "grad_norm": 0.8953820444645237, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 39181 + }, + { + "epoch": 0.39182, + "grad_norm": 0.8677020056449347, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 39182 + }, + { + "epoch": 0.39183, + "grad_norm": 1.0471275180337432, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 39183 + }, + { + "epoch": 0.39184, + "grad_norm": 0.7304498802266806, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 39184 + }, + { + "epoch": 0.39185, + "grad_norm": 0.808945930939048, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 39185 + }, + { + "epoch": 0.39186, + "grad_norm": 0.8856314257362674, + "learning_rate": 0.003, + "loss": 4.047, + "step": 39186 + }, + { + "epoch": 0.39187, + "grad_norm": 0.9953575145879139, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 39187 + }, + { + "epoch": 0.39188, + "grad_norm": 0.9216021510469722, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 39188 + }, + { + "epoch": 0.39189, + "grad_norm": 0.9531773750945535, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 39189 + }, + { + "epoch": 0.3919, + "grad_norm": 1.0777790737606725, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 39190 + }, + { + "epoch": 0.39191, + "grad_norm": 0.9407047651114191, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 39191 + }, + { + "epoch": 0.39192, + "grad_norm": 1.0111179148576765, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 39192 + }, + { + "epoch": 0.39193, + "grad_norm": 0.9500408660436961, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 39193 + }, + { + "epoch": 0.39194, + "grad_norm": 0.9143682913804794, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 39194 + }, + { + "epoch": 0.39195, + "grad_norm": 0.97782849421417, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 39195 + }, + { + "epoch": 0.39196, + "grad_norm": 0.9811707207643349, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 39196 + }, + { + "epoch": 0.39197, + "grad_norm": 0.8868488282352592, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 39197 + }, + { + "epoch": 0.39198, + "grad_norm": 0.8398086680347624, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 39198 + }, + { + "epoch": 0.39199, + "grad_norm": 0.8606191031619372, + "learning_rate": 0.003, + "loss": 4.078, + "step": 39199 + }, + { + "epoch": 0.392, + "grad_norm": 0.8699709368488356, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 39200 + }, + { + "epoch": 0.39201, + "grad_norm": 0.9672969296252945, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 39201 + }, + { + "epoch": 0.39202, + "grad_norm": 1.1293047652472585, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 39202 + }, + { + "epoch": 0.39203, + "grad_norm": 0.9314906976611025, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 39203 + }, + { + "epoch": 0.39204, + "grad_norm": 0.903918576806293, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 39204 + }, + { + "epoch": 0.39205, + "grad_norm": 0.8422354174422024, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 39205 + }, + { + "epoch": 0.39206, + "grad_norm": 0.7860563402573563, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 39206 + }, + { + "epoch": 0.39207, + "grad_norm": 0.8132812271827865, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 39207 + }, + { + "epoch": 0.39208, + "grad_norm": 0.8925741061122466, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 39208 + }, + { + "epoch": 0.39209, + "grad_norm": 0.8176020255838593, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 39209 + }, + { + "epoch": 0.3921, + "grad_norm": 0.7741772855711979, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 39210 + }, + { + "epoch": 0.39211, + "grad_norm": 0.7713102613131877, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 39211 + }, + { + "epoch": 0.39212, + "grad_norm": 0.6968579711058123, + "learning_rate": 0.003, + "loss": 3.9901, + "step": 39212 + }, + { + "epoch": 0.39213, + "grad_norm": 0.6192057717017315, + "learning_rate": 0.003, + "loss": 4.053, + "step": 39213 + }, + { + "epoch": 0.39214, + "grad_norm": 0.6553635129757325, + "learning_rate": 0.003, + "loss": 4.015, + "step": 39214 + }, + { + "epoch": 0.39215, + "grad_norm": 0.7351935610680815, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 39215 + }, + { + "epoch": 0.39216, + "grad_norm": 0.8823796069277474, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 39216 + }, + { + "epoch": 0.39217, + "grad_norm": 1.1723983895259513, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 39217 + }, + { + "epoch": 0.39218, + "grad_norm": 0.950409561714263, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 39218 + }, + { + "epoch": 0.39219, + "grad_norm": 0.7512428723261747, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 39219 + }, + { + "epoch": 0.3922, + "grad_norm": 0.6863898737111728, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 39220 + }, + { + "epoch": 0.39221, + "grad_norm": 0.7864439953135651, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 39221 + }, + { + "epoch": 0.39222, + "grad_norm": 0.7827897337545702, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 39222 + }, + { + "epoch": 0.39223, + "grad_norm": 0.690365005712535, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 39223 + }, + { + "epoch": 0.39224, + "grad_norm": 0.6980146240415882, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 39224 + }, + { + "epoch": 0.39225, + "grad_norm": 0.6042417874449307, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 39225 + }, + { + "epoch": 0.39226, + "grad_norm": 0.649751259753974, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 39226 + }, + { + "epoch": 0.39227, + "grad_norm": 0.8531613009713758, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 39227 + }, + { + "epoch": 0.39228, + "grad_norm": 1.0928428177189653, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 39228 + }, + { + "epoch": 0.39229, + "grad_norm": 0.9821857546893401, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 39229 + }, + { + "epoch": 0.3923, + "grad_norm": 0.9948934931024971, + "learning_rate": 0.003, + "loss": 3.9888, + "step": 39230 + }, + { + "epoch": 0.39231, + "grad_norm": 0.8006880613214733, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 39231 + }, + { + "epoch": 0.39232, + "grad_norm": 0.6897992054615917, + "learning_rate": 0.003, + "loss": 4.029, + "step": 39232 + }, + { + "epoch": 0.39233, + "grad_norm": 0.6261283284903247, + "learning_rate": 0.003, + "loss": 3.9997, + "step": 39233 + }, + { + "epoch": 0.39234, + "grad_norm": 0.6849039242896179, + "learning_rate": 0.003, + "loss": 3.9807, + "step": 39234 + }, + { + "epoch": 0.39235, + "grad_norm": 0.6595396059691478, + "learning_rate": 0.003, + "loss": 4.026, + "step": 39235 + }, + { + "epoch": 0.39236, + "grad_norm": 0.6047383296491797, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 39236 + }, + { + "epoch": 0.39237, + "grad_norm": 0.6553993178022095, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 39237 + }, + { + "epoch": 0.39238, + "grad_norm": 0.6441906859536398, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 39238 + }, + { + "epoch": 0.39239, + "grad_norm": 0.5986791164187617, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 39239 + }, + { + "epoch": 0.3924, + "grad_norm": 0.6440119728242677, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 39240 + }, + { + "epoch": 0.39241, + "grad_norm": 0.8389791156507256, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 39241 + }, + { + "epoch": 0.39242, + "grad_norm": 1.1799076450984576, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 39242 + }, + { + "epoch": 0.39243, + "grad_norm": 1.0825347323151535, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 39243 + }, + { + "epoch": 0.39244, + "grad_norm": 0.9421154356810132, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 39244 + }, + { + "epoch": 0.39245, + "grad_norm": 0.9649149239441657, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 39245 + }, + { + "epoch": 0.39246, + "grad_norm": 0.9245207823778563, + "learning_rate": 0.003, + "loss": 4.004, + "step": 39246 + }, + { + "epoch": 0.39247, + "grad_norm": 0.8209301872756202, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 39247 + }, + { + "epoch": 0.39248, + "grad_norm": 0.8210388583168771, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 39248 + }, + { + "epoch": 0.39249, + "grad_norm": 0.7826180666897121, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 39249 + }, + { + "epoch": 0.3925, + "grad_norm": 0.6905460022350609, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 39250 + }, + { + "epoch": 0.39251, + "grad_norm": 0.6656693695041219, + "learning_rate": 0.003, + "loss": 4.019, + "step": 39251 + }, + { + "epoch": 0.39252, + "grad_norm": 0.8325390186469159, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 39252 + }, + { + "epoch": 0.39253, + "grad_norm": 0.9492766380976652, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 39253 + }, + { + "epoch": 0.39254, + "grad_norm": 0.9605533371249015, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 39254 + }, + { + "epoch": 0.39255, + "grad_norm": 1.3086997210385067, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 39255 + }, + { + "epoch": 0.39256, + "grad_norm": 0.8320447578834089, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 39256 + }, + { + "epoch": 0.39257, + "grad_norm": 0.6631181263761855, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 39257 + }, + { + "epoch": 0.39258, + "grad_norm": 0.5736438596538885, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 39258 + }, + { + "epoch": 0.39259, + "grad_norm": 0.5643464679279309, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 39259 + }, + { + "epoch": 0.3926, + "grad_norm": 0.5809158122417329, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 39260 + }, + { + "epoch": 0.39261, + "grad_norm": 0.6156774955466525, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 39261 + }, + { + "epoch": 0.39262, + "grad_norm": 0.668153026685711, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 39262 + }, + { + "epoch": 0.39263, + "grad_norm": 0.6819708629679846, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 39263 + }, + { + "epoch": 0.39264, + "grad_norm": 0.7890984716576543, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 39264 + }, + { + "epoch": 0.39265, + "grad_norm": 0.9257010508334221, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 39265 + }, + { + "epoch": 0.39266, + "grad_norm": 1.0755305837438882, + "learning_rate": 0.003, + "loss": 4.066, + "step": 39266 + }, + { + "epoch": 0.39267, + "grad_norm": 0.8719852001625102, + "learning_rate": 0.003, + "loss": 4.067, + "step": 39267 + }, + { + "epoch": 0.39268, + "grad_norm": 0.7950974388895254, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 39268 + }, + { + "epoch": 0.39269, + "grad_norm": 0.8709461191242271, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 39269 + }, + { + "epoch": 0.3927, + "grad_norm": 1.0366531741888154, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 39270 + }, + { + "epoch": 0.39271, + "grad_norm": 1.0276860743772074, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 39271 + }, + { + "epoch": 0.39272, + "grad_norm": 1.0958920376843175, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 39272 + }, + { + "epoch": 0.39273, + "grad_norm": 1.170524723667493, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 39273 + }, + { + "epoch": 0.39274, + "grad_norm": 0.8816511680264806, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 39274 + }, + { + "epoch": 0.39275, + "grad_norm": 0.8760904573043058, + "learning_rate": 0.003, + "loss": 4.037, + "step": 39275 + }, + { + "epoch": 0.39276, + "grad_norm": 0.8552417816975166, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 39276 + }, + { + "epoch": 0.39277, + "grad_norm": 0.8303743868837168, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 39277 + }, + { + "epoch": 0.39278, + "grad_norm": 0.9693179437601708, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 39278 + }, + { + "epoch": 0.39279, + "grad_norm": 1.0889612020917343, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 39279 + }, + { + "epoch": 0.3928, + "grad_norm": 1.0301227188296969, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 39280 + }, + { + "epoch": 0.39281, + "grad_norm": 0.9845212805307286, + "learning_rate": 0.003, + "loss": 4.052, + "step": 39281 + }, + { + "epoch": 0.39282, + "grad_norm": 0.9030340285891382, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 39282 + }, + { + "epoch": 0.39283, + "grad_norm": 0.8540092325350647, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 39283 + }, + { + "epoch": 0.39284, + "grad_norm": 0.8760339890962637, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 39284 + }, + { + "epoch": 0.39285, + "grad_norm": 0.8662304599789061, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 39285 + }, + { + "epoch": 0.39286, + "grad_norm": 0.7720622716273693, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 39286 + }, + { + "epoch": 0.39287, + "grad_norm": 0.7137305941966132, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 39287 + }, + { + "epoch": 0.39288, + "grad_norm": 0.7351038960377343, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 39288 + }, + { + "epoch": 0.39289, + "grad_norm": 0.809935734436347, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 39289 + }, + { + "epoch": 0.3929, + "grad_norm": 0.7255659699299394, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 39290 + }, + { + "epoch": 0.39291, + "grad_norm": 0.6897473992782334, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 39291 + }, + { + "epoch": 0.39292, + "grad_norm": 0.7331768737765448, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 39292 + }, + { + "epoch": 0.39293, + "grad_norm": 0.7071662559824192, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 39293 + }, + { + "epoch": 0.39294, + "grad_norm": 0.7384469184156739, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 39294 + }, + { + "epoch": 0.39295, + "grad_norm": 0.8145392926083829, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 39295 + }, + { + "epoch": 0.39296, + "grad_norm": 0.8682553222642254, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 39296 + }, + { + "epoch": 0.39297, + "grad_norm": 0.878482837143102, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 39297 + }, + { + "epoch": 0.39298, + "grad_norm": 0.8094820996536946, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 39298 + }, + { + "epoch": 0.39299, + "grad_norm": 0.7783234464288294, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 39299 + }, + { + "epoch": 0.393, + "grad_norm": 0.9084665487451583, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 39300 + }, + { + "epoch": 0.39301, + "grad_norm": 1.0248024007419332, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 39301 + }, + { + "epoch": 0.39302, + "grad_norm": 0.9877401092919567, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 39302 + }, + { + "epoch": 0.39303, + "grad_norm": 0.9393565740283653, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 39303 + }, + { + "epoch": 0.39304, + "grad_norm": 0.8857623080709521, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 39304 + }, + { + "epoch": 0.39305, + "grad_norm": 0.8541992290961894, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 39305 + }, + { + "epoch": 0.39306, + "grad_norm": 0.6644852930480416, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 39306 + }, + { + "epoch": 0.39307, + "grad_norm": 0.681863841032857, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 39307 + }, + { + "epoch": 0.39308, + "grad_norm": 0.6246724889737784, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 39308 + }, + { + "epoch": 0.39309, + "grad_norm": 0.6831906781406697, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 39309 + }, + { + "epoch": 0.3931, + "grad_norm": 0.7860477947054407, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 39310 + }, + { + "epoch": 0.39311, + "grad_norm": 0.8966953868752349, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 39311 + }, + { + "epoch": 0.39312, + "grad_norm": 0.8935627260373181, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 39312 + }, + { + "epoch": 0.39313, + "grad_norm": 0.8923670710000523, + "learning_rate": 0.003, + "loss": 4.044, + "step": 39313 + }, + { + "epoch": 0.39314, + "grad_norm": 0.9008286091383213, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 39314 + }, + { + "epoch": 0.39315, + "grad_norm": 0.8910763580928758, + "learning_rate": 0.003, + "loss": 4.036, + "step": 39315 + }, + { + "epoch": 0.39316, + "grad_norm": 0.8265104727233311, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 39316 + }, + { + "epoch": 0.39317, + "grad_norm": 0.7318070471729506, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 39317 + }, + { + "epoch": 0.39318, + "grad_norm": 0.8591312593138556, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 39318 + }, + { + "epoch": 0.39319, + "grad_norm": 1.1587053159485254, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 39319 + }, + { + "epoch": 0.3932, + "grad_norm": 0.9206810320474739, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 39320 + }, + { + "epoch": 0.39321, + "grad_norm": 0.9163699581605151, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 39321 + }, + { + "epoch": 0.39322, + "grad_norm": 0.9565794996267237, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 39322 + }, + { + "epoch": 0.39323, + "grad_norm": 0.8536925161416944, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 39323 + }, + { + "epoch": 0.39324, + "grad_norm": 0.8308648005841419, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 39324 + }, + { + "epoch": 0.39325, + "grad_norm": 0.6916226770250781, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 39325 + }, + { + "epoch": 0.39326, + "grad_norm": 0.6715291207664057, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 39326 + }, + { + "epoch": 0.39327, + "grad_norm": 0.6333348965138055, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 39327 + }, + { + "epoch": 0.39328, + "grad_norm": 0.6460291751810526, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 39328 + }, + { + "epoch": 0.39329, + "grad_norm": 0.7928199907275805, + "learning_rate": 0.003, + "loss": 4.061, + "step": 39329 + }, + { + "epoch": 0.3933, + "grad_norm": 0.945742414624607, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 39330 + }, + { + "epoch": 0.39331, + "grad_norm": 1.0003611337515999, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 39331 + }, + { + "epoch": 0.39332, + "grad_norm": 0.9538576129453069, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 39332 + }, + { + "epoch": 0.39333, + "grad_norm": 0.9677872966642639, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 39333 + }, + { + "epoch": 0.39334, + "grad_norm": 0.970568785603239, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 39334 + }, + { + "epoch": 0.39335, + "grad_norm": 0.8943144487859285, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 39335 + }, + { + "epoch": 0.39336, + "grad_norm": 0.8757786325888861, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 39336 + }, + { + "epoch": 0.39337, + "grad_norm": 0.8333598115871387, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 39337 + }, + { + "epoch": 0.39338, + "grad_norm": 0.7333171538506128, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 39338 + }, + { + "epoch": 0.39339, + "grad_norm": 0.7189329898732368, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 39339 + }, + { + "epoch": 0.3934, + "grad_norm": 0.6729006557029984, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 39340 + }, + { + "epoch": 0.39341, + "grad_norm": 0.740914978705691, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 39341 + }, + { + "epoch": 0.39342, + "grad_norm": 0.911686289737936, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 39342 + }, + { + "epoch": 0.39343, + "grad_norm": 1.0556295367331145, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 39343 + }, + { + "epoch": 0.39344, + "grad_norm": 0.7780437477090704, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 39344 + }, + { + "epoch": 0.39345, + "grad_norm": 0.7821298813208998, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 39345 + }, + { + "epoch": 0.39346, + "grad_norm": 0.9673114736344106, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 39346 + }, + { + "epoch": 0.39347, + "grad_norm": 1.133844479152847, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 39347 + }, + { + "epoch": 0.39348, + "grad_norm": 0.7714166227233458, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 39348 + }, + { + "epoch": 0.39349, + "grad_norm": 0.7313304281103975, + "learning_rate": 0.003, + "loss": 4.015, + "step": 39349 + }, + { + "epoch": 0.3935, + "grad_norm": 0.798402839072497, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 39350 + }, + { + "epoch": 0.39351, + "grad_norm": 0.7865530044680373, + "learning_rate": 0.003, + "loss": 4.036, + "step": 39351 + }, + { + "epoch": 0.39352, + "grad_norm": 0.717804972605612, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 39352 + }, + { + "epoch": 0.39353, + "grad_norm": 0.7321220900284712, + "learning_rate": 0.003, + "loss": 3.9932, + "step": 39353 + }, + { + "epoch": 0.39354, + "grad_norm": 0.7579842256227945, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 39354 + }, + { + "epoch": 0.39355, + "grad_norm": 0.7527028946527576, + "learning_rate": 0.003, + "loss": 4.0044, + "step": 39355 + }, + { + "epoch": 0.39356, + "grad_norm": 0.7824211053302695, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 39356 + }, + { + "epoch": 0.39357, + "grad_norm": 0.8530930430540148, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 39357 + }, + { + "epoch": 0.39358, + "grad_norm": 0.8605179076079595, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 39358 + }, + { + "epoch": 0.39359, + "grad_norm": 0.8903815462188469, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 39359 + }, + { + "epoch": 0.3936, + "grad_norm": 0.9498909193521354, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 39360 + }, + { + "epoch": 0.39361, + "grad_norm": 0.8435629386016806, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 39361 + }, + { + "epoch": 0.39362, + "grad_norm": 0.8105677456018803, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 39362 + }, + { + "epoch": 0.39363, + "grad_norm": 0.7550140593491753, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 39363 + }, + { + "epoch": 0.39364, + "grad_norm": 0.7743958750034681, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 39364 + }, + { + "epoch": 0.39365, + "grad_norm": 0.824352795874293, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 39365 + }, + { + "epoch": 0.39366, + "grad_norm": 0.8559434074209179, + "learning_rate": 0.003, + "loss": 4.046, + "step": 39366 + }, + { + "epoch": 0.39367, + "grad_norm": 0.8612770806902038, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 39367 + }, + { + "epoch": 0.39368, + "grad_norm": 0.9530054392228408, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 39368 + }, + { + "epoch": 0.39369, + "grad_norm": 1.0751153266417104, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 39369 + }, + { + "epoch": 0.3937, + "grad_norm": 1.0785235145841485, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 39370 + }, + { + "epoch": 0.39371, + "grad_norm": 0.9726653631724052, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 39371 + }, + { + "epoch": 0.39372, + "grad_norm": 0.9309857058883836, + "learning_rate": 0.003, + "loss": 4.026, + "step": 39372 + }, + { + "epoch": 0.39373, + "grad_norm": 0.8808502490041031, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 39373 + }, + { + "epoch": 0.39374, + "grad_norm": 0.8681161302631496, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 39374 + }, + { + "epoch": 0.39375, + "grad_norm": 0.8291569845758874, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 39375 + }, + { + "epoch": 0.39376, + "grad_norm": 0.8449883368809782, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 39376 + }, + { + "epoch": 0.39377, + "grad_norm": 0.7287910600400807, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 39377 + }, + { + "epoch": 0.39378, + "grad_norm": 0.6732661749069272, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 39378 + }, + { + "epoch": 0.39379, + "grad_norm": 0.5995011281943955, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 39379 + }, + { + "epoch": 0.3938, + "grad_norm": 0.5682648057679893, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 39380 + }, + { + "epoch": 0.39381, + "grad_norm": 0.5714411067380628, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 39381 + }, + { + "epoch": 0.39382, + "grad_norm": 0.608376655224097, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 39382 + }, + { + "epoch": 0.39383, + "grad_norm": 0.5971768884467347, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 39383 + }, + { + "epoch": 0.39384, + "grad_norm": 0.6590563173503614, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 39384 + }, + { + "epoch": 0.39385, + "grad_norm": 0.8104543965963003, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 39385 + }, + { + "epoch": 0.39386, + "grad_norm": 1.0194136314241367, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 39386 + }, + { + "epoch": 0.39387, + "grad_norm": 1.1839910724660057, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 39387 + }, + { + "epoch": 0.39388, + "grad_norm": 0.816729660594923, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 39388 + }, + { + "epoch": 0.39389, + "grad_norm": 0.7691269539439922, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 39389 + }, + { + "epoch": 0.3939, + "grad_norm": 0.7712922010369773, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 39390 + }, + { + "epoch": 0.39391, + "grad_norm": 0.7758174201629645, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 39391 + }, + { + "epoch": 0.39392, + "grad_norm": 0.6804334012222153, + "learning_rate": 0.003, + "loss": 3.998, + "step": 39392 + }, + { + "epoch": 0.39393, + "grad_norm": 0.7589390839194043, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 39393 + }, + { + "epoch": 0.39394, + "grad_norm": 0.8374012789309756, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 39394 + }, + { + "epoch": 0.39395, + "grad_norm": 0.9430963243502279, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 39395 + }, + { + "epoch": 0.39396, + "grad_norm": 1.1104839399175914, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 39396 + }, + { + "epoch": 0.39397, + "grad_norm": 0.9264122652692927, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 39397 + }, + { + "epoch": 0.39398, + "grad_norm": 1.1074003158594874, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 39398 + }, + { + "epoch": 0.39399, + "grad_norm": 1.0039023249208912, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 39399 + }, + { + "epoch": 0.394, + "grad_norm": 0.8687764974370222, + "learning_rate": 0.003, + "loss": 4.044, + "step": 39400 + }, + { + "epoch": 0.39401, + "grad_norm": 0.86382876191506, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 39401 + }, + { + "epoch": 0.39402, + "grad_norm": 0.8903330889399652, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 39402 + }, + { + "epoch": 0.39403, + "grad_norm": 0.9459608595027006, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 39403 + }, + { + "epoch": 0.39404, + "grad_norm": 0.9807477501453775, + "learning_rate": 0.003, + "loss": 4.05, + "step": 39404 + }, + { + "epoch": 0.39405, + "grad_norm": 0.9081516561510692, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 39405 + }, + { + "epoch": 0.39406, + "grad_norm": 0.7434106104658543, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 39406 + }, + { + "epoch": 0.39407, + "grad_norm": 0.7864780517809654, + "learning_rate": 0.003, + "loss": 4.049, + "step": 39407 + }, + { + "epoch": 0.39408, + "grad_norm": 0.8411320462435301, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 39408 + }, + { + "epoch": 0.39409, + "grad_norm": 0.8696580802973238, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 39409 + }, + { + "epoch": 0.3941, + "grad_norm": 0.9666480221603674, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 39410 + }, + { + "epoch": 0.39411, + "grad_norm": 1.158092838565567, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 39411 + }, + { + "epoch": 0.39412, + "grad_norm": 0.8162739042110112, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 39412 + }, + { + "epoch": 0.39413, + "grad_norm": 0.7889844187181035, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 39413 + }, + { + "epoch": 0.39414, + "grad_norm": 0.7212218073903436, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 39414 + }, + { + "epoch": 0.39415, + "grad_norm": 0.6589304539311145, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 39415 + }, + { + "epoch": 0.39416, + "grad_norm": 0.7210279102119095, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 39416 + }, + { + "epoch": 0.39417, + "grad_norm": 0.7728798135377588, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 39417 + }, + { + "epoch": 0.39418, + "grad_norm": 0.7439486809263833, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 39418 + }, + { + "epoch": 0.39419, + "grad_norm": 0.7858049134668706, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 39419 + }, + { + "epoch": 0.3942, + "grad_norm": 0.7948550959097102, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 39420 + }, + { + "epoch": 0.39421, + "grad_norm": 0.8696114063849216, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 39421 + }, + { + "epoch": 0.39422, + "grad_norm": 0.985685439539121, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 39422 + }, + { + "epoch": 0.39423, + "grad_norm": 1.2051649708312722, + "learning_rate": 0.003, + "loss": 4.0732, + "step": 39423 + }, + { + "epoch": 0.39424, + "grad_norm": 0.9593345009482851, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 39424 + }, + { + "epoch": 0.39425, + "grad_norm": 1.0516917304442652, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 39425 + }, + { + "epoch": 0.39426, + "grad_norm": 0.8801276196805193, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 39426 + }, + { + "epoch": 0.39427, + "grad_norm": 0.8590477379906473, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 39427 + }, + { + "epoch": 0.39428, + "grad_norm": 0.7650217470753636, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 39428 + }, + { + "epoch": 0.39429, + "grad_norm": 0.738964571621468, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 39429 + }, + { + "epoch": 0.3943, + "grad_norm": 0.7967061279108083, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 39430 + }, + { + "epoch": 0.39431, + "grad_norm": 0.8068902258140342, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 39431 + }, + { + "epoch": 0.39432, + "grad_norm": 0.805576741762713, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 39432 + }, + { + "epoch": 0.39433, + "grad_norm": 0.7306827858164376, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 39433 + }, + { + "epoch": 0.39434, + "grad_norm": 0.6974999451965005, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 39434 + }, + { + "epoch": 0.39435, + "grad_norm": 0.7141334276978409, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 39435 + }, + { + "epoch": 0.39436, + "grad_norm": 0.6907014559181129, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 39436 + }, + { + "epoch": 0.39437, + "grad_norm": 0.6838401450130062, + "learning_rate": 0.003, + "loss": 3.9934, + "step": 39437 + }, + { + "epoch": 0.39438, + "grad_norm": 0.7185604171194111, + "learning_rate": 0.003, + "loss": 4.034, + "step": 39438 + }, + { + "epoch": 0.39439, + "grad_norm": 0.7688005274321775, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 39439 + }, + { + "epoch": 0.3944, + "grad_norm": 0.7318561476134802, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 39440 + }, + { + "epoch": 0.39441, + "grad_norm": 0.6589869966119208, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 39441 + }, + { + "epoch": 0.39442, + "grad_norm": 0.6200500626538661, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 39442 + }, + { + "epoch": 0.39443, + "grad_norm": 0.6549471282408907, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 39443 + }, + { + "epoch": 0.39444, + "grad_norm": 0.6923043397090122, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 39444 + }, + { + "epoch": 0.39445, + "grad_norm": 0.8027258405727149, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 39445 + }, + { + "epoch": 0.39446, + "grad_norm": 1.0270303611723481, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 39446 + }, + { + "epoch": 0.39447, + "grad_norm": 1.2204078026231515, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 39447 + }, + { + "epoch": 0.39448, + "grad_norm": 0.7398873086032811, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 39448 + }, + { + "epoch": 0.39449, + "grad_norm": 0.7539813983121769, + "learning_rate": 0.003, + "loss": 4.042, + "step": 39449 + }, + { + "epoch": 0.3945, + "grad_norm": 0.7435148819037185, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 39450 + }, + { + "epoch": 0.39451, + "grad_norm": 0.80715833498868, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 39451 + }, + { + "epoch": 0.39452, + "grad_norm": 0.8031536288072921, + "learning_rate": 0.003, + "loss": 4.025, + "step": 39452 + }, + { + "epoch": 0.39453, + "grad_norm": 0.8404358532117692, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 39453 + }, + { + "epoch": 0.39454, + "grad_norm": 0.8568321623951292, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 39454 + }, + { + "epoch": 0.39455, + "grad_norm": 0.8119375571031879, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 39455 + }, + { + "epoch": 0.39456, + "grad_norm": 0.8015787494498071, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 39456 + }, + { + "epoch": 0.39457, + "grad_norm": 0.7427152616427766, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 39457 + }, + { + "epoch": 0.39458, + "grad_norm": 0.871323054281941, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 39458 + }, + { + "epoch": 0.39459, + "grad_norm": 1.138213817100867, + "learning_rate": 0.003, + "loss": 4.019, + "step": 39459 + }, + { + "epoch": 0.3946, + "grad_norm": 0.9263340964100857, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 39460 + }, + { + "epoch": 0.39461, + "grad_norm": 0.981862113811283, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 39461 + }, + { + "epoch": 0.39462, + "grad_norm": 1.2239620013265853, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 39462 + }, + { + "epoch": 0.39463, + "grad_norm": 0.9890095506595465, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 39463 + }, + { + "epoch": 0.39464, + "grad_norm": 0.8737538622515242, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 39464 + }, + { + "epoch": 0.39465, + "grad_norm": 0.7432712244733141, + "learning_rate": 0.003, + "loss": 4.032, + "step": 39465 + }, + { + "epoch": 0.39466, + "grad_norm": 0.7689320291062746, + "learning_rate": 0.003, + "loss": 4.031, + "step": 39466 + }, + { + "epoch": 0.39467, + "grad_norm": 0.784188252231891, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 39467 + }, + { + "epoch": 0.39468, + "grad_norm": 0.7801554803267352, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 39468 + }, + { + "epoch": 0.39469, + "grad_norm": 0.8344587015918459, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 39469 + }, + { + "epoch": 0.3947, + "grad_norm": 0.8555449920543886, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 39470 + }, + { + "epoch": 0.39471, + "grad_norm": 0.9027673352989717, + "learning_rate": 0.003, + "loss": 4.039, + "step": 39471 + }, + { + "epoch": 0.39472, + "grad_norm": 1.0063626289179541, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 39472 + }, + { + "epoch": 0.39473, + "grad_norm": 1.052560274168696, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 39473 + }, + { + "epoch": 0.39474, + "grad_norm": 1.0580266065265673, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 39474 + }, + { + "epoch": 0.39475, + "grad_norm": 0.8460938847514488, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 39475 + }, + { + "epoch": 0.39476, + "grad_norm": 0.8205513318265636, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 39476 + }, + { + "epoch": 0.39477, + "grad_norm": 0.8381204970070919, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 39477 + }, + { + "epoch": 0.39478, + "grad_norm": 0.7477756893691646, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 39478 + }, + { + "epoch": 0.39479, + "grad_norm": 0.7670061391152819, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 39479 + }, + { + "epoch": 0.3948, + "grad_norm": 0.9093391548560923, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 39480 + }, + { + "epoch": 0.39481, + "grad_norm": 1.1552288354553872, + "learning_rate": 0.003, + "loss": 4.011, + "step": 39481 + }, + { + "epoch": 0.39482, + "grad_norm": 1.0781379426541078, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 39482 + }, + { + "epoch": 0.39483, + "grad_norm": 1.0607654142451035, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 39483 + }, + { + "epoch": 0.39484, + "grad_norm": 1.0322465289093847, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 39484 + }, + { + "epoch": 0.39485, + "grad_norm": 0.9458857024549181, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 39485 + }, + { + "epoch": 0.39486, + "grad_norm": 0.7490730001356407, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 39486 + }, + { + "epoch": 0.39487, + "grad_norm": 0.6338629505305216, + "learning_rate": 0.003, + "loss": 4.046, + "step": 39487 + }, + { + "epoch": 0.39488, + "grad_norm": 0.6709451776255178, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 39488 + }, + { + "epoch": 0.39489, + "grad_norm": 0.5652422934687186, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 39489 + }, + { + "epoch": 0.3949, + "grad_norm": 0.5801528028674817, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 39490 + }, + { + "epoch": 0.39491, + "grad_norm": 0.5371368375912677, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 39491 + }, + { + "epoch": 0.39492, + "grad_norm": 0.5287025768240695, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 39492 + }, + { + "epoch": 0.39493, + "grad_norm": 0.616594713835355, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 39493 + }, + { + "epoch": 0.39494, + "grad_norm": 0.7266683613104948, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 39494 + }, + { + "epoch": 0.39495, + "grad_norm": 0.9027906677006201, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 39495 + }, + { + "epoch": 0.39496, + "grad_norm": 0.9803134948334052, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 39496 + }, + { + "epoch": 0.39497, + "grad_norm": 0.9716390630268202, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 39497 + }, + { + "epoch": 0.39498, + "grad_norm": 0.9497995795616121, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 39498 + }, + { + "epoch": 0.39499, + "grad_norm": 0.7984195834958184, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 39499 + }, + { + "epoch": 0.395, + "grad_norm": 0.7496733662754286, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 39500 + }, + { + "epoch": 0.39501, + "grad_norm": 0.8513837570834071, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 39501 + }, + { + "epoch": 0.39502, + "grad_norm": 0.9484563654902626, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 39502 + }, + { + "epoch": 0.39503, + "grad_norm": 0.9987160659547023, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 39503 + }, + { + "epoch": 0.39504, + "grad_norm": 0.8309032359842076, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 39504 + }, + { + "epoch": 0.39505, + "grad_norm": 0.6568095597184768, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 39505 + }, + { + "epoch": 0.39506, + "grad_norm": 0.6980635676244686, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 39506 + }, + { + "epoch": 0.39507, + "grad_norm": 0.6543511536286895, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 39507 + }, + { + "epoch": 0.39508, + "grad_norm": 0.7289878362390467, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 39508 + }, + { + "epoch": 0.39509, + "grad_norm": 0.8506912607063609, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 39509 + }, + { + "epoch": 0.3951, + "grad_norm": 1.085518423275409, + "learning_rate": 0.003, + "loss": 4.034, + "step": 39510 + }, + { + "epoch": 0.39511, + "grad_norm": 0.9742278160906207, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 39511 + }, + { + "epoch": 0.39512, + "grad_norm": 0.9281443807103495, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 39512 + }, + { + "epoch": 0.39513, + "grad_norm": 0.8861323259251069, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 39513 + }, + { + "epoch": 0.39514, + "grad_norm": 0.9030130323027508, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 39514 + }, + { + "epoch": 0.39515, + "grad_norm": 0.9289085342161861, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 39515 + }, + { + "epoch": 0.39516, + "grad_norm": 0.9151383455679437, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 39516 + }, + { + "epoch": 0.39517, + "grad_norm": 0.824994047498528, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 39517 + }, + { + "epoch": 0.39518, + "grad_norm": 0.8995129517239198, + "learning_rate": 0.003, + "loss": 4.043, + "step": 39518 + }, + { + "epoch": 0.39519, + "grad_norm": 0.8891123635045024, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 39519 + }, + { + "epoch": 0.3952, + "grad_norm": 0.9371076911791433, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 39520 + }, + { + "epoch": 0.39521, + "grad_norm": 0.9595739589521074, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 39521 + }, + { + "epoch": 0.39522, + "grad_norm": 1.0229033019762637, + "learning_rate": 0.003, + "loss": 4.0743, + "step": 39522 + }, + { + "epoch": 0.39523, + "grad_norm": 1.0150410761866633, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 39523 + }, + { + "epoch": 0.39524, + "grad_norm": 0.9285818951857724, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 39524 + }, + { + "epoch": 0.39525, + "grad_norm": 0.8934782039791924, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 39525 + }, + { + "epoch": 0.39526, + "grad_norm": 0.8035953648007976, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 39526 + }, + { + "epoch": 0.39527, + "grad_norm": 0.7530111795301493, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 39527 + }, + { + "epoch": 0.39528, + "grad_norm": 0.8084357885769176, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 39528 + }, + { + "epoch": 0.39529, + "grad_norm": 0.7697972992352837, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 39529 + }, + { + "epoch": 0.3953, + "grad_norm": 0.783466981550709, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 39530 + }, + { + "epoch": 0.39531, + "grad_norm": 0.8843001874260982, + "learning_rate": 0.003, + "loss": 4.039, + "step": 39531 + }, + { + "epoch": 0.39532, + "grad_norm": 0.9838628827543381, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 39532 + }, + { + "epoch": 0.39533, + "grad_norm": 0.9049331504121452, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 39533 + }, + { + "epoch": 0.39534, + "grad_norm": 0.7416893499563243, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 39534 + }, + { + "epoch": 0.39535, + "grad_norm": 0.7916245631535814, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 39535 + }, + { + "epoch": 0.39536, + "grad_norm": 0.7837589001819356, + "learning_rate": 0.003, + "loss": 4.0807, + "step": 39536 + }, + { + "epoch": 0.39537, + "grad_norm": 0.793993411795225, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 39537 + }, + { + "epoch": 0.39538, + "grad_norm": 0.897817327167971, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 39538 + }, + { + "epoch": 0.39539, + "grad_norm": 0.9242046918398277, + "learning_rate": 0.003, + "loss": 4.048, + "step": 39539 + }, + { + "epoch": 0.3954, + "grad_norm": 0.9321243841065555, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 39540 + }, + { + "epoch": 0.39541, + "grad_norm": 0.774730987696862, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 39541 + }, + { + "epoch": 0.39542, + "grad_norm": 0.6388457193380761, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 39542 + }, + { + "epoch": 0.39543, + "grad_norm": 0.6219159254360935, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 39543 + }, + { + "epoch": 0.39544, + "grad_norm": 0.5789086642625683, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 39544 + }, + { + "epoch": 0.39545, + "grad_norm": 0.5980817835926477, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 39545 + }, + { + "epoch": 0.39546, + "grad_norm": 0.5851739809283257, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 39546 + }, + { + "epoch": 0.39547, + "grad_norm": 0.6553212695554235, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 39547 + }, + { + "epoch": 0.39548, + "grad_norm": 0.7296296703475267, + "learning_rate": 0.003, + "loss": 3.9939, + "step": 39548 + }, + { + "epoch": 0.39549, + "grad_norm": 0.7031603632074337, + "learning_rate": 0.003, + "loss": 3.9928, + "step": 39549 + }, + { + "epoch": 0.3955, + "grad_norm": 0.6589191993926222, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 39550 + }, + { + "epoch": 0.39551, + "grad_norm": 0.797528112086645, + "learning_rate": 0.003, + "loss": 4.001, + "step": 39551 + }, + { + "epoch": 0.39552, + "grad_norm": 1.0588848607784989, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 39552 + }, + { + "epoch": 0.39553, + "grad_norm": 1.0375751563250726, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 39553 + }, + { + "epoch": 0.39554, + "grad_norm": 0.8408382531614119, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 39554 + }, + { + "epoch": 0.39555, + "grad_norm": 0.8546048731299055, + "learning_rate": 0.003, + "loss": 4.001, + "step": 39555 + }, + { + "epoch": 0.39556, + "grad_norm": 0.9583811156735228, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 39556 + }, + { + "epoch": 0.39557, + "grad_norm": 0.9530110632766503, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 39557 + }, + { + "epoch": 0.39558, + "grad_norm": 0.8662942793488251, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 39558 + }, + { + "epoch": 0.39559, + "grad_norm": 0.7804559406428531, + "learning_rate": 0.003, + "loss": 4.0019, + "step": 39559 + }, + { + "epoch": 0.3956, + "grad_norm": 0.709246552604697, + "learning_rate": 0.003, + "loss": 4.0046, + "step": 39560 + }, + { + "epoch": 0.39561, + "grad_norm": 0.7035233436439462, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 39561 + }, + { + "epoch": 0.39562, + "grad_norm": 0.7616058070439903, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 39562 + }, + { + "epoch": 0.39563, + "grad_norm": 0.7197737462637749, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 39563 + }, + { + "epoch": 0.39564, + "grad_norm": 0.6301106141628644, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 39564 + }, + { + "epoch": 0.39565, + "grad_norm": 0.7753779159475381, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 39565 + }, + { + "epoch": 0.39566, + "grad_norm": 0.7747849858438901, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 39566 + }, + { + "epoch": 0.39567, + "grad_norm": 0.7076978361745392, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 39567 + }, + { + "epoch": 0.39568, + "grad_norm": 0.5900869976935981, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 39568 + }, + { + "epoch": 0.39569, + "grad_norm": 0.6657978496355995, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 39569 + }, + { + "epoch": 0.3957, + "grad_norm": 0.6262424195768134, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 39570 + }, + { + "epoch": 0.39571, + "grad_norm": 0.6229881044800404, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 39571 + }, + { + "epoch": 0.39572, + "grad_norm": 0.6506346637604712, + "learning_rate": 0.003, + "loss": 4.015, + "step": 39572 + }, + { + "epoch": 0.39573, + "grad_norm": 0.703210095353316, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 39573 + }, + { + "epoch": 0.39574, + "grad_norm": 0.7899677500737662, + "learning_rate": 0.003, + "loss": 4.012, + "step": 39574 + }, + { + "epoch": 0.39575, + "grad_norm": 0.9406777043346433, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 39575 + }, + { + "epoch": 0.39576, + "grad_norm": 1.2265906656914343, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 39576 + }, + { + "epoch": 0.39577, + "grad_norm": 0.9593832085829892, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 39577 + }, + { + "epoch": 0.39578, + "grad_norm": 1.359846607404065, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 39578 + }, + { + "epoch": 0.39579, + "grad_norm": 0.7193722518216393, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 39579 + }, + { + "epoch": 0.3958, + "grad_norm": 0.6245116528581971, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 39580 + }, + { + "epoch": 0.39581, + "grad_norm": 0.7402539041113921, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 39581 + }, + { + "epoch": 0.39582, + "grad_norm": 0.915496967782783, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 39582 + }, + { + "epoch": 0.39583, + "grad_norm": 1.1927707963872576, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 39583 + }, + { + "epoch": 0.39584, + "grad_norm": 0.7771807388378839, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 39584 + }, + { + "epoch": 0.39585, + "grad_norm": 0.7435342729449983, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 39585 + }, + { + "epoch": 0.39586, + "grad_norm": 0.7956312981568532, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 39586 + }, + { + "epoch": 0.39587, + "grad_norm": 0.8290053147036375, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 39587 + }, + { + "epoch": 0.39588, + "grad_norm": 0.7609476916212884, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 39588 + }, + { + "epoch": 0.39589, + "grad_norm": 0.8769718448919175, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 39589 + }, + { + "epoch": 0.3959, + "grad_norm": 1.1104880057266198, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 39590 + }, + { + "epoch": 0.39591, + "grad_norm": 0.8517507350697395, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 39591 + }, + { + "epoch": 0.39592, + "grad_norm": 0.605401237309219, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 39592 + }, + { + "epoch": 0.39593, + "grad_norm": 0.5924368845801439, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 39593 + }, + { + "epoch": 0.39594, + "grad_norm": 0.6926688197102729, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 39594 + }, + { + "epoch": 0.39595, + "grad_norm": 0.8061486398625709, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 39595 + }, + { + "epoch": 0.39596, + "grad_norm": 0.792259550596706, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 39596 + }, + { + "epoch": 0.39597, + "grad_norm": 0.7395664326625887, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 39597 + }, + { + "epoch": 0.39598, + "grad_norm": 0.714073115130659, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 39598 + }, + { + "epoch": 0.39599, + "grad_norm": 0.6979941286288214, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 39599 + }, + { + "epoch": 0.396, + "grad_norm": 0.8374703820307141, + "learning_rate": 0.003, + "loss": 4.012, + "step": 39600 + }, + { + "epoch": 0.39601, + "grad_norm": 0.8185602153526833, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 39601 + }, + { + "epoch": 0.39602, + "grad_norm": 0.7919210851125984, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 39602 + }, + { + "epoch": 0.39603, + "grad_norm": 0.7552562070898411, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 39603 + }, + { + "epoch": 0.39604, + "grad_norm": 0.8427448145609328, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 39604 + }, + { + "epoch": 0.39605, + "grad_norm": 1.0589172241562845, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 39605 + }, + { + "epoch": 0.39606, + "grad_norm": 1.1357098862229262, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 39606 + }, + { + "epoch": 0.39607, + "grad_norm": 0.8752852920134031, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 39607 + }, + { + "epoch": 0.39608, + "grad_norm": 0.9163446356335886, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 39608 + }, + { + "epoch": 0.39609, + "grad_norm": 1.1371957394969663, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 39609 + }, + { + "epoch": 0.3961, + "grad_norm": 1.024935080924332, + "learning_rate": 0.003, + "loss": 4.055, + "step": 39610 + }, + { + "epoch": 0.39611, + "grad_norm": 1.0405186588360154, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 39611 + }, + { + "epoch": 0.39612, + "grad_norm": 0.8336213036144519, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 39612 + }, + { + "epoch": 0.39613, + "grad_norm": 0.7733351875552787, + "learning_rate": 0.003, + "loss": 4.027, + "step": 39613 + }, + { + "epoch": 0.39614, + "grad_norm": 0.6721358201114858, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 39614 + }, + { + "epoch": 0.39615, + "grad_norm": 0.731487984581066, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 39615 + }, + { + "epoch": 0.39616, + "grad_norm": 0.6911663396315415, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 39616 + }, + { + "epoch": 0.39617, + "grad_norm": 0.72756667963201, + "learning_rate": 0.003, + "loss": 4.042, + "step": 39617 + }, + { + "epoch": 0.39618, + "grad_norm": 0.7521646068486387, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 39618 + }, + { + "epoch": 0.39619, + "grad_norm": 0.7837254960912847, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 39619 + }, + { + "epoch": 0.3962, + "grad_norm": 0.8613577603293697, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 39620 + }, + { + "epoch": 0.39621, + "grad_norm": 0.9840925488843336, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 39621 + }, + { + "epoch": 0.39622, + "grad_norm": 1.1898493718497576, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 39622 + }, + { + "epoch": 0.39623, + "grad_norm": 0.8566663630507108, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 39623 + }, + { + "epoch": 0.39624, + "grad_norm": 0.8124318693383002, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 39624 + }, + { + "epoch": 0.39625, + "grad_norm": 0.6775428581463909, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 39625 + }, + { + "epoch": 0.39626, + "grad_norm": 0.6237370074457264, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 39626 + }, + { + "epoch": 0.39627, + "grad_norm": 0.6961567798983064, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 39627 + }, + { + "epoch": 0.39628, + "grad_norm": 0.6127708155201291, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 39628 + }, + { + "epoch": 0.39629, + "grad_norm": 0.6584641677875189, + "learning_rate": 0.003, + "loss": 3.995, + "step": 39629 + }, + { + "epoch": 0.3963, + "grad_norm": 0.6741383614704889, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 39630 + }, + { + "epoch": 0.39631, + "grad_norm": 0.8300915088588311, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 39631 + }, + { + "epoch": 0.39632, + "grad_norm": 0.9750641198078834, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 39632 + }, + { + "epoch": 0.39633, + "grad_norm": 1.0755110647303832, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 39633 + }, + { + "epoch": 0.39634, + "grad_norm": 0.9694840941960865, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 39634 + }, + { + "epoch": 0.39635, + "grad_norm": 0.9026468674719017, + "learning_rate": 0.003, + "loss": 4.046, + "step": 39635 + }, + { + "epoch": 0.39636, + "grad_norm": 0.8408562502244536, + "learning_rate": 0.003, + "loss": 4.024, + "step": 39636 + }, + { + "epoch": 0.39637, + "grad_norm": 0.7538094263211942, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 39637 + }, + { + "epoch": 0.39638, + "grad_norm": 0.7597344041867743, + "learning_rate": 0.003, + "loss": 3.9979, + "step": 39638 + }, + { + "epoch": 0.39639, + "grad_norm": 0.630460530032284, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 39639 + }, + { + "epoch": 0.3964, + "grad_norm": 0.6020483386275269, + "learning_rate": 0.003, + "loss": 4.026, + "step": 39640 + }, + { + "epoch": 0.39641, + "grad_norm": 0.549179810905212, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 39641 + }, + { + "epoch": 0.39642, + "grad_norm": 0.5710334186560586, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 39642 + }, + { + "epoch": 0.39643, + "grad_norm": 0.558878708317826, + "learning_rate": 0.003, + "loss": 3.9967, + "step": 39643 + }, + { + "epoch": 0.39644, + "grad_norm": 0.650745003026192, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 39644 + }, + { + "epoch": 0.39645, + "grad_norm": 0.8093004415878016, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 39645 + }, + { + "epoch": 0.39646, + "grad_norm": 0.9441135434476725, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 39646 + }, + { + "epoch": 0.39647, + "grad_norm": 1.1793505819555985, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 39647 + }, + { + "epoch": 0.39648, + "grad_norm": 1.0338484585251289, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 39648 + }, + { + "epoch": 0.39649, + "grad_norm": 0.8569954745771187, + "learning_rate": 0.003, + "loss": 4.0001, + "step": 39649 + }, + { + "epoch": 0.3965, + "grad_norm": 0.8531374729079778, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 39650 + }, + { + "epoch": 0.39651, + "grad_norm": 0.764428843621491, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 39651 + }, + { + "epoch": 0.39652, + "grad_norm": 0.7215626705185979, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 39652 + }, + { + "epoch": 0.39653, + "grad_norm": 0.7373967074361513, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 39653 + }, + { + "epoch": 0.39654, + "grad_norm": 0.939988486345877, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 39654 + }, + { + "epoch": 0.39655, + "grad_norm": 0.9629836900764179, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 39655 + }, + { + "epoch": 0.39656, + "grad_norm": 0.9514539481826458, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 39656 + }, + { + "epoch": 0.39657, + "grad_norm": 0.8960786101413772, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 39657 + }, + { + "epoch": 0.39658, + "grad_norm": 0.8616418994886229, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 39658 + }, + { + "epoch": 0.39659, + "grad_norm": 0.7593595051394757, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 39659 + }, + { + "epoch": 0.3966, + "grad_norm": 0.7819069564722244, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 39660 + }, + { + "epoch": 0.39661, + "grad_norm": 0.8019883760010448, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 39661 + }, + { + "epoch": 0.39662, + "grad_norm": 0.8676566156399131, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 39662 + }, + { + "epoch": 0.39663, + "grad_norm": 1.0215829076965695, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 39663 + }, + { + "epoch": 0.39664, + "grad_norm": 1.0536366968246937, + "learning_rate": 0.003, + "loss": 4.057, + "step": 39664 + }, + { + "epoch": 0.39665, + "grad_norm": 1.1261882459978065, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 39665 + }, + { + "epoch": 0.39666, + "grad_norm": 0.9885816093712124, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 39666 + }, + { + "epoch": 0.39667, + "grad_norm": 1.1067542700666984, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 39667 + }, + { + "epoch": 0.39668, + "grad_norm": 0.8446447113266563, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 39668 + }, + { + "epoch": 0.39669, + "grad_norm": 0.7562508389208932, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 39669 + }, + { + "epoch": 0.3967, + "grad_norm": 0.8045272383897542, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 39670 + }, + { + "epoch": 0.39671, + "grad_norm": 0.9279835717882874, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 39671 + }, + { + "epoch": 0.39672, + "grad_norm": 1.208282844910523, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 39672 + }, + { + "epoch": 0.39673, + "grad_norm": 0.7570671844494334, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 39673 + }, + { + "epoch": 0.39674, + "grad_norm": 0.7779769284560004, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 39674 + }, + { + "epoch": 0.39675, + "grad_norm": 0.9351740540465701, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 39675 + }, + { + "epoch": 0.39676, + "grad_norm": 0.8990906954908355, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 39676 + }, + { + "epoch": 0.39677, + "grad_norm": 0.9266056241347902, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 39677 + }, + { + "epoch": 0.39678, + "grad_norm": 0.9603765378486661, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 39678 + }, + { + "epoch": 0.39679, + "grad_norm": 0.8670324705987328, + "learning_rate": 0.003, + "loss": 4.056, + "step": 39679 + }, + { + "epoch": 0.3968, + "grad_norm": 0.781254138708569, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 39680 + }, + { + "epoch": 0.39681, + "grad_norm": 0.8083583389828725, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 39681 + }, + { + "epoch": 0.39682, + "grad_norm": 0.8865371919783337, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 39682 + }, + { + "epoch": 0.39683, + "grad_norm": 0.9805538358005509, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 39683 + }, + { + "epoch": 0.39684, + "grad_norm": 1.0416063290812576, + "learning_rate": 0.003, + "loss": 4.059, + "step": 39684 + }, + { + "epoch": 0.39685, + "grad_norm": 0.9000135692450478, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 39685 + }, + { + "epoch": 0.39686, + "grad_norm": 1.0003807669093254, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 39686 + }, + { + "epoch": 0.39687, + "grad_norm": 1.0768883952970687, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 39687 + }, + { + "epoch": 0.39688, + "grad_norm": 0.8010417051509251, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 39688 + }, + { + "epoch": 0.39689, + "grad_norm": 0.7122293227642746, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 39689 + }, + { + "epoch": 0.3969, + "grad_norm": 0.6382364359468051, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 39690 + }, + { + "epoch": 0.39691, + "grad_norm": 0.6219767108342775, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 39691 + }, + { + "epoch": 0.39692, + "grad_norm": 0.6512133985951507, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 39692 + }, + { + "epoch": 0.39693, + "grad_norm": 0.6699951116578179, + "learning_rate": 0.003, + "loss": 3.976, + "step": 39693 + }, + { + "epoch": 0.39694, + "grad_norm": 0.7044316371578512, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 39694 + }, + { + "epoch": 0.39695, + "grad_norm": 0.7100431980786719, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 39695 + }, + { + "epoch": 0.39696, + "grad_norm": 0.6727108476515086, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 39696 + }, + { + "epoch": 0.39697, + "grad_norm": 0.7830173703964041, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 39697 + }, + { + "epoch": 0.39698, + "grad_norm": 0.9061312140166126, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 39698 + }, + { + "epoch": 0.39699, + "grad_norm": 1.081821218339152, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 39699 + }, + { + "epoch": 0.397, + "grad_norm": 1.0330154389404589, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 39700 + }, + { + "epoch": 0.39701, + "grad_norm": 0.797191863708295, + "learning_rate": 0.003, + "loss": 4.033, + "step": 39701 + }, + { + "epoch": 0.39702, + "grad_norm": 0.6945525083564948, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 39702 + }, + { + "epoch": 0.39703, + "grad_norm": 0.819966312177038, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 39703 + }, + { + "epoch": 0.39704, + "grad_norm": 0.9884537246381906, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 39704 + }, + { + "epoch": 0.39705, + "grad_norm": 1.0336595571748022, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 39705 + }, + { + "epoch": 0.39706, + "grad_norm": 0.9133849173453611, + "learning_rate": 0.003, + "loss": 4.029, + "step": 39706 + }, + { + "epoch": 0.39707, + "grad_norm": 0.8321479163214129, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 39707 + }, + { + "epoch": 0.39708, + "grad_norm": 0.7936330298088261, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 39708 + }, + { + "epoch": 0.39709, + "grad_norm": 0.6766863254980472, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 39709 + }, + { + "epoch": 0.3971, + "grad_norm": 0.7034111488290166, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 39710 + }, + { + "epoch": 0.39711, + "grad_norm": 0.6979725219412303, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 39711 + }, + { + "epoch": 0.39712, + "grad_norm": 0.6891166325593887, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 39712 + }, + { + "epoch": 0.39713, + "grad_norm": 0.7700316155983302, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 39713 + }, + { + "epoch": 0.39714, + "grad_norm": 0.9313447137063859, + "learning_rate": 0.003, + "loss": 4.058, + "step": 39714 + }, + { + "epoch": 0.39715, + "grad_norm": 0.9898396915452405, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 39715 + }, + { + "epoch": 0.39716, + "grad_norm": 1.0512026679102813, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 39716 + }, + { + "epoch": 0.39717, + "grad_norm": 0.9599935845513682, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 39717 + }, + { + "epoch": 0.39718, + "grad_norm": 0.9087378992939291, + "learning_rate": 0.003, + "loss": 4.0708, + "step": 39718 + }, + { + "epoch": 0.39719, + "grad_norm": 0.8412834131606616, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 39719 + }, + { + "epoch": 0.3972, + "grad_norm": 0.856833673672133, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 39720 + }, + { + "epoch": 0.39721, + "grad_norm": 0.9318706632437175, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 39721 + }, + { + "epoch": 0.39722, + "grad_norm": 0.9281511936434623, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 39722 + }, + { + "epoch": 0.39723, + "grad_norm": 0.9446097553774266, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 39723 + }, + { + "epoch": 0.39724, + "grad_norm": 0.9073398895840605, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 39724 + }, + { + "epoch": 0.39725, + "grad_norm": 0.9181183531913965, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 39725 + }, + { + "epoch": 0.39726, + "grad_norm": 0.8948259322239421, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 39726 + }, + { + "epoch": 0.39727, + "grad_norm": 0.8572018859049785, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 39727 + }, + { + "epoch": 0.39728, + "grad_norm": 0.7713600751328611, + "learning_rate": 0.003, + "loss": 4.034, + "step": 39728 + }, + { + "epoch": 0.39729, + "grad_norm": 0.7053117015873317, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 39729 + }, + { + "epoch": 0.3973, + "grad_norm": 0.7939511037495691, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 39730 + }, + { + "epoch": 0.39731, + "grad_norm": 0.9448307497567878, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 39731 + }, + { + "epoch": 0.39732, + "grad_norm": 0.977595151280107, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 39732 + }, + { + "epoch": 0.39733, + "grad_norm": 0.9108976660621556, + "learning_rate": 0.003, + "loss": 4.029, + "step": 39733 + }, + { + "epoch": 0.39734, + "grad_norm": 0.9474023071925258, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 39734 + }, + { + "epoch": 0.39735, + "grad_norm": 0.8261179972914429, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 39735 + }, + { + "epoch": 0.39736, + "grad_norm": 0.8391253850311942, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 39736 + }, + { + "epoch": 0.39737, + "grad_norm": 0.8653481628631193, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 39737 + }, + { + "epoch": 0.39738, + "grad_norm": 0.9139004064178592, + "learning_rate": 0.003, + "loss": 4.02, + "step": 39738 + }, + { + "epoch": 0.39739, + "grad_norm": 0.900651654471135, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 39739 + }, + { + "epoch": 0.3974, + "grad_norm": 0.7590751126533718, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 39740 + }, + { + "epoch": 0.39741, + "grad_norm": 0.6327904681633292, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 39741 + }, + { + "epoch": 0.39742, + "grad_norm": 0.671280454437266, + "learning_rate": 0.003, + "loss": 4.0048, + "step": 39742 + }, + { + "epoch": 0.39743, + "grad_norm": 0.722529887458883, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 39743 + }, + { + "epoch": 0.39744, + "grad_norm": 0.7395302473212528, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 39744 + }, + { + "epoch": 0.39745, + "grad_norm": 0.7650699113151544, + "learning_rate": 0.003, + "loss": 3.9924, + "step": 39745 + }, + { + "epoch": 0.39746, + "grad_norm": 0.7756426537906189, + "learning_rate": 0.003, + "loss": 4.01, + "step": 39746 + }, + { + "epoch": 0.39747, + "grad_norm": 0.8013838908772996, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 39747 + }, + { + "epoch": 0.39748, + "grad_norm": 0.810558495847806, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 39748 + }, + { + "epoch": 0.39749, + "grad_norm": 0.9598672063500764, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 39749 + }, + { + "epoch": 0.3975, + "grad_norm": 1.0703718789446794, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 39750 + }, + { + "epoch": 0.39751, + "grad_norm": 0.8623834489659691, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 39751 + }, + { + "epoch": 0.39752, + "grad_norm": 0.7670584546739172, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 39752 + }, + { + "epoch": 0.39753, + "grad_norm": 0.6665852910990602, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 39753 + }, + { + "epoch": 0.39754, + "grad_norm": 0.6310826190719662, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 39754 + }, + { + "epoch": 0.39755, + "grad_norm": 0.6615296356055345, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 39755 + }, + { + "epoch": 0.39756, + "grad_norm": 0.7374640282917273, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 39756 + }, + { + "epoch": 0.39757, + "grad_norm": 0.7910750702470345, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 39757 + }, + { + "epoch": 0.39758, + "grad_norm": 0.8384149483612041, + "learning_rate": 0.003, + "loss": 4.0002, + "step": 39758 + }, + { + "epoch": 0.39759, + "grad_norm": 0.7988181511521107, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 39759 + }, + { + "epoch": 0.3976, + "grad_norm": 0.7514143778795332, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 39760 + }, + { + "epoch": 0.39761, + "grad_norm": 0.6893365404009618, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 39761 + }, + { + "epoch": 0.39762, + "grad_norm": 0.6909472946649564, + "learning_rate": 0.003, + "loss": 4.051, + "step": 39762 + }, + { + "epoch": 0.39763, + "grad_norm": 0.672892974173692, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 39763 + }, + { + "epoch": 0.39764, + "grad_norm": 0.6510648197950795, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 39764 + }, + { + "epoch": 0.39765, + "grad_norm": 0.612294862534544, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 39765 + }, + { + "epoch": 0.39766, + "grad_norm": 0.6604395926336303, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 39766 + }, + { + "epoch": 0.39767, + "grad_norm": 0.8770225192356241, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 39767 + }, + { + "epoch": 0.39768, + "grad_norm": 1.0855844158711359, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 39768 + }, + { + "epoch": 0.39769, + "grad_norm": 0.9140811926234591, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 39769 + }, + { + "epoch": 0.3977, + "grad_norm": 0.9125073193467402, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 39770 + }, + { + "epoch": 0.39771, + "grad_norm": 0.8708436920087413, + "learning_rate": 0.003, + "loss": 4.051, + "step": 39771 + }, + { + "epoch": 0.39772, + "grad_norm": 0.8200750656220751, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 39772 + }, + { + "epoch": 0.39773, + "grad_norm": 0.8537371026953673, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 39773 + }, + { + "epoch": 0.39774, + "grad_norm": 0.9270580278538741, + "learning_rate": 0.003, + "loss": 4.028, + "step": 39774 + }, + { + "epoch": 0.39775, + "grad_norm": 0.9479376840293395, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 39775 + }, + { + "epoch": 0.39776, + "grad_norm": 0.9018613959541131, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 39776 + }, + { + "epoch": 0.39777, + "grad_norm": 0.9824152857708035, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 39777 + }, + { + "epoch": 0.39778, + "grad_norm": 1.1209142558601453, + "learning_rate": 0.003, + "loss": 4.038, + "step": 39778 + }, + { + "epoch": 0.39779, + "grad_norm": 1.0463296557086672, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 39779 + }, + { + "epoch": 0.3978, + "grad_norm": 0.9455119375703926, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 39780 + }, + { + "epoch": 0.39781, + "grad_norm": 1.0853234785548118, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 39781 + }, + { + "epoch": 0.39782, + "grad_norm": 1.070025030566966, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 39782 + }, + { + "epoch": 0.39783, + "grad_norm": 0.9547713823810766, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 39783 + }, + { + "epoch": 0.39784, + "grad_norm": 1.178585797388636, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 39784 + }, + { + "epoch": 0.39785, + "grad_norm": 0.9091385796845208, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 39785 + }, + { + "epoch": 0.39786, + "grad_norm": 0.8305098157682966, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 39786 + }, + { + "epoch": 0.39787, + "grad_norm": 0.8308271250073269, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 39787 + }, + { + "epoch": 0.39788, + "grad_norm": 0.9992170925517697, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 39788 + }, + { + "epoch": 0.39789, + "grad_norm": 1.1274957221965825, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 39789 + }, + { + "epoch": 0.3979, + "grad_norm": 0.9615562863777863, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 39790 + }, + { + "epoch": 0.39791, + "grad_norm": 0.9088445082292439, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 39791 + }, + { + "epoch": 0.39792, + "grad_norm": 0.7986155731742246, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 39792 + }, + { + "epoch": 0.39793, + "grad_norm": 0.7577486311455848, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 39793 + }, + { + "epoch": 0.39794, + "grad_norm": 0.6379879677758522, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 39794 + }, + { + "epoch": 0.39795, + "grad_norm": 0.6161329557502638, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 39795 + }, + { + "epoch": 0.39796, + "grad_norm": 0.580625443184172, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 39796 + }, + { + "epoch": 0.39797, + "grad_norm": 0.5510704953403915, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 39797 + }, + { + "epoch": 0.39798, + "grad_norm": 0.49437360292642346, + "learning_rate": 0.003, + "loss": 3.9994, + "step": 39798 + }, + { + "epoch": 0.39799, + "grad_norm": 0.5231712906501177, + "learning_rate": 0.003, + "loss": 4.018, + "step": 39799 + }, + { + "epoch": 0.398, + "grad_norm": 0.5484121039494089, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 39800 + }, + { + "epoch": 0.39801, + "grad_norm": 0.7021880078262991, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 39801 + }, + { + "epoch": 0.39802, + "grad_norm": 0.9678454967625654, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 39802 + }, + { + "epoch": 0.39803, + "grad_norm": 1.3790048643442543, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 39803 + }, + { + "epoch": 0.39804, + "grad_norm": 0.524331662439947, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 39804 + }, + { + "epoch": 0.39805, + "grad_norm": 0.7348463755745545, + "learning_rate": 0.003, + "loss": 4.014, + "step": 39805 + }, + { + "epoch": 0.39806, + "grad_norm": 0.8799469448382179, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 39806 + }, + { + "epoch": 0.39807, + "grad_norm": 0.8571319227860412, + "learning_rate": 0.003, + "loss": 4.065, + "step": 39807 + }, + { + "epoch": 0.39808, + "grad_norm": 0.7868605041509095, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 39808 + }, + { + "epoch": 0.39809, + "grad_norm": 0.6660940262773337, + "learning_rate": 0.003, + "loss": 3.9942, + "step": 39809 + }, + { + "epoch": 0.3981, + "grad_norm": 0.6371319594233679, + "learning_rate": 0.003, + "loss": 4.014, + "step": 39810 + }, + { + "epoch": 0.39811, + "grad_norm": 0.6768299041932805, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 39811 + }, + { + "epoch": 0.39812, + "grad_norm": 0.7384334788676582, + "learning_rate": 0.003, + "loss": 3.9805, + "step": 39812 + }, + { + "epoch": 0.39813, + "grad_norm": 0.7885881974875533, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 39813 + }, + { + "epoch": 0.39814, + "grad_norm": 0.863003633450937, + "learning_rate": 0.003, + "loss": 4.023, + "step": 39814 + }, + { + "epoch": 0.39815, + "grad_norm": 0.8347501723434039, + "learning_rate": 0.003, + "loss": 4.038, + "step": 39815 + }, + { + "epoch": 0.39816, + "grad_norm": 0.7933301787420708, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 39816 + }, + { + "epoch": 0.39817, + "grad_norm": 0.7762223887757017, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 39817 + }, + { + "epoch": 0.39818, + "grad_norm": 1.0333987020494657, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 39818 + }, + { + "epoch": 0.39819, + "grad_norm": 1.1359299857170353, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 39819 + }, + { + "epoch": 0.3982, + "grad_norm": 0.9308881857349262, + "learning_rate": 0.003, + "loss": 4.036, + "step": 39820 + }, + { + "epoch": 0.39821, + "grad_norm": 0.776724192616154, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 39821 + }, + { + "epoch": 0.39822, + "grad_norm": 0.7652699492062214, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 39822 + }, + { + "epoch": 0.39823, + "grad_norm": 0.8051894573257433, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 39823 + }, + { + "epoch": 0.39824, + "grad_norm": 0.9244518735468559, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 39824 + }, + { + "epoch": 0.39825, + "grad_norm": 0.943816182687109, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 39825 + }, + { + "epoch": 0.39826, + "grad_norm": 1.0530708446320325, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 39826 + }, + { + "epoch": 0.39827, + "grad_norm": 0.9272475163164513, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 39827 + }, + { + "epoch": 0.39828, + "grad_norm": 0.888137081183261, + "learning_rate": 0.003, + "loss": 3.9927, + "step": 39828 + }, + { + "epoch": 0.39829, + "grad_norm": 0.810247875798411, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 39829 + }, + { + "epoch": 0.3983, + "grad_norm": 0.7084929685639022, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 39830 + }, + { + "epoch": 0.39831, + "grad_norm": 0.6325799393172306, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 39831 + }, + { + "epoch": 0.39832, + "grad_norm": 0.6653149768395196, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 39832 + }, + { + "epoch": 0.39833, + "grad_norm": 0.7366892729752627, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 39833 + }, + { + "epoch": 0.39834, + "grad_norm": 0.8246688200280695, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 39834 + }, + { + "epoch": 0.39835, + "grad_norm": 0.8439261354151786, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 39835 + }, + { + "epoch": 0.39836, + "grad_norm": 0.8136327407061245, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 39836 + }, + { + "epoch": 0.39837, + "grad_norm": 0.9229641348557016, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 39837 + }, + { + "epoch": 0.39838, + "grad_norm": 0.9276789550366452, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 39838 + }, + { + "epoch": 0.39839, + "grad_norm": 0.8757321526296263, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 39839 + }, + { + "epoch": 0.3984, + "grad_norm": 0.7863699153851665, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 39840 + }, + { + "epoch": 0.39841, + "grad_norm": 0.8342542141126283, + "learning_rate": 0.003, + "loss": 4.038, + "step": 39841 + }, + { + "epoch": 0.39842, + "grad_norm": 0.760773185948885, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 39842 + }, + { + "epoch": 0.39843, + "grad_norm": 0.6538543939208417, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 39843 + }, + { + "epoch": 0.39844, + "grad_norm": 0.6803781187416817, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 39844 + }, + { + "epoch": 0.39845, + "grad_norm": 0.7397430767588763, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 39845 + }, + { + "epoch": 0.39846, + "grad_norm": 0.8062491335295962, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 39846 + }, + { + "epoch": 0.39847, + "grad_norm": 0.7663830435547425, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 39847 + }, + { + "epoch": 0.39848, + "grad_norm": 0.7722775652111563, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 39848 + }, + { + "epoch": 0.39849, + "grad_norm": 0.8488515984760915, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 39849 + }, + { + "epoch": 0.3985, + "grad_norm": 0.8776969986432923, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 39850 + }, + { + "epoch": 0.39851, + "grad_norm": 0.8632973212397749, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 39851 + }, + { + "epoch": 0.39852, + "grad_norm": 1.1596912512220043, + "learning_rate": 0.003, + "loss": 4.056, + "step": 39852 + }, + { + "epoch": 0.39853, + "grad_norm": 1.105611836548117, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 39853 + }, + { + "epoch": 0.39854, + "grad_norm": 0.9225457092060736, + "learning_rate": 0.003, + "loss": 4.059, + "step": 39854 + }, + { + "epoch": 0.39855, + "grad_norm": 0.8643341271837051, + "learning_rate": 0.003, + "loss": 4.02, + "step": 39855 + }, + { + "epoch": 0.39856, + "grad_norm": 0.9564398351803541, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 39856 + }, + { + "epoch": 0.39857, + "grad_norm": 1.1232696163876872, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 39857 + }, + { + "epoch": 0.39858, + "grad_norm": 0.824528319774871, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 39858 + }, + { + "epoch": 0.39859, + "grad_norm": 0.7666824521012666, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 39859 + }, + { + "epoch": 0.3986, + "grad_norm": 0.6615941602651432, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 39860 + }, + { + "epoch": 0.39861, + "grad_norm": 0.5310243765842388, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 39861 + }, + { + "epoch": 0.39862, + "grad_norm": 0.5723345147407368, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 39862 + }, + { + "epoch": 0.39863, + "grad_norm": 0.5944207316263951, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 39863 + }, + { + "epoch": 0.39864, + "grad_norm": 0.6458492872973562, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 39864 + }, + { + "epoch": 0.39865, + "grad_norm": 0.7126869158835685, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 39865 + }, + { + "epoch": 0.39866, + "grad_norm": 0.7767860440979831, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 39866 + }, + { + "epoch": 0.39867, + "grad_norm": 0.7816637153378014, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 39867 + }, + { + "epoch": 0.39868, + "grad_norm": 0.8045245890268737, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 39868 + }, + { + "epoch": 0.39869, + "grad_norm": 0.8514870174858608, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 39869 + }, + { + "epoch": 0.3987, + "grad_norm": 0.9182642707265891, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 39870 + }, + { + "epoch": 0.39871, + "grad_norm": 0.9236915941388734, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 39871 + }, + { + "epoch": 0.39872, + "grad_norm": 0.9336220017460224, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 39872 + }, + { + "epoch": 0.39873, + "grad_norm": 0.9027670125156166, + "learning_rate": 0.003, + "loss": 3.984, + "step": 39873 + }, + { + "epoch": 0.39874, + "grad_norm": 0.7709274201977915, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 39874 + }, + { + "epoch": 0.39875, + "grad_norm": 0.7117133054815927, + "learning_rate": 0.003, + "loss": 4.0019, + "step": 39875 + }, + { + "epoch": 0.39876, + "grad_norm": 0.7781497484000709, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 39876 + }, + { + "epoch": 0.39877, + "grad_norm": 0.8504512777424913, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 39877 + }, + { + "epoch": 0.39878, + "grad_norm": 0.8007935494545366, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 39878 + }, + { + "epoch": 0.39879, + "grad_norm": 0.8381478060820391, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 39879 + }, + { + "epoch": 0.3988, + "grad_norm": 0.7679686913830421, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 39880 + }, + { + "epoch": 0.39881, + "grad_norm": 0.7095766029614662, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 39881 + }, + { + "epoch": 0.39882, + "grad_norm": 0.7921452114811892, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 39882 + }, + { + "epoch": 0.39883, + "grad_norm": 0.9867420281834874, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 39883 + }, + { + "epoch": 0.39884, + "grad_norm": 1.190923885108094, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 39884 + }, + { + "epoch": 0.39885, + "grad_norm": 0.798879837062407, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 39885 + }, + { + "epoch": 0.39886, + "grad_norm": 0.8343901124501807, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 39886 + }, + { + "epoch": 0.39887, + "grad_norm": 1.0511349327547006, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 39887 + }, + { + "epoch": 0.39888, + "grad_norm": 0.9758700705848256, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 39888 + }, + { + "epoch": 0.39889, + "grad_norm": 0.9656509996005178, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 39889 + }, + { + "epoch": 0.3989, + "grad_norm": 0.9328214618163423, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 39890 + }, + { + "epoch": 0.39891, + "grad_norm": 0.8199227260516097, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 39891 + }, + { + "epoch": 0.39892, + "grad_norm": 0.8153896893824423, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 39892 + }, + { + "epoch": 0.39893, + "grad_norm": 0.9204564429562818, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 39893 + }, + { + "epoch": 0.39894, + "grad_norm": 1.201152113991954, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 39894 + }, + { + "epoch": 0.39895, + "grad_norm": 1.041394648631605, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 39895 + }, + { + "epoch": 0.39896, + "grad_norm": 0.9212179480742707, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 39896 + }, + { + "epoch": 0.39897, + "grad_norm": 0.9103322003176143, + "learning_rate": 0.003, + "loss": 4.03, + "step": 39897 + }, + { + "epoch": 0.39898, + "grad_norm": 0.9871785930369653, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 39898 + }, + { + "epoch": 0.39899, + "grad_norm": 0.8933511680157761, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 39899 + }, + { + "epoch": 0.399, + "grad_norm": 0.7832225240467451, + "learning_rate": 0.003, + "loss": 4.031, + "step": 39900 + }, + { + "epoch": 0.39901, + "grad_norm": 0.7207131869119145, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 39901 + }, + { + "epoch": 0.39902, + "grad_norm": 0.785771972287437, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 39902 + }, + { + "epoch": 0.39903, + "grad_norm": 0.9271159426195209, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 39903 + }, + { + "epoch": 0.39904, + "grad_norm": 0.9383485028576066, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 39904 + }, + { + "epoch": 0.39905, + "grad_norm": 0.8546433407982674, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 39905 + }, + { + "epoch": 0.39906, + "grad_norm": 0.8661502309982252, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 39906 + }, + { + "epoch": 0.39907, + "grad_norm": 0.8362392976041235, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 39907 + }, + { + "epoch": 0.39908, + "grad_norm": 0.8401890311670035, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 39908 + }, + { + "epoch": 0.39909, + "grad_norm": 0.8926242823373224, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 39909 + }, + { + "epoch": 0.3991, + "grad_norm": 0.9582650436092643, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 39910 + }, + { + "epoch": 0.39911, + "grad_norm": 1.0281367762050662, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 39911 + }, + { + "epoch": 0.39912, + "grad_norm": 0.9034784691450637, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 39912 + }, + { + "epoch": 0.39913, + "grad_norm": 0.759466793971534, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 39913 + }, + { + "epoch": 0.39914, + "grad_norm": 0.7326859751432261, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 39914 + }, + { + "epoch": 0.39915, + "grad_norm": 0.776025332720924, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 39915 + }, + { + "epoch": 0.39916, + "grad_norm": 0.691725524069583, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 39916 + }, + { + "epoch": 0.39917, + "grad_norm": 0.6233627765991587, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 39917 + }, + { + "epoch": 0.39918, + "grad_norm": 0.6424627582015631, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 39918 + }, + { + "epoch": 0.39919, + "grad_norm": 0.6799094064863763, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 39919 + }, + { + "epoch": 0.3992, + "grad_norm": 0.621715610095118, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 39920 + }, + { + "epoch": 0.39921, + "grad_norm": 0.6182502394854795, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 39921 + }, + { + "epoch": 0.39922, + "grad_norm": 0.6350825569358769, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 39922 + }, + { + "epoch": 0.39923, + "grad_norm": 0.6497452196172597, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 39923 + }, + { + "epoch": 0.39924, + "grad_norm": 0.6095233772775743, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 39924 + }, + { + "epoch": 0.39925, + "grad_norm": 0.5227002904672289, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 39925 + }, + { + "epoch": 0.39926, + "grad_norm": 0.5064294181625411, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 39926 + }, + { + "epoch": 0.39927, + "grad_norm": 0.5115838991566898, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 39927 + }, + { + "epoch": 0.39928, + "grad_norm": 0.562004717610802, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 39928 + }, + { + "epoch": 0.39929, + "grad_norm": 0.6539535301884991, + "learning_rate": 0.003, + "loss": 3.969, + "step": 39929 + }, + { + "epoch": 0.3993, + "grad_norm": 0.910665606937335, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 39930 + }, + { + "epoch": 0.39931, + "grad_norm": 1.4194036961004155, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 39931 + }, + { + "epoch": 0.39932, + "grad_norm": 0.6168694498323857, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 39932 + }, + { + "epoch": 0.39933, + "grad_norm": 0.7647496804803852, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 39933 + }, + { + "epoch": 0.39934, + "grad_norm": 0.9838306470069991, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 39934 + }, + { + "epoch": 0.39935, + "grad_norm": 1.0848361902482988, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 39935 + }, + { + "epoch": 0.39936, + "grad_norm": 0.9002331867292719, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 39936 + }, + { + "epoch": 0.39937, + "grad_norm": 0.9241553028400039, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 39937 + }, + { + "epoch": 0.39938, + "grad_norm": 0.902785226704515, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 39938 + }, + { + "epoch": 0.39939, + "grad_norm": 0.9001515286557942, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 39939 + }, + { + "epoch": 0.3994, + "grad_norm": 0.9371378247149169, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 39940 + }, + { + "epoch": 0.39941, + "grad_norm": 0.8965733388626644, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 39941 + }, + { + "epoch": 0.39942, + "grad_norm": 0.8726524724928576, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 39942 + }, + { + "epoch": 0.39943, + "grad_norm": 0.942952885324601, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 39943 + }, + { + "epoch": 0.39944, + "grad_norm": 1.0599350176870408, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 39944 + }, + { + "epoch": 0.39945, + "grad_norm": 0.9593949219868708, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 39945 + }, + { + "epoch": 0.39946, + "grad_norm": 0.8124652157619743, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 39946 + }, + { + "epoch": 0.39947, + "grad_norm": 0.803470125069241, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 39947 + }, + { + "epoch": 0.39948, + "grad_norm": 0.7962154647829404, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 39948 + }, + { + "epoch": 0.39949, + "grad_norm": 0.8521542709523799, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 39949 + }, + { + "epoch": 0.3995, + "grad_norm": 0.8565464931480009, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 39950 + }, + { + "epoch": 0.39951, + "grad_norm": 0.7869795734811442, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 39951 + }, + { + "epoch": 0.39952, + "grad_norm": 0.8190658208826668, + "learning_rate": 0.003, + "loss": 3.9846, + "step": 39952 + }, + { + "epoch": 0.39953, + "grad_norm": 0.7701495705067549, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 39953 + }, + { + "epoch": 0.39954, + "grad_norm": 0.7817024668633192, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 39954 + }, + { + "epoch": 0.39955, + "grad_norm": 0.8969375268102933, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 39955 + }, + { + "epoch": 0.39956, + "grad_norm": 0.896965171572242, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 39956 + }, + { + "epoch": 0.39957, + "grad_norm": 0.8832864409819462, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 39957 + }, + { + "epoch": 0.39958, + "grad_norm": 0.8768825573575972, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 39958 + }, + { + "epoch": 0.39959, + "grad_norm": 0.934253585752655, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 39959 + }, + { + "epoch": 0.3996, + "grad_norm": 0.9247849622006181, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 39960 + }, + { + "epoch": 0.39961, + "grad_norm": 0.8834528408673391, + "learning_rate": 0.003, + "loss": 4.0038, + "step": 39961 + }, + { + "epoch": 0.39962, + "grad_norm": 1.024988903594844, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 39962 + }, + { + "epoch": 0.39963, + "grad_norm": 1.0323499050705247, + "learning_rate": 0.003, + "loss": 4.046, + "step": 39963 + }, + { + "epoch": 0.39964, + "grad_norm": 1.2742938833192627, + "learning_rate": 0.003, + "loss": 4.05, + "step": 39964 + }, + { + "epoch": 0.39965, + "grad_norm": 0.9256322461374934, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 39965 + }, + { + "epoch": 0.39966, + "grad_norm": 0.7697920036477321, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 39966 + }, + { + "epoch": 0.39967, + "grad_norm": 0.652042026062074, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 39967 + }, + { + "epoch": 0.39968, + "grad_norm": 0.6214188022965332, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 39968 + }, + { + "epoch": 0.39969, + "grad_norm": 0.5922075049749252, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 39969 + }, + { + "epoch": 0.3997, + "grad_norm": 0.6605188552256778, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 39970 + }, + { + "epoch": 0.39971, + "grad_norm": 0.8022969365652629, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 39971 + }, + { + "epoch": 0.39972, + "grad_norm": 0.9206019261744504, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 39972 + }, + { + "epoch": 0.39973, + "grad_norm": 0.9485632353465137, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 39973 + }, + { + "epoch": 0.39974, + "grad_norm": 0.9269604922675023, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 39974 + }, + { + "epoch": 0.39975, + "grad_norm": 0.9102636097074517, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 39975 + }, + { + "epoch": 0.39976, + "grad_norm": 0.8287409847297376, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 39976 + }, + { + "epoch": 0.39977, + "grad_norm": 0.6939994934141812, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 39977 + }, + { + "epoch": 0.39978, + "grad_norm": 0.6440650475265259, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 39978 + }, + { + "epoch": 0.39979, + "grad_norm": 0.5919991723977325, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 39979 + }, + { + "epoch": 0.3998, + "grad_norm": 0.5638818213975968, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 39980 + }, + { + "epoch": 0.39981, + "grad_norm": 0.6387450853691357, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 39981 + }, + { + "epoch": 0.39982, + "grad_norm": 0.61524094078093, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 39982 + }, + { + "epoch": 0.39983, + "grad_norm": 0.5663991846276047, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 39983 + }, + { + "epoch": 0.39984, + "grad_norm": 0.6310927730640372, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 39984 + }, + { + "epoch": 0.39985, + "grad_norm": 0.7227601124975787, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 39985 + }, + { + "epoch": 0.39986, + "grad_norm": 0.808743908419311, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 39986 + }, + { + "epoch": 0.39987, + "grad_norm": 0.8144588570824686, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 39987 + }, + { + "epoch": 0.39988, + "grad_norm": 0.8202985863586726, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 39988 + }, + { + "epoch": 0.39989, + "grad_norm": 0.851804970528301, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 39989 + }, + { + "epoch": 0.3999, + "grad_norm": 0.8112807367180187, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 39990 + }, + { + "epoch": 0.39991, + "grad_norm": 0.7237208898584554, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 39991 + }, + { + "epoch": 0.39992, + "grad_norm": 0.6593873168362036, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 39992 + }, + { + "epoch": 0.39993, + "grad_norm": 0.670586915496413, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 39993 + }, + { + "epoch": 0.39994, + "grad_norm": 0.7464083928652252, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 39994 + }, + { + "epoch": 0.39995, + "grad_norm": 0.9383074937541285, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 39995 + }, + { + "epoch": 0.39996, + "grad_norm": 1.2135691645805053, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 39996 + }, + { + "epoch": 0.39997, + "grad_norm": 0.8074507788991258, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 39997 + }, + { + "epoch": 0.39998, + "grad_norm": 0.8432715552945449, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 39998 + }, + { + "epoch": 0.39999, + "grad_norm": 0.9601407013419518, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 39999 + }, + { + "epoch": 0.4, + "grad_norm": 1.0370168297879139, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 40000 + }, + { + "epoch": 0.40001, + "grad_norm": 1.00045447673231, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 40001 + }, + { + "epoch": 0.40002, + "grad_norm": 1.078294154496775, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 40002 + }, + { + "epoch": 0.40003, + "grad_norm": 0.869254451206538, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 40003 + }, + { + "epoch": 0.40004, + "grad_norm": 0.958007945335505, + "learning_rate": 0.003, + "loss": 4.042, + "step": 40004 + }, + { + "epoch": 0.40005, + "grad_norm": 0.9823613718817736, + "learning_rate": 0.003, + "loss": 4.075, + "step": 40005 + }, + { + "epoch": 0.40006, + "grad_norm": 0.993365075827518, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 40006 + }, + { + "epoch": 0.40007, + "grad_norm": 1.1213931335476197, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 40007 + }, + { + "epoch": 0.40008, + "grad_norm": 0.9060975888259689, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 40008 + }, + { + "epoch": 0.40009, + "grad_norm": 0.8800467607473126, + "learning_rate": 0.003, + "loss": 4.062, + "step": 40009 + }, + { + "epoch": 0.4001, + "grad_norm": 0.8252797832026282, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 40010 + }, + { + "epoch": 0.40011, + "grad_norm": 0.721203290558469, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 40011 + }, + { + "epoch": 0.40012, + "grad_norm": 0.7553954235410294, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 40012 + }, + { + "epoch": 0.40013, + "grad_norm": 0.6911410426913801, + "learning_rate": 0.003, + "loss": 3.9935, + "step": 40013 + }, + { + "epoch": 0.40014, + "grad_norm": 0.6071724533049602, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 40014 + }, + { + "epoch": 0.40015, + "grad_norm": 0.6742932329790485, + "learning_rate": 0.003, + "loss": 4.009, + "step": 40015 + }, + { + "epoch": 0.40016, + "grad_norm": 0.690477686607103, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 40016 + }, + { + "epoch": 0.40017, + "grad_norm": 0.6101064914048299, + "learning_rate": 0.003, + "loss": 4.0003, + "step": 40017 + }, + { + "epoch": 0.40018, + "grad_norm": 0.6286788495995583, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 40018 + }, + { + "epoch": 0.40019, + "grad_norm": 0.6116990048345327, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 40019 + }, + { + "epoch": 0.4002, + "grad_norm": 0.7125794430957472, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 40020 + }, + { + "epoch": 0.40021, + "grad_norm": 0.9858129101145033, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 40021 + }, + { + "epoch": 0.40022, + "grad_norm": 1.415964061903525, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 40022 + }, + { + "epoch": 0.40023, + "grad_norm": 0.8043189054384752, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 40023 + }, + { + "epoch": 0.40024, + "grad_norm": 0.9016766778792197, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 40024 + }, + { + "epoch": 0.40025, + "grad_norm": 0.7886129990450343, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 40025 + }, + { + "epoch": 0.40026, + "grad_norm": 0.875903349307254, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 40026 + }, + { + "epoch": 0.40027, + "grad_norm": 0.904546038696535, + "learning_rate": 0.003, + "loss": 4.044, + "step": 40027 + }, + { + "epoch": 0.40028, + "grad_norm": 0.7925638639352174, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 40028 + }, + { + "epoch": 0.40029, + "grad_norm": 0.7312256678591178, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 40029 + }, + { + "epoch": 0.4003, + "grad_norm": 0.7438334116832548, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 40030 + }, + { + "epoch": 0.40031, + "grad_norm": 0.8317902029701906, + "learning_rate": 0.003, + "loss": 3.9868, + "step": 40031 + }, + { + "epoch": 0.40032, + "grad_norm": 0.9231826233110185, + "learning_rate": 0.003, + "loss": 4.051, + "step": 40032 + }, + { + "epoch": 0.40033, + "grad_norm": 0.9414631601867126, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 40033 + }, + { + "epoch": 0.40034, + "grad_norm": 1.0092935616347882, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 40034 + }, + { + "epoch": 0.40035, + "grad_norm": 0.9713308743657061, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 40035 + }, + { + "epoch": 0.40036, + "grad_norm": 0.9561400440564841, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 40036 + }, + { + "epoch": 0.40037, + "grad_norm": 1.0022236589836426, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 40037 + }, + { + "epoch": 0.40038, + "grad_norm": 0.987447062762229, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 40038 + }, + { + "epoch": 0.40039, + "grad_norm": 1.068365932501467, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 40039 + }, + { + "epoch": 0.4004, + "grad_norm": 1.0002741325648032, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 40040 + }, + { + "epoch": 0.40041, + "grad_norm": 1.0450848950248266, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 40041 + }, + { + "epoch": 0.40042, + "grad_norm": 0.9618268057919468, + "learning_rate": 0.003, + "loss": 4.027, + "step": 40042 + }, + { + "epoch": 0.40043, + "grad_norm": 0.9293368338828245, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 40043 + }, + { + "epoch": 0.40044, + "grad_norm": 0.81657510535878, + "learning_rate": 0.003, + "loss": 4.026, + "step": 40044 + }, + { + "epoch": 0.40045, + "grad_norm": 0.7232501570445381, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 40045 + }, + { + "epoch": 0.40046, + "grad_norm": 0.6086787005989142, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 40046 + }, + { + "epoch": 0.40047, + "grad_norm": 0.5944468790894742, + "learning_rate": 0.003, + "loss": 4.0061, + "step": 40047 + }, + { + "epoch": 0.40048, + "grad_norm": 0.707459680130766, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 40048 + }, + { + "epoch": 0.40049, + "grad_norm": 0.7194375044415474, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 40049 + }, + { + "epoch": 0.4005, + "grad_norm": 0.7081949196872671, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 40050 + }, + { + "epoch": 0.40051, + "grad_norm": 0.7494189961490921, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 40051 + }, + { + "epoch": 0.40052, + "grad_norm": 0.7679618436557729, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 40052 + }, + { + "epoch": 0.40053, + "grad_norm": 0.7831910643485286, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 40053 + }, + { + "epoch": 0.40054, + "grad_norm": 0.8086785878511971, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 40054 + }, + { + "epoch": 0.40055, + "grad_norm": 0.8462370914847474, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 40055 + }, + { + "epoch": 0.40056, + "grad_norm": 0.9375510217547776, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 40056 + }, + { + "epoch": 0.40057, + "grad_norm": 0.9959172225341835, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 40057 + }, + { + "epoch": 0.40058, + "grad_norm": 0.9872317900825467, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 40058 + }, + { + "epoch": 0.40059, + "grad_norm": 0.9698558147321784, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 40059 + }, + { + "epoch": 0.4006, + "grad_norm": 0.8427759487805414, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 40060 + }, + { + "epoch": 0.40061, + "grad_norm": 0.7821214396932209, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 40061 + }, + { + "epoch": 0.40062, + "grad_norm": 0.7621218411916376, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 40062 + }, + { + "epoch": 0.40063, + "grad_norm": 0.7884359484313174, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 40063 + }, + { + "epoch": 0.40064, + "grad_norm": 0.7577410183849254, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 40064 + }, + { + "epoch": 0.40065, + "grad_norm": 0.7501061194223814, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 40065 + }, + { + "epoch": 0.40066, + "grad_norm": 0.779933011490471, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 40066 + }, + { + "epoch": 0.40067, + "grad_norm": 0.9001782793198669, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 40067 + }, + { + "epoch": 0.40068, + "grad_norm": 0.8893801523284084, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 40068 + }, + { + "epoch": 0.40069, + "grad_norm": 0.8949641084555903, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 40069 + }, + { + "epoch": 0.4007, + "grad_norm": 0.8650112296449216, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 40070 + }, + { + "epoch": 0.40071, + "grad_norm": 0.8103007729627766, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 40071 + }, + { + "epoch": 0.40072, + "grad_norm": 0.7607225182398005, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 40072 + }, + { + "epoch": 0.40073, + "grad_norm": 0.8535470499499616, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 40073 + }, + { + "epoch": 0.40074, + "grad_norm": 0.8273999488723094, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 40074 + }, + { + "epoch": 0.40075, + "grad_norm": 0.9019414714392378, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 40075 + }, + { + "epoch": 0.40076, + "grad_norm": 1.0017405216808448, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 40076 + }, + { + "epoch": 0.40077, + "grad_norm": 1.0293494762197823, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 40077 + }, + { + "epoch": 0.40078, + "grad_norm": 0.9446974425818785, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 40078 + }, + { + "epoch": 0.40079, + "grad_norm": 0.8725956824911905, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 40079 + }, + { + "epoch": 0.4008, + "grad_norm": 0.7113050715153187, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 40080 + }, + { + "epoch": 0.40081, + "grad_norm": 0.6776170668393584, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 40081 + }, + { + "epoch": 0.40082, + "grad_norm": 0.7219246114270517, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 40082 + }, + { + "epoch": 0.40083, + "grad_norm": 0.6722315249073904, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 40083 + }, + { + "epoch": 0.40084, + "grad_norm": 0.6206363068476854, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 40084 + }, + { + "epoch": 0.40085, + "grad_norm": 0.580380339712111, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 40085 + }, + { + "epoch": 0.40086, + "grad_norm": 0.5904982421570182, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 40086 + }, + { + "epoch": 0.40087, + "grad_norm": 0.6196109871498349, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 40087 + }, + { + "epoch": 0.40088, + "grad_norm": 0.5936749345608227, + "learning_rate": 0.003, + "loss": 3.9873, + "step": 40088 + }, + { + "epoch": 0.40089, + "grad_norm": 0.6510991119340511, + "learning_rate": 0.003, + "loss": 4.031, + "step": 40089 + }, + { + "epoch": 0.4009, + "grad_norm": 0.6972820021238495, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 40090 + }, + { + "epoch": 0.40091, + "grad_norm": 0.7481523388766433, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 40091 + }, + { + "epoch": 0.40092, + "grad_norm": 0.7720163545432449, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 40092 + }, + { + "epoch": 0.40093, + "grad_norm": 0.7723550314815705, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 40093 + }, + { + "epoch": 0.40094, + "grad_norm": 0.8385761939669133, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 40094 + }, + { + "epoch": 0.40095, + "grad_norm": 1.0294210784494477, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 40095 + }, + { + "epoch": 0.40096, + "grad_norm": 1.3477482756215016, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 40096 + }, + { + "epoch": 0.40097, + "grad_norm": 0.7777963220134422, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 40097 + }, + { + "epoch": 0.40098, + "grad_norm": 0.7573778053632105, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 40098 + }, + { + "epoch": 0.40099, + "grad_norm": 0.8590578036810077, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 40099 + }, + { + "epoch": 0.401, + "grad_norm": 0.763579084908119, + "learning_rate": 0.003, + "loss": 4.033, + "step": 40100 + }, + { + "epoch": 0.40101, + "grad_norm": 0.7136019601722527, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 40101 + }, + { + "epoch": 0.40102, + "grad_norm": 0.7248919634758634, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 40102 + }, + { + "epoch": 0.40103, + "grad_norm": 0.7016846847467078, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 40103 + }, + { + "epoch": 0.40104, + "grad_norm": 0.5923729376870462, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 40104 + }, + { + "epoch": 0.40105, + "grad_norm": 0.6643255692393376, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 40105 + }, + { + "epoch": 0.40106, + "grad_norm": 0.7955468557474125, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 40106 + }, + { + "epoch": 0.40107, + "grad_norm": 0.9780875669617708, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 40107 + }, + { + "epoch": 0.40108, + "grad_norm": 1.1849428985630295, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 40108 + }, + { + "epoch": 0.40109, + "grad_norm": 0.8381401733291173, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 40109 + }, + { + "epoch": 0.4011, + "grad_norm": 0.8174844010645355, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 40110 + }, + { + "epoch": 0.40111, + "grad_norm": 0.8110852959497247, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 40111 + }, + { + "epoch": 0.40112, + "grad_norm": 0.8153359425343681, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 40112 + }, + { + "epoch": 0.40113, + "grad_norm": 0.7285214492544558, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 40113 + }, + { + "epoch": 0.40114, + "grad_norm": 0.6527306513492912, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 40114 + }, + { + "epoch": 0.40115, + "grad_norm": 0.6940834060860941, + "learning_rate": 0.003, + "loss": 4.026, + "step": 40115 + }, + { + "epoch": 0.40116, + "grad_norm": 0.693408070716768, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 40116 + }, + { + "epoch": 0.40117, + "grad_norm": 0.7699501049630675, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 40117 + }, + { + "epoch": 0.40118, + "grad_norm": 0.9088706156698804, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 40118 + }, + { + "epoch": 0.40119, + "grad_norm": 1.1227248328944346, + "learning_rate": 0.003, + "loss": 4.056, + "step": 40119 + }, + { + "epoch": 0.4012, + "grad_norm": 0.9870430409335226, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 40120 + }, + { + "epoch": 0.40121, + "grad_norm": 0.9717846145084467, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 40121 + }, + { + "epoch": 0.40122, + "grad_norm": 0.9726372446202354, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 40122 + }, + { + "epoch": 0.40123, + "grad_norm": 0.8973002290334428, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 40123 + }, + { + "epoch": 0.40124, + "grad_norm": 0.860138657694179, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 40124 + }, + { + "epoch": 0.40125, + "grad_norm": 0.8057190949273684, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 40125 + }, + { + "epoch": 0.40126, + "grad_norm": 0.8765971873726284, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 40126 + }, + { + "epoch": 0.40127, + "grad_norm": 0.9313801660343684, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 40127 + }, + { + "epoch": 0.40128, + "grad_norm": 0.931074797146026, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 40128 + }, + { + "epoch": 0.40129, + "grad_norm": 1.1142839530195126, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 40129 + }, + { + "epoch": 0.4013, + "grad_norm": 0.9937960004514929, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 40130 + }, + { + "epoch": 0.40131, + "grad_norm": 1.0491760450393355, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 40131 + }, + { + "epoch": 0.40132, + "grad_norm": 0.9448234823759891, + "learning_rate": 0.003, + "loss": 4.039, + "step": 40132 + }, + { + "epoch": 0.40133, + "grad_norm": 0.8421770454260095, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 40133 + }, + { + "epoch": 0.40134, + "grad_norm": 0.7583091962033065, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 40134 + }, + { + "epoch": 0.40135, + "grad_norm": 0.7194338097395028, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 40135 + }, + { + "epoch": 0.40136, + "grad_norm": 0.7033507982351141, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 40136 + }, + { + "epoch": 0.40137, + "grad_norm": 0.6591259991783077, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 40137 + }, + { + "epoch": 0.40138, + "grad_norm": 0.7224363232258033, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 40138 + }, + { + "epoch": 0.40139, + "grad_norm": 0.6679668858771259, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 40139 + }, + { + "epoch": 0.4014, + "grad_norm": 0.6951989625424991, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 40140 + }, + { + "epoch": 0.40141, + "grad_norm": 0.8821949871053683, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 40141 + }, + { + "epoch": 0.40142, + "grad_norm": 1.0018380178266424, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 40142 + }, + { + "epoch": 0.40143, + "grad_norm": 0.8358571201653826, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 40143 + }, + { + "epoch": 0.40144, + "grad_norm": 0.6586708537389303, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 40144 + }, + { + "epoch": 0.40145, + "grad_norm": 0.6435218077433613, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 40145 + }, + { + "epoch": 0.40146, + "grad_norm": 0.6686347792412729, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 40146 + }, + { + "epoch": 0.40147, + "grad_norm": 0.8291097901175846, + "learning_rate": 0.003, + "loss": 4.037, + "step": 40147 + }, + { + "epoch": 0.40148, + "grad_norm": 1.1189442680301438, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 40148 + }, + { + "epoch": 0.40149, + "grad_norm": 1.081029400240327, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 40149 + }, + { + "epoch": 0.4015, + "grad_norm": 0.9436538387813708, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 40150 + }, + { + "epoch": 0.40151, + "grad_norm": 0.8949332222008625, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 40151 + }, + { + "epoch": 0.40152, + "grad_norm": 0.7324710668673572, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 40152 + }, + { + "epoch": 0.40153, + "grad_norm": 0.6901724662950016, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 40153 + }, + { + "epoch": 0.40154, + "grad_norm": 0.6443891547983366, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 40154 + }, + { + "epoch": 0.40155, + "grad_norm": 0.7378394464095811, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 40155 + }, + { + "epoch": 0.40156, + "grad_norm": 0.7532966778594785, + "learning_rate": 0.003, + "loss": 3.9884, + "step": 40156 + }, + { + "epoch": 0.40157, + "grad_norm": 0.8651962566603679, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 40157 + }, + { + "epoch": 0.40158, + "grad_norm": 0.7206256153689046, + "learning_rate": 0.003, + "loss": 3.9973, + "step": 40158 + }, + { + "epoch": 0.40159, + "grad_norm": 0.649955205671867, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 40159 + }, + { + "epoch": 0.4016, + "grad_norm": 0.6637044018451869, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 40160 + }, + { + "epoch": 0.40161, + "grad_norm": 0.7283677714270147, + "learning_rate": 0.003, + "loss": 3.9778, + "step": 40161 + }, + { + "epoch": 0.40162, + "grad_norm": 0.921827947066717, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 40162 + }, + { + "epoch": 0.40163, + "grad_norm": 1.2784504652543596, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 40163 + }, + { + "epoch": 0.40164, + "grad_norm": 0.7987842935038688, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 40164 + }, + { + "epoch": 0.40165, + "grad_norm": 0.7657490290816994, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 40165 + }, + { + "epoch": 0.40166, + "grad_norm": 0.8923856817327086, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 40166 + }, + { + "epoch": 0.40167, + "grad_norm": 0.8341792199068364, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 40167 + }, + { + "epoch": 0.40168, + "grad_norm": 0.7968235445982186, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 40168 + }, + { + "epoch": 0.40169, + "grad_norm": 0.8021453832783247, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 40169 + }, + { + "epoch": 0.4017, + "grad_norm": 0.8275430966987795, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 40170 + }, + { + "epoch": 0.40171, + "grad_norm": 1.0064814613368582, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 40171 + }, + { + "epoch": 0.40172, + "grad_norm": 1.1431859498230865, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 40172 + }, + { + "epoch": 0.40173, + "grad_norm": 0.8584744094981099, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 40173 + }, + { + "epoch": 0.40174, + "grad_norm": 0.7872856679920145, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 40174 + }, + { + "epoch": 0.40175, + "grad_norm": 0.7055561944098889, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 40175 + }, + { + "epoch": 0.40176, + "grad_norm": 0.731117471813053, + "learning_rate": 0.003, + "loss": 4.0828, + "step": 40176 + }, + { + "epoch": 0.40177, + "grad_norm": 0.7911861694193656, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 40177 + }, + { + "epoch": 0.40178, + "grad_norm": 0.7716215640106561, + "learning_rate": 0.003, + "loss": 3.9913, + "step": 40178 + }, + { + "epoch": 0.40179, + "grad_norm": 0.8227959720704623, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 40179 + }, + { + "epoch": 0.4018, + "grad_norm": 0.8952704173343604, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 40180 + }, + { + "epoch": 0.40181, + "grad_norm": 1.1248508763809222, + "learning_rate": 0.003, + "loss": 3.9967, + "step": 40181 + }, + { + "epoch": 0.40182, + "grad_norm": 1.171394648023366, + "learning_rate": 0.003, + "loss": 4.0856, + "step": 40182 + }, + { + "epoch": 0.40183, + "grad_norm": 0.803644708722843, + "learning_rate": 0.003, + "loss": 4.0715, + "step": 40183 + }, + { + "epoch": 0.40184, + "grad_norm": 0.666998051844857, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 40184 + }, + { + "epoch": 0.40185, + "grad_norm": 0.5840323410397374, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 40185 + }, + { + "epoch": 0.40186, + "grad_norm": 0.5935042649752317, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 40186 + }, + { + "epoch": 0.40187, + "grad_norm": 0.6336293679658526, + "learning_rate": 0.003, + "loss": 4.039, + "step": 40187 + }, + { + "epoch": 0.40188, + "grad_norm": 0.601143891123838, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 40188 + }, + { + "epoch": 0.40189, + "grad_norm": 0.5816650754365542, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 40189 + }, + { + "epoch": 0.4019, + "grad_norm": 0.5602200610659657, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 40190 + }, + { + "epoch": 0.40191, + "grad_norm": 0.6724636725916762, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 40191 + }, + { + "epoch": 0.40192, + "grad_norm": 0.8405647102334493, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 40192 + }, + { + "epoch": 0.40193, + "grad_norm": 1.0757735901835754, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 40193 + }, + { + "epoch": 0.40194, + "grad_norm": 1.034138457059314, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 40194 + }, + { + "epoch": 0.40195, + "grad_norm": 0.8263769604004672, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 40195 + }, + { + "epoch": 0.40196, + "grad_norm": 0.61762149760184, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 40196 + }, + { + "epoch": 0.40197, + "grad_norm": 0.7885259297947611, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 40197 + }, + { + "epoch": 0.40198, + "grad_norm": 0.9727799077766711, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 40198 + }, + { + "epoch": 0.40199, + "grad_norm": 1.1540014925139552, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 40199 + }, + { + "epoch": 0.402, + "grad_norm": 0.7405448790736892, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 40200 + }, + { + "epoch": 0.40201, + "grad_norm": 0.780371495452436, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 40201 + }, + { + "epoch": 0.40202, + "grad_norm": 0.8367403851306825, + "learning_rate": 0.003, + "loss": 3.9937, + "step": 40202 + }, + { + "epoch": 0.40203, + "grad_norm": 0.8373540141011974, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 40203 + }, + { + "epoch": 0.40204, + "grad_norm": 0.7581119467997864, + "learning_rate": 0.003, + "loss": 3.997, + "step": 40204 + }, + { + "epoch": 0.40205, + "grad_norm": 0.7472376812744831, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 40205 + }, + { + "epoch": 0.40206, + "grad_norm": 0.7562668225096967, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 40206 + }, + { + "epoch": 0.40207, + "grad_norm": 0.8110246267436324, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 40207 + }, + { + "epoch": 0.40208, + "grad_norm": 0.9911535007631047, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 40208 + }, + { + "epoch": 0.40209, + "grad_norm": 1.1761511316400024, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 40209 + }, + { + "epoch": 0.4021, + "grad_norm": 0.8208552564590346, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 40210 + }, + { + "epoch": 0.40211, + "grad_norm": 0.7380786254948744, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 40211 + }, + { + "epoch": 0.40212, + "grad_norm": 0.6980207030853519, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 40212 + }, + { + "epoch": 0.40213, + "grad_norm": 0.828693806860441, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 40213 + }, + { + "epoch": 0.40214, + "grad_norm": 0.9855186554354105, + "learning_rate": 0.003, + "loss": 3.9994, + "step": 40214 + }, + { + "epoch": 0.40215, + "grad_norm": 1.0463369066871924, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 40215 + }, + { + "epoch": 0.40216, + "grad_norm": 1.0020499741222948, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 40216 + }, + { + "epoch": 0.40217, + "grad_norm": 1.0030761037238891, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 40217 + }, + { + "epoch": 0.40218, + "grad_norm": 1.0227481491888941, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 40218 + }, + { + "epoch": 0.40219, + "grad_norm": 0.8324251957010884, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 40219 + }, + { + "epoch": 0.4022, + "grad_norm": 0.8483590165015267, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 40220 + }, + { + "epoch": 0.40221, + "grad_norm": 0.7630906697447978, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 40221 + }, + { + "epoch": 0.40222, + "grad_norm": 0.7860926507128877, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 40222 + }, + { + "epoch": 0.40223, + "grad_norm": 0.794210992614144, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 40223 + }, + { + "epoch": 0.40224, + "grad_norm": 0.767103036127882, + "learning_rate": 0.003, + "loss": 4.024, + "step": 40224 + }, + { + "epoch": 0.40225, + "grad_norm": 0.763358826945836, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 40225 + }, + { + "epoch": 0.40226, + "grad_norm": 0.8379306399634388, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 40226 + }, + { + "epoch": 0.40227, + "grad_norm": 0.8894180435674407, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 40227 + }, + { + "epoch": 0.40228, + "grad_norm": 1.0066565410542028, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 40228 + }, + { + "epoch": 0.40229, + "grad_norm": 1.2753277047856426, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 40229 + }, + { + "epoch": 0.4023, + "grad_norm": 0.9721478934260966, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 40230 + }, + { + "epoch": 0.40231, + "grad_norm": 1.1307801835794293, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 40231 + }, + { + "epoch": 0.40232, + "grad_norm": 1.041874706924044, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 40232 + }, + { + "epoch": 0.40233, + "grad_norm": 0.9621573549598934, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 40233 + }, + { + "epoch": 0.40234, + "grad_norm": 0.9900470872597853, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 40234 + }, + { + "epoch": 0.40235, + "grad_norm": 1.0099079971571552, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 40235 + }, + { + "epoch": 0.40236, + "grad_norm": 0.9417364799503227, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 40236 + }, + { + "epoch": 0.40237, + "grad_norm": 0.8894155844035977, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 40237 + }, + { + "epoch": 0.40238, + "grad_norm": 0.7306484409037909, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 40238 + }, + { + "epoch": 0.40239, + "grad_norm": 0.5757779707839559, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 40239 + }, + { + "epoch": 0.4024, + "grad_norm": 0.588444470776011, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 40240 + }, + { + "epoch": 0.40241, + "grad_norm": 0.6251140805349582, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 40241 + }, + { + "epoch": 0.40242, + "grad_norm": 0.5916437233214104, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 40242 + }, + { + "epoch": 0.40243, + "grad_norm": 0.6259488534586004, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 40243 + }, + { + "epoch": 0.40244, + "grad_norm": 0.6272357209005868, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 40244 + }, + { + "epoch": 0.40245, + "grad_norm": 0.6845066157189782, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 40245 + }, + { + "epoch": 0.40246, + "grad_norm": 0.8191513550317235, + "learning_rate": 0.003, + "loss": 4.024, + "step": 40246 + }, + { + "epoch": 0.40247, + "grad_norm": 0.9803780206101088, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 40247 + }, + { + "epoch": 0.40248, + "grad_norm": 1.1383970395922962, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 40248 + }, + { + "epoch": 0.40249, + "grad_norm": 0.85298090719121, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 40249 + }, + { + "epoch": 0.4025, + "grad_norm": 0.8067250916237387, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 40250 + }, + { + "epoch": 0.40251, + "grad_norm": 0.8742459014375256, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 40251 + }, + { + "epoch": 0.40252, + "grad_norm": 0.8876177894913039, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 40252 + }, + { + "epoch": 0.40253, + "grad_norm": 0.8537480093011293, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 40253 + }, + { + "epoch": 0.40254, + "grad_norm": 0.8981946922996291, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 40254 + }, + { + "epoch": 0.40255, + "grad_norm": 0.8385721399945271, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 40255 + }, + { + "epoch": 0.40256, + "grad_norm": 0.7432833055121797, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 40256 + }, + { + "epoch": 0.40257, + "grad_norm": 0.7152165405736418, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 40257 + }, + { + "epoch": 0.40258, + "grad_norm": 0.6254098704354351, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 40258 + }, + { + "epoch": 0.40259, + "grad_norm": 0.6275213543487366, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 40259 + }, + { + "epoch": 0.4026, + "grad_norm": 0.6839410099996038, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 40260 + }, + { + "epoch": 0.40261, + "grad_norm": 0.8672038707290991, + "learning_rate": 0.003, + "loss": 4.023, + "step": 40261 + }, + { + "epoch": 0.40262, + "grad_norm": 1.138866762922741, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 40262 + }, + { + "epoch": 0.40263, + "grad_norm": 0.9633106938634763, + "learning_rate": 0.003, + "loss": 4.0698, + "step": 40263 + }, + { + "epoch": 0.40264, + "grad_norm": 0.8060273637383, + "learning_rate": 0.003, + "loss": 3.9919, + "step": 40264 + }, + { + "epoch": 0.40265, + "grad_norm": 0.7992456380185645, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 40265 + }, + { + "epoch": 0.40266, + "grad_norm": 0.9041007466545875, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 40266 + }, + { + "epoch": 0.40267, + "grad_norm": 0.9024950286068945, + "learning_rate": 0.003, + "loss": 4.009, + "step": 40267 + }, + { + "epoch": 0.40268, + "grad_norm": 0.7429656335937598, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 40268 + }, + { + "epoch": 0.40269, + "grad_norm": 0.7749008877740139, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 40269 + }, + { + "epoch": 0.4027, + "grad_norm": 0.8815732690911222, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 40270 + }, + { + "epoch": 0.40271, + "grad_norm": 1.0108781430170592, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 40271 + }, + { + "epoch": 0.40272, + "grad_norm": 1.193846961131245, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 40272 + }, + { + "epoch": 0.40273, + "grad_norm": 0.7448941857357045, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 40273 + }, + { + "epoch": 0.40274, + "grad_norm": 0.7020155883836455, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 40274 + }, + { + "epoch": 0.40275, + "grad_norm": 0.6720911205890657, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 40275 + }, + { + "epoch": 0.40276, + "grad_norm": 0.616977246021629, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 40276 + }, + { + "epoch": 0.40277, + "grad_norm": 0.6130824391541146, + "learning_rate": 0.003, + "loss": 3.9849, + "step": 40277 + }, + { + "epoch": 0.40278, + "grad_norm": 0.5787401264877151, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 40278 + }, + { + "epoch": 0.40279, + "grad_norm": 0.5697908616451316, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 40279 + }, + { + "epoch": 0.4028, + "grad_norm": 0.6941499499447147, + "learning_rate": 0.003, + "loss": 3.9994, + "step": 40280 + }, + { + "epoch": 0.40281, + "grad_norm": 0.8611588797119712, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 40281 + }, + { + "epoch": 0.40282, + "grad_norm": 1.0660395225652513, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 40282 + }, + { + "epoch": 0.40283, + "grad_norm": 0.971233727062291, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 40283 + }, + { + "epoch": 0.40284, + "grad_norm": 0.8739990979059644, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 40284 + }, + { + "epoch": 0.40285, + "grad_norm": 0.8680423387401129, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 40285 + }, + { + "epoch": 0.40286, + "grad_norm": 0.7837554803819291, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 40286 + }, + { + "epoch": 0.40287, + "grad_norm": 0.7957478043252167, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 40287 + }, + { + "epoch": 0.40288, + "grad_norm": 0.7886430163088065, + "learning_rate": 0.003, + "loss": 4.027, + "step": 40288 + }, + { + "epoch": 0.40289, + "grad_norm": 0.8497055275523985, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 40289 + }, + { + "epoch": 0.4029, + "grad_norm": 0.9498911562811737, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 40290 + }, + { + "epoch": 0.40291, + "grad_norm": 0.965807672803012, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 40291 + }, + { + "epoch": 0.40292, + "grad_norm": 0.8853420260684081, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 40292 + }, + { + "epoch": 0.40293, + "grad_norm": 0.8407052902672728, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 40293 + }, + { + "epoch": 0.40294, + "grad_norm": 0.970149784790269, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 40294 + }, + { + "epoch": 0.40295, + "grad_norm": 1.0403589873544907, + "learning_rate": 0.003, + "loss": 4.0871, + "step": 40295 + }, + { + "epoch": 0.40296, + "grad_norm": 0.9348035730352089, + "learning_rate": 0.003, + "loss": 4.044, + "step": 40296 + }, + { + "epoch": 0.40297, + "grad_norm": 0.9251563347764701, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 40297 + }, + { + "epoch": 0.40298, + "grad_norm": 0.886352114076072, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 40298 + }, + { + "epoch": 0.40299, + "grad_norm": 0.9748979606849196, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 40299 + }, + { + "epoch": 0.403, + "grad_norm": 0.8839192868308815, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 40300 + }, + { + "epoch": 0.40301, + "grad_norm": 0.8330921831053306, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 40301 + }, + { + "epoch": 0.40302, + "grad_norm": 0.8255156414847878, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 40302 + }, + { + "epoch": 0.40303, + "grad_norm": 0.8852037892300746, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 40303 + }, + { + "epoch": 0.40304, + "grad_norm": 1.1378190521124663, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 40304 + }, + { + "epoch": 0.40305, + "grad_norm": 0.9707953980288605, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 40305 + }, + { + "epoch": 0.40306, + "grad_norm": 0.8526706436133626, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 40306 + }, + { + "epoch": 0.40307, + "grad_norm": 0.8461051886691864, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 40307 + }, + { + "epoch": 0.40308, + "grad_norm": 0.837619672077325, + "learning_rate": 0.003, + "loss": 4.041, + "step": 40308 + }, + { + "epoch": 0.40309, + "grad_norm": 0.8185449600237644, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 40309 + }, + { + "epoch": 0.4031, + "grad_norm": 0.7586385130598383, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 40310 + }, + { + "epoch": 0.40311, + "grad_norm": 0.7914576531068984, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 40311 + }, + { + "epoch": 0.40312, + "grad_norm": 0.671467912751566, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 40312 + }, + { + "epoch": 0.40313, + "grad_norm": 0.5778628419819043, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 40313 + }, + { + "epoch": 0.40314, + "grad_norm": 0.5757345687353711, + "learning_rate": 0.003, + "loss": 3.9822, + "step": 40314 + }, + { + "epoch": 0.40315, + "grad_norm": 0.6320950195545753, + "learning_rate": 0.003, + "loss": 3.9966, + "step": 40315 + }, + { + "epoch": 0.40316, + "grad_norm": 0.6985925726026321, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 40316 + }, + { + "epoch": 0.40317, + "grad_norm": 0.7598744974045429, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 40317 + }, + { + "epoch": 0.40318, + "grad_norm": 0.8577204626712727, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 40318 + }, + { + "epoch": 0.40319, + "grad_norm": 1.1365576323001454, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 40319 + }, + { + "epoch": 0.4032, + "grad_norm": 0.9854788394744164, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 40320 + }, + { + "epoch": 0.40321, + "grad_norm": 0.7955499945851551, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 40321 + }, + { + "epoch": 0.40322, + "grad_norm": 0.6835352374642252, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 40322 + }, + { + "epoch": 0.40323, + "grad_norm": 0.5984626443309559, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 40323 + }, + { + "epoch": 0.40324, + "grad_norm": 0.5885358633612684, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 40324 + }, + { + "epoch": 0.40325, + "grad_norm": 0.5497612906282935, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 40325 + }, + { + "epoch": 0.40326, + "grad_norm": 0.5458421433956424, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 40326 + }, + { + "epoch": 0.40327, + "grad_norm": 0.5455411733283558, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 40327 + }, + { + "epoch": 0.40328, + "grad_norm": 0.5982104276652399, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 40328 + }, + { + "epoch": 0.40329, + "grad_norm": 0.6855486888239797, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 40329 + }, + { + "epoch": 0.4033, + "grad_norm": 0.7810319115365573, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 40330 + }, + { + "epoch": 0.40331, + "grad_norm": 0.8515297322244417, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 40331 + }, + { + "epoch": 0.40332, + "grad_norm": 0.7302279712201848, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 40332 + }, + { + "epoch": 0.40333, + "grad_norm": 0.6883280101817448, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 40333 + }, + { + "epoch": 0.40334, + "grad_norm": 0.681330075831331, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 40334 + }, + { + "epoch": 0.40335, + "grad_norm": 0.7333135536797755, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 40335 + }, + { + "epoch": 0.40336, + "grad_norm": 0.8947512806403763, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 40336 + }, + { + "epoch": 0.40337, + "grad_norm": 1.0680572684967102, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 40337 + }, + { + "epoch": 0.40338, + "grad_norm": 1.1768652060250773, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 40338 + }, + { + "epoch": 0.40339, + "grad_norm": 1.1058988085213932, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 40339 + }, + { + "epoch": 0.4034, + "grad_norm": 0.8945940307838679, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 40340 + }, + { + "epoch": 0.40341, + "grad_norm": 0.7698207233281685, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 40341 + }, + { + "epoch": 0.40342, + "grad_norm": 0.6947363881586862, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 40342 + }, + { + "epoch": 0.40343, + "grad_norm": 0.747409034139942, + "learning_rate": 0.003, + "loss": 4.018, + "step": 40343 + }, + { + "epoch": 0.40344, + "grad_norm": 0.8326390168787529, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 40344 + }, + { + "epoch": 0.40345, + "grad_norm": 0.8849153403317068, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 40345 + }, + { + "epoch": 0.40346, + "grad_norm": 0.9314065746254077, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 40346 + }, + { + "epoch": 0.40347, + "grad_norm": 1.1001644400297206, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 40347 + }, + { + "epoch": 0.40348, + "grad_norm": 0.9184135739746233, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 40348 + }, + { + "epoch": 0.40349, + "grad_norm": 0.9157648681305788, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 40349 + }, + { + "epoch": 0.4035, + "grad_norm": 1.0422034105562414, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 40350 + }, + { + "epoch": 0.40351, + "grad_norm": 1.1208340525766571, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 40351 + }, + { + "epoch": 0.40352, + "grad_norm": 0.8286167830150581, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 40352 + }, + { + "epoch": 0.40353, + "grad_norm": 0.8491326147055663, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 40353 + }, + { + "epoch": 0.40354, + "grad_norm": 0.8876260453491566, + "learning_rate": 0.003, + "loss": 4.04, + "step": 40354 + }, + { + "epoch": 0.40355, + "grad_norm": 1.0229544392168708, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 40355 + }, + { + "epoch": 0.40356, + "grad_norm": 1.0518093233376236, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 40356 + }, + { + "epoch": 0.40357, + "grad_norm": 0.8307212322723726, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 40357 + }, + { + "epoch": 0.40358, + "grad_norm": 0.7698011681201861, + "learning_rate": 0.003, + "loss": 4.035, + "step": 40358 + }, + { + "epoch": 0.40359, + "grad_norm": 0.7043162733303675, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 40359 + }, + { + "epoch": 0.4036, + "grad_norm": 0.6826339671116013, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 40360 + }, + { + "epoch": 0.40361, + "grad_norm": 0.7240696097853836, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 40361 + }, + { + "epoch": 0.40362, + "grad_norm": 0.8460320435810212, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 40362 + }, + { + "epoch": 0.40363, + "grad_norm": 0.970772226918229, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 40363 + }, + { + "epoch": 0.40364, + "grad_norm": 1.0615820830392795, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 40364 + }, + { + "epoch": 0.40365, + "grad_norm": 0.8995101740532825, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 40365 + }, + { + "epoch": 0.40366, + "grad_norm": 0.6758252926172729, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 40366 + }, + { + "epoch": 0.40367, + "grad_norm": 0.6383886737238892, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 40367 + }, + { + "epoch": 0.40368, + "grad_norm": 0.6846442445887517, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 40368 + }, + { + "epoch": 0.40369, + "grad_norm": 0.7701004207683876, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 40369 + }, + { + "epoch": 0.4037, + "grad_norm": 0.7641189703467606, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 40370 + }, + { + "epoch": 0.40371, + "grad_norm": 0.7897694556123869, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 40371 + }, + { + "epoch": 0.40372, + "grad_norm": 0.9477719216740801, + "learning_rate": 0.003, + "loss": 4.04, + "step": 40372 + }, + { + "epoch": 0.40373, + "grad_norm": 1.0621596093844974, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 40373 + }, + { + "epoch": 0.40374, + "grad_norm": 0.9057111427863623, + "learning_rate": 0.003, + "loss": 4.036, + "step": 40374 + }, + { + "epoch": 0.40375, + "grad_norm": 0.8283763736086356, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 40375 + }, + { + "epoch": 0.40376, + "grad_norm": 0.7612744600479716, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 40376 + }, + { + "epoch": 0.40377, + "grad_norm": 0.5775947980215246, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 40377 + }, + { + "epoch": 0.40378, + "grad_norm": 0.6566696997119805, + "learning_rate": 0.003, + "loss": 4.0597, + "step": 40378 + }, + { + "epoch": 0.40379, + "grad_norm": 0.6819048509622235, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 40379 + }, + { + "epoch": 0.4038, + "grad_norm": 0.7079624703406339, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 40380 + }, + { + "epoch": 0.40381, + "grad_norm": 0.7457876603218582, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 40381 + }, + { + "epoch": 0.40382, + "grad_norm": 0.8253721086848306, + "learning_rate": 0.003, + "loss": 4.027, + "step": 40382 + }, + { + "epoch": 0.40383, + "grad_norm": 0.880423451909034, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 40383 + }, + { + "epoch": 0.40384, + "grad_norm": 0.8488829811453291, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 40384 + }, + { + "epoch": 0.40385, + "grad_norm": 0.9086736681190093, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 40385 + }, + { + "epoch": 0.40386, + "grad_norm": 1.0067427910301656, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 40386 + }, + { + "epoch": 0.40387, + "grad_norm": 1.0071192725679707, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 40387 + }, + { + "epoch": 0.40388, + "grad_norm": 0.9057302586428068, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 40388 + }, + { + "epoch": 0.40389, + "grad_norm": 0.8911130814574153, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 40389 + }, + { + "epoch": 0.4039, + "grad_norm": 0.7827840423412274, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 40390 + }, + { + "epoch": 0.40391, + "grad_norm": 0.710656918123492, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 40391 + }, + { + "epoch": 0.40392, + "grad_norm": 0.7449495233117728, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 40392 + }, + { + "epoch": 0.40393, + "grad_norm": 0.7943412636954995, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 40393 + }, + { + "epoch": 0.40394, + "grad_norm": 0.7350243607045573, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 40394 + }, + { + "epoch": 0.40395, + "grad_norm": 0.7993860404490449, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 40395 + }, + { + "epoch": 0.40396, + "grad_norm": 0.8381124786752404, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 40396 + }, + { + "epoch": 0.40397, + "grad_norm": 0.9072649525830604, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 40397 + }, + { + "epoch": 0.40398, + "grad_norm": 1.073725092084696, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 40398 + }, + { + "epoch": 0.40399, + "grad_norm": 1.0272400029559656, + "learning_rate": 0.003, + "loss": 4.0758, + "step": 40399 + }, + { + "epoch": 0.404, + "grad_norm": 0.9254299923519196, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 40400 + }, + { + "epoch": 0.40401, + "grad_norm": 0.8969801582066533, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 40401 + }, + { + "epoch": 0.40402, + "grad_norm": 1.0154521912534575, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 40402 + }, + { + "epoch": 0.40403, + "grad_norm": 1.1216908729934025, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 40403 + }, + { + "epoch": 0.40404, + "grad_norm": 0.875160487726724, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 40404 + }, + { + "epoch": 0.40405, + "grad_norm": 0.8848583581592318, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 40405 + }, + { + "epoch": 0.40406, + "grad_norm": 0.9553721698927512, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 40406 + }, + { + "epoch": 0.40407, + "grad_norm": 0.93237221495692, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 40407 + }, + { + "epoch": 0.40408, + "grad_norm": 0.906828600289083, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 40408 + }, + { + "epoch": 0.40409, + "grad_norm": 0.9170771588637473, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 40409 + }, + { + "epoch": 0.4041, + "grad_norm": 0.8373349241741137, + "learning_rate": 0.003, + "loss": 4.045, + "step": 40410 + }, + { + "epoch": 0.40411, + "grad_norm": 0.8229004379239764, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 40411 + }, + { + "epoch": 0.40412, + "grad_norm": 0.950319615740957, + "learning_rate": 0.003, + "loss": 4.045, + "step": 40412 + }, + { + "epoch": 0.40413, + "grad_norm": 0.9219714119735934, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 40413 + }, + { + "epoch": 0.40414, + "grad_norm": 0.8667513380197038, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 40414 + }, + { + "epoch": 0.40415, + "grad_norm": 0.8314750896213515, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 40415 + }, + { + "epoch": 0.40416, + "grad_norm": 0.8234140034260011, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 40416 + }, + { + "epoch": 0.40417, + "grad_norm": 0.8497105534045447, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 40417 + }, + { + "epoch": 0.40418, + "grad_norm": 0.6999791741501713, + "learning_rate": 0.003, + "loss": 3.9766, + "step": 40418 + }, + { + "epoch": 0.40419, + "grad_norm": 0.6168189826660061, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 40419 + }, + { + "epoch": 0.4042, + "grad_norm": 0.5935175711452432, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 40420 + }, + { + "epoch": 0.40421, + "grad_norm": 0.5650021719129295, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 40421 + }, + { + "epoch": 0.40422, + "grad_norm": 0.5844613080794481, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 40422 + }, + { + "epoch": 0.40423, + "grad_norm": 0.5591872945965851, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 40423 + }, + { + "epoch": 0.40424, + "grad_norm": 0.5331874394432486, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 40424 + }, + { + "epoch": 0.40425, + "grad_norm": 0.4726481049997843, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 40425 + }, + { + "epoch": 0.40426, + "grad_norm": 0.5185729216235547, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 40426 + }, + { + "epoch": 0.40427, + "grad_norm": 0.6117643281960957, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 40427 + }, + { + "epoch": 0.40428, + "grad_norm": 0.6301186124762385, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 40428 + }, + { + "epoch": 0.40429, + "grad_norm": 0.6084606398296117, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 40429 + }, + { + "epoch": 0.4043, + "grad_norm": 0.8533234668050929, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 40430 + }, + { + "epoch": 0.40431, + "grad_norm": 1.1754145843826143, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 40431 + }, + { + "epoch": 0.40432, + "grad_norm": 0.7589463941680733, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 40432 + }, + { + "epoch": 0.40433, + "grad_norm": 0.6069200465549541, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 40433 + }, + { + "epoch": 0.40434, + "grad_norm": 0.5616880841260814, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 40434 + }, + { + "epoch": 0.40435, + "grad_norm": 0.6960486072992945, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 40435 + }, + { + "epoch": 0.40436, + "grad_norm": 1.0116442176073561, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 40436 + }, + { + "epoch": 0.40437, + "grad_norm": 1.117580773997503, + "learning_rate": 0.003, + "loss": 3.9785, + "step": 40437 + }, + { + "epoch": 0.40438, + "grad_norm": 0.8361525719455627, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 40438 + }, + { + "epoch": 0.40439, + "grad_norm": 0.9128852688996856, + "learning_rate": 0.003, + "loss": 4.0048, + "step": 40439 + }, + { + "epoch": 0.4044, + "grad_norm": 1.0894972017313491, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 40440 + }, + { + "epoch": 0.40441, + "grad_norm": 0.9779148914694062, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 40441 + }, + { + "epoch": 0.40442, + "grad_norm": 0.7977363843409132, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 40442 + }, + { + "epoch": 0.40443, + "grad_norm": 0.8042096368390105, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 40443 + }, + { + "epoch": 0.40444, + "grad_norm": 0.8995193297323695, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 40444 + }, + { + "epoch": 0.40445, + "grad_norm": 0.9555950749500092, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 40445 + }, + { + "epoch": 0.40446, + "grad_norm": 0.9807026743486132, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 40446 + }, + { + "epoch": 0.40447, + "grad_norm": 1.0021057658692307, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 40447 + }, + { + "epoch": 0.40448, + "grad_norm": 1.050356240592281, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 40448 + }, + { + "epoch": 0.40449, + "grad_norm": 1.05841553916658, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 40449 + }, + { + "epoch": 0.4045, + "grad_norm": 0.9930561193831076, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 40450 + }, + { + "epoch": 0.40451, + "grad_norm": 1.0041627355033504, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 40451 + }, + { + "epoch": 0.40452, + "grad_norm": 0.8258890391587952, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 40452 + }, + { + "epoch": 0.40453, + "grad_norm": 0.7717444891068446, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 40453 + }, + { + "epoch": 0.40454, + "grad_norm": 0.7208112692716727, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 40454 + }, + { + "epoch": 0.40455, + "grad_norm": 0.683262165506888, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 40455 + }, + { + "epoch": 0.40456, + "grad_norm": 0.655890208901634, + "learning_rate": 0.003, + "loss": 4.029, + "step": 40456 + }, + { + "epoch": 0.40457, + "grad_norm": 0.7012448926492795, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 40457 + }, + { + "epoch": 0.40458, + "grad_norm": 0.6859405494412313, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 40458 + }, + { + "epoch": 0.40459, + "grad_norm": 0.5786470720550242, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 40459 + }, + { + "epoch": 0.4046, + "grad_norm": 0.6180725968162006, + "learning_rate": 0.003, + "loss": 4.015, + "step": 40460 + }, + { + "epoch": 0.40461, + "grad_norm": 0.8467163538293735, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 40461 + }, + { + "epoch": 0.40462, + "grad_norm": 1.0520949749477229, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 40462 + }, + { + "epoch": 0.40463, + "grad_norm": 0.9168990569800771, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 40463 + }, + { + "epoch": 0.40464, + "grad_norm": 0.8531917541256011, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 40464 + }, + { + "epoch": 0.40465, + "grad_norm": 0.9085100099341425, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 40465 + }, + { + "epoch": 0.40466, + "grad_norm": 1.005183188366628, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 40466 + }, + { + "epoch": 0.40467, + "grad_norm": 0.9399920584577037, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 40467 + }, + { + "epoch": 0.40468, + "grad_norm": 0.907764745703909, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 40468 + }, + { + "epoch": 0.40469, + "grad_norm": 0.9618084043541386, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 40469 + }, + { + "epoch": 0.4047, + "grad_norm": 1.0425693123663, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 40470 + }, + { + "epoch": 0.40471, + "grad_norm": 1.0945834346837355, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 40471 + }, + { + "epoch": 0.40472, + "grad_norm": 0.896778794840225, + "learning_rate": 0.003, + "loss": 4.018, + "step": 40472 + }, + { + "epoch": 0.40473, + "grad_norm": 0.818432099923369, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 40473 + }, + { + "epoch": 0.40474, + "grad_norm": 0.9955193239934789, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 40474 + }, + { + "epoch": 0.40475, + "grad_norm": 1.2413674852189185, + "learning_rate": 0.003, + "loss": 4.037, + "step": 40475 + }, + { + "epoch": 0.40476, + "grad_norm": 0.8691920013716206, + "learning_rate": 0.003, + "loss": 4.0635, + "step": 40476 + }, + { + "epoch": 0.40477, + "grad_norm": 0.7813709028091708, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 40477 + }, + { + "epoch": 0.40478, + "grad_norm": 0.687143036716298, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 40478 + }, + { + "epoch": 0.40479, + "grad_norm": 0.5906265078210889, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 40479 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5736592329100647, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 40480 + }, + { + "epoch": 0.40481, + "grad_norm": 0.5856011826785127, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 40481 + }, + { + "epoch": 0.40482, + "grad_norm": 0.7131258790795455, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 40482 + }, + { + "epoch": 0.40483, + "grad_norm": 0.7591519709472924, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 40483 + }, + { + "epoch": 0.40484, + "grad_norm": 0.8040700091508342, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 40484 + }, + { + "epoch": 0.40485, + "grad_norm": 0.835567224015876, + "learning_rate": 0.003, + "loss": 4.0753, + "step": 40485 + }, + { + "epoch": 0.40486, + "grad_norm": 0.8971737491317144, + "learning_rate": 0.003, + "loss": 4.045, + "step": 40486 + }, + { + "epoch": 0.40487, + "grad_norm": 0.8836796334609982, + "learning_rate": 0.003, + "loss": 4.037, + "step": 40487 + }, + { + "epoch": 0.40488, + "grad_norm": 0.8603319328200554, + "learning_rate": 0.003, + "loss": 4.032, + "step": 40488 + }, + { + "epoch": 0.40489, + "grad_norm": 0.8147824174000035, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 40489 + }, + { + "epoch": 0.4049, + "grad_norm": 0.7580501075540387, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 40490 + }, + { + "epoch": 0.40491, + "grad_norm": 0.7993893400298053, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 40491 + }, + { + "epoch": 0.40492, + "grad_norm": 0.9559922473487256, + "learning_rate": 0.003, + "loss": 4.052, + "step": 40492 + }, + { + "epoch": 0.40493, + "grad_norm": 0.9997652983579914, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 40493 + }, + { + "epoch": 0.40494, + "grad_norm": 0.8835896772439319, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 40494 + }, + { + "epoch": 0.40495, + "grad_norm": 0.8255823093071889, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 40495 + }, + { + "epoch": 0.40496, + "grad_norm": 0.6717170171857599, + "learning_rate": 0.003, + "loss": 4.057, + "step": 40496 + }, + { + "epoch": 0.40497, + "grad_norm": 0.6326140383716203, + "learning_rate": 0.003, + "loss": 4.035, + "step": 40497 + }, + { + "epoch": 0.40498, + "grad_norm": 0.650289119334694, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 40498 + }, + { + "epoch": 0.40499, + "grad_norm": 0.7060388140773709, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 40499 + }, + { + "epoch": 0.405, + "grad_norm": 0.7403199287228714, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 40500 + }, + { + "epoch": 0.40501, + "grad_norm": 0.8168649634187917, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 40501 + }, + { + "epoch": 0.40502, + "grad_norm": 0.8120536344511433, + "learning_rate": 0.003, + "loss": 3.9894, + "step": 40502 + }, + { + "epoch": 0.40503, + "grad_norm": 0.7253142709111741, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 40503 + }, + { + "epoch": 0.40504, + "grad_norm": 0.7240480088608268, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 40504 + }, + { + "epoch": 0.40505, + "grad_norm": 0.6386350466416557, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 40505 + }, + { + "epoch": 0.40506, + "grad_norm": 0.6719678368363629, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 40506 + }, + { + "epoch": 0.40507, + "grad_norm": 0.7705324941713547, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 40507 + }, + { + "epoch": 0.40508, + "grad_norm": 0.800859521323462, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 40508 + }, + { + "epoch": 0.40509, + "grad_norm": 0.8089957441127332, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 40509 + }, + { + "epoch": 0.4051, + "grad_norm": 0.9356879089765386, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 40510 + }, + { + "epoch": 0.40511, + "grad_norm": 1.025041554155601, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 40511 + }, + { + "epoch": 0.40512, + "grad_norm": 0.890302792302846, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 40512 + }, + { + "epoch": 0.40513, + "grad_norm": 1.000565751839163, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 40513 + }, + { + "epoch": 0.40514, + "grad_norm": 1.1567194912316179, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 40514 + }, + { + "epoch": 0.40515, + "grad_norm": 0.8403733692265882, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 40515 + }, + { + "epoch": 0.40516, + "grad_norm": 0.762390875549954, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 40516 + }, + { + "epoch": 0.40517, + "grad_norm": 0.7646913064196337, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 40517 + }, + { + "epoch": 0.40518, + "grad_norm": 0.8998919167946234, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 40518 + }, + { + "epoch": 0.40519, + "grad_norm": 1.2235893884448483, + "learning_rate": 0.003, + "loss": 4.051, + "step": 40519 + }, + { + "epoch": 0.4052, + "grad_norm": 0.7666416601217003, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 40520 + }, + { + "epoch": 0.40521, + "grad_norm": 0.7417437133096468, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 40521 + }, + { + "epoch": 0.40522, + "grad_norm": 0.7683745674332696, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 40522 + }, + { + "epoch": 0.40523, + "grad_norm": 0.7534892924563364, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 40523 + }, + { + "epoch": 0.40524, + "grad_norm": 0.7187266667266279, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 40524 + }, + { + "epoch": 0.40525, + "grad_norm": 0.7297271969791725, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 40525 + }, + { + "epoch": 0.40526, + "grad_norm": 0.863411933502107, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 40526 + }, + { + "epoch": 0.40527, + "grad_norm": 1.0072732981529529, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 40527 + }, + { + "epoch": 0.40528, + "grad_norm": 1.1498553844314832, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 40528 + }, + { + "epoch": 0.40529, + "grad_norm": 0.801263011909251, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 40529 + }, + { + "epoch": 0.4053, + "grad_norm": 0.7476487598878427, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 40530 + }, + { + "epoch": 0.40531, + "grad_norm": 0.7987434775620748, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 40531 + }, + { + "epoch": 0.40532, + "grad_norm": 0.928952651451731, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 40532 + }, + { + "epoch": 0.40533, + "grad_norm": 1.1234616303099785, + "learning_rate": 0.003, + "loss": 4.055, + "step": 40533 + }, + { + "epoch": 0.40534, + "grad_norm": 0.8973601809680385, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 40534 + }, + { + "epoch": 0.40535, + "grad_norm": 0.8326041089194102, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 40535 + }, + { + "epoch": 0.40536, + "grad_norm": 0.7259614481549386, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 40536 + }, + { + "epoch": 0.40537, + "grad_norm": 0.6888845202330824, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 40537 + }, + { + "epoch": 0.40538, + "grad_norm": 0.6014375558915922, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 40538 + }, + { + "epoch": 0.40539, + "grad_norm": 0.6307961586108699, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 40539 + }, + { + "epoch": 0.4054, + "grad_norm": 0.6987566607116572, + "learning_rate": 0.003, + "loss": 4.051, + "step": 40540 + }, + { + "epoch": 0.40541, + "grad_norm": 0.9329809103725983, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 40541 + }, + { + "epoch": 0.40542, + "grad_norm": 1.1798580534083134, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 40542 + }, + { + "epoch": 0.40543, + "grad_norm": 0.9538896535613658, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 40543 + }, + { + "epoch": 0.40544, + "grad_norm": 0.7963920665740135, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 40544 + }, + { + "epoch": 0.40545, + "grad_norm": 0.7064366758676707, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 40545 + }, + { + "epoch": 0.40546, + "grad_norm": 0.662110769490856, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 40546 + }, + { + "epoch": 0.40547, + "grad_norm": 0.5233295379247399, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 40547 + }, + { + "epoch": 0.40548, + "grad_norm": 0.5413606965618916, + "learning_rate": 0.003, + "loss": 3.9944, + "step": 40548 + }, + { + "epoch": 0.40549, + "grad_norm": 0.5650003363425216, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 40549 + }, + { + "epoch": 0.4055, + "grad_norm": 0.6489682275815425, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 40550 + }, + { + "epoch": 0.40551, + "grad_norm": 0.7918901615269399, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 40551 + }, + { + "epoch": 0.40552, + "grad_norm": 0.9217071861260202, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 40552 + }, + { + "epoch": 0.40553, + "grad_norm": 1.03874511983708, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 40553 + }, + { + "epoch": 0.40554, + "grad_norm": 0.8915408788533142, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 40554 + }, + { + "epoch": 0.40555, + "grad_norm": 0.7945545959329372, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 40555 + }, + { + "epoch": 0.40556, + "grad_norm": 0.8202667980170005, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 40556 + }, + { + "epoch": 0.40557, + "grad_norm": 0.8772259203871463, + "learning_rate": 0.003, + "loss": 4.012, + "step": 40557 + }, + { + "epoch": 0.40558, + "grad_norm": 0.9239207030998172, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 40558 + }, + { + "epoch": 0.40559, + "grad_norm": 0.9644035382968068, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 40559 + }, + { + "epoch": 0.4056, + "grad_norm": 1.130310481601523, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 40560 + }, + { + "epoch": 0.40561, + "grad_norm": 0.8976782033888179, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 40561 + }, + { + "epoch": 0.40562, + "grad_norm": 0.8385564937307948, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 40562 + }, + { + "epoch": 0.40563, + "grad_norm": 0.8005045312710788, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 40563 + }, + { + "epoch": 0.40564, + "grad_norm": 0.683352863046562, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 40564 + }, + { + "epoch": 0.40565, + "grad_norm": 0.6451114189775812, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 40565 + }, + { + "epoch": 0.40566, + "grad_norm": 0.6980216772220135, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 40566 + }, + { + "epoch": 0.40567, + "grad_norm": 0.7409793090124818, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 40567 + }, + { + "epoch": 0.40568, + "grad_norm": 0.6434377025889135, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 40568 + }, + { + "epoch": 0.40569, + "grad_norm": 0.6019746455366226, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 40569 + }, + { + "epoch": 0.4057, + "grad_norm": 0.7088455037320082, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 40570 + }, + { + "epoch": 0.40571, + "grad_norm": 0.9059408439445751, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 40571 + }, + { + "epoch": 0.40572, + "grad_norm": 1.2563927063231544, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 40572 + }, + { + "epoch": 0.40573, + "grad_norm": 0.7621328093468056, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 40573 + }, + { + "epoch": 0.40574, + "grad_norm": 0.5929653083132198, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 40574 + }, + { + "epoch": 0.40575, + "grad_norm": 0.7989853396950962, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 40575 + }, + { + "epoch": 0.40576, + "grad_norm": 0.9924289721558043, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 40576 + }, + { + "epoch": 0.40577, + "grad_norm": 1.0222781533575527, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 40577 + }, + { + "epoch": 0.40578, + "grad_norm": 0.9891732868854751, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 40578 + }, + { + "epoch": 0.40579, + "grad_norm": 0.7869678597867742, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 40579 + }, + { + "epoch": 0.4058, + "grad_norm": 0.6583625762896089, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 40580 + }, + { + "epoch": 0.40581, + "grad_norm": 0.6708066814000967, + "learning_rate": 0.003, + "loss": 4.032, + "step": 40581 + }, + { + "epoch": 0.40582, + "grad_norm": 0.6957371779987331, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 40582 + }, + { + "epoch": 0.40583, + "grad_norm": 0.6803268007729619, + "learning_rate": 0.003, + "loss": 4.056, + "step": 40583 + }, + { + "epoch": 0.40584, + "grad_norm": 0.7292967174965306, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 40584 + }, + { + "epoch": 0.40585, + "grad_norm": 1.0856464963843602, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 40585 + }, + { + "epoch": 0.40586, + "grad_norm": 1.325527909740051, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 40586 + }, + { + "epoch": 0.40587, + "grad_norm": 0.6623949151733238, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 40587 + }, + { + "epoch": 0.40588, + "grad_norm": 0.742915184408894, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 40588 + }, + { + "epoch": 0.40589, + "grad_norm": 0.8055409039226644, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 40589 + }, + { + "epoch": 0.4059, + "grad_norm": 0.927109647222577, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 40590 + }, + { + "epoch": 0.40591, + "grad_norm": 1.041897640020531, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 40591 + }, + { + "epoch": 0.40592, + "grad_norm": 1.0312596659385138, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 40592 + }, + { + "epoch": 0.40593, + "grad_norm": 0.9928322051930432, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 40593 + }, + { + "epoch": 0.40594, + "grad_norm": 0.9070880986511168, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 40594 + }, + { + "epoch": 0.40595, + "grad_norm": 0.7761605342421717, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 40595 + }, + { + "epoch": 0.40596, + "grad_norm": 0.7819512306708455, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 40596 + }, + { + "epoch": 0.40597, + "grad_norm": 0.7954864474246699, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 40597 + }, + { + "epoch": 0.40598, + "grad_norm": 0.824912607416007, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 40598 + }, + { + "epoch": 0.40599, + "grad_norm": 0.8544130023139433, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 40599 + }, + { + "epoch": 0.406, + "grad_norm": 0.8410713818623994, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 40600 + }, + { + "epoch": 0.40601, + "grad_norm": 0.88056307402186, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 40601 + }, + { + "epoch": 0.40602, + "grad_norm": 0.9282320742658277, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 40602 + }, + { + "epoch": 0.40603, + "grad_norm": 1.0110307670755487, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 40603 + }, + { + "epoch": 0.40604, + "grad_norm": 1.0040526079788559, + "learning_rate": 0.003, + "loss": 4.037, + "step": 40604 + }, + { + "epoch": 0.40605, + "grad_norm": 1.0507367963068064, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 40605 + }, + { + "epoch": 0.40606, + "grad_norm": 0.876360302481889, + "learning_rate": 0.003, + "loss": 4.043, + "step": 40606 + }, + { + "epoch": 0.40607, + "grad_norm": 0.8337806788099126, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 40607 + }, + { + "epoch": 0.40608, + "grad_norm": 0.7606966772502363, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 40608 + }, + { + "epoch": 0.40609, + "grad_norm": 0.6787729673582923, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 40609 + }, + { + "epoch": 0.4061, + "grad_norm": 0.8218308533711198, + "learning_rate": 0.003, + "loss": 4.036, + "step": 40610 + }, + { + "epoch": 0.40611, + "grad_norm": 0.8571652559908324, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 40611 + }, + { + "epoch": 0.40612, + "grad_norm": 0.7930277116100504, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 40612 + }, + { + "epoch": 0.40613, + "grad_norm": 0.7140541819884468, + "learning_rate": 0.003, + "loss": 4.035, + "step": 40613 + }, + { + "epoch": 0.40614, + "grad_norm": 0.6408642167322689, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 40614 + }, + { + "epoch": 0.40615, + "grad_norm": 0.7045181563304858, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 40615 + }, + { + "epoch": 0.40616, + "grad_norm": 0.7695569244919401, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 40616 + }, + { + "epoch": 0.40617, + "grad_norm": 0.7746160227477299, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 40617 + }, + { + "epoch": 0.40618, + "grad_norm": 0.7576494506921554, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 40618 + }, + { + "epoch": 0.40619, + "grad_norm": 0.739605585510894, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 40619 + }, + { + "epoch": 0.4062, + "grad_norm": 0.7037645940277226, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 40620 + }, + { + "epoch": 0.40621, + "grad_norm": 0.6519636012346238, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 40621 + }, + { + "epoch": 0.40622, + "grad_norm": 0.736244084830229, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 40622 + }, + { + "epoch": 0.40623, + "grad_norm": 0.8408180090563959, + "learning_rate": 0.003, + "loss": 4.019, + "step": 40623 + }, + { + "epoch": 0.40624, + "grad_norm": 1.1094771579858271, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 40624 + }, + { + "epoch": 0.40625, + "grad_norm": 1.0443834073333753, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 40625 + }, + { + "epoch": 0.40626, + "grad_norm": 0.8173664069167892, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 40626 + }, + { + "epoch": 0.40627, + "grad_norm": 0.7626786311015811, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 40627 + }, + { + "epoch": 0.40628, + "grad_norm": 0.7292994851466237, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 40628 + }, + { + "epoch": 0.40629, + "grad_norm": 0.6592117743276267, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 40629 + }, + { + "epoch": 0.4063, + "grad_norm": 0.8263124412597801, + "learning_rate": 0.003, + "loss": 3.9832, + "step": 40630 + }, + { + "epoch": 0.40631, + "grad_norm": 0.8942270316450308, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 40631 + }, + { + "epoch": 0.40632, + "grad_norm": 0.9947485240250089, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 40632 + }, + { + "epoch": 0.40633, + "grad_norm": 0.9516351464806108, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 40633 + }, + { + "epoch": 0.40634, + "grad_norm": 1.0648689007943632, + "learning_rate": 0.003, + "loss": 4.05, + "step": 40634 + }, + { + "epoch": 0.40635, + "grad_norm": 1.1574071197130515, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 40635 + }, + { + "epoch": 0.40636, + "grad_norm": 0.9936328381014666, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 40636 + }, + { + "epoch": 0.40637, + "grad_norm": 0.9849483208631196, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 40637 + }, + { + "epoch": 0.40638, + "grad_norm": 0.9403912485877065, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 40638 + }, + { + "epoch": 0.40639, + "grad_norm": 0.7750294142378805, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 40639 + }, + { + "epoch": 0.4064, + "grad_norm": 0.6807450668048215, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 40640 + }, + { + "epoch": 0.40641, + "grad_norm": 0.63832029822281, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 40641 + }, + { + "epoch": 0.40642, + "grad_norm": 0.5835293898603916, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 40642 + }, + { + "epoch": 0.40643, + "grad_norm": 0.6458314319512517, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 40643 + }, + { + "epoch": 0.40644, + "grad_norm": 0.599844993393481, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 40644 + }, + { + "epoch": 0.40645, + "grad_norm": 0.5331968943895079, + "learning_rate": 0.003, + "loss": 4.0629, + "step": 40645 + }, + { + "epoch": 0.40646, + "grad_norm": 0.5694070149987412, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 40646 + }, + { + "epoch": 0.40647, + "grad_norm": 0.6211541825911381, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 40647 + }, + { + "epoch": 0.40648, + "grad_norm": 0.8410091364570149, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 40648 + }, + { + "epoch": 0.40649, + "grad_norm": 1.2224571224050682, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 40649 + }, + { + "epoch": 0.4065, + "grad_norm": 0.744379434121908, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 40650 + }, + { + "epoch": 0.40651, + "grad_norm": 0.5779789192157779, + "learning_rate": 0.003, + "loss": 4.02, + "step": 40651 + }, + { + "epoch": 0.40652, + "grad_norm": 0.5794129141474471, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 40652 + }, + { + "epoch": 0.40653, + "grad_norm": 0.6040302110103019, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 40653 + }, + { + "epoch": 0.40654, + "grad_norm": 0.7123636526345154, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 40654 + }, + { + "epoch": 0.40655, + "grad_norm": 0.7865709263853687, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 40655 + }, + { + "epoch": 0.40656, + "grad_norm": 0.7615545298528625, + "learning_rate": 0.003, + "loss": 4.045, + "step": 40656 + }, + { + "epoch": 0.40657, + "grad_norm": 0.7862273403548818, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 40657 + }, + { + "epoch": 0.40658, + "grad_norm": 0.8770609164714464, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 40658 + }, + { + "epoch": 0.40659, + "grad_norm": 1.0727421138910158, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 40659 + }, + { + "epoch": 0.4066, + "grad_norm": 1.0785448238473165, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 40660 + }, + { + "epoch": 0.40661, + "grad_norm": 0.9865492138462785, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 40661 + }, + { + "epoch": 0.40662, + "grad_norm": 1.0214522795596863, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 40662 + }, + { + "epoch": 0.40663, + "grad_norm": 0.9131128068315617, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 40663 + }, + { + "epoch": 0.40664, + "grad_norm": 0.9988111273155575, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 40664 + }, + { + "epoch": 0.40665, + "grad_norm": 1.1643127903076473, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 40665 + }, + { + "epoch": 0.40666, + "grad_norm": 0.8859832006759629, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 40666 + }, + { + "epoch": 0.40667, + "grad_norm": 0.8213877294426055, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 40667 + }, + { + "epoch": 0.40668, + "grad_norm": 0.8522775876180705, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 40668 + }, + { + "epoch": 0.40669, + "grad_norm": 1.0103305080281135, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 40669 + }, + { + "epoch": 0.4067, + "grad_norm": 1.0654623934691405, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 40670 + }, + { + "epoch": 0.40671, + "grad_norm": 0.9536484859362132, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 40671 + }, + { + "epoch": 0.40672, + "grad_norm": 0.8700740182787066, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 40672 + }, + { + "epoch": 0.40673, + "grad_norm": 0.8571669996428086, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 40673 + }, + { + "epoch": 0.40674, + "grad_norm": 0.8221710379568509, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 40674 + }, + { + "epoch": 0.40675, + "grad_norm": 0.8977370078865483, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 40675 + }, + { + "epoch": 0.40676, + "grad_norm": 0.8905292224861913, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 40676 + }, + { + "epoch": 0.40677, + "grad_norm": 0.9270666658896413, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 40677 + }, + { + "epoch": 0.40678, + "grad_norm": 1.031974208384142, + "learning_rate": 0.003, + "loss": 4.0677, + "step": 40678 + }, + { + "epoch": 0.40679, + "grad_norm": 1.0726580991900512, + "learning_rate": 0.003, + "loss": 4.0805, + "step": 40679 + }, + { + "epoch": 0.4068, + "grad_norm": 0.8677010107237402, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 40680 + }, + { + "epoch": 0.40681, + "grad_norm": 0.7440276075316133, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 40681 + }, + { + "epoch": 0.40682, + "grad_norm": 0.709685484586134, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 40682 + }, + { + "epoch": 0.40683, + "grad_norm": 0.7562804618940562, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 40683 + }, + { + "epoch": 0.40684, + "grad_norm": 0.693054619555085, + "learning_rate": 0.003, + "loss": 4.057, + "step": 40684 + }, + { + "epoch": 0.40685, + "grad_norm": 0.669426303407173, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 40685 + }, + { + "epoch": 0.40686, + "grad_norm": 0.6560446067003249, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 40686 + }, + { + "epoch": 0.40687, + "grad_norm": 0.7058924727556057, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 40687 + }, + { + "epoch": 0.40688, + "grad_norm": 0.7524678292807517, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 40688 + }, + { + "epoch": 0.40689, + "grad_norm": 1.0279807762179993, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 40689 + }, + { + "epoch": 0.4069, + "grad_norm": 1.363429715803741, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 40690 + }, + { + "epoch": 0.40691, + "grad_norm": 0.7644139442810512, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 40691 + }, + { + "epoch": 0.40692, + "grad_norm": 0.683142949407516, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 40692 + }, + { + "epoch": 0.40693, + "grad_norm": 0.7274740294968437, + "learning_rate": 0.003, + "loss": 4.044, + "step": 40693 + }, + { + "epoch": 0.40694, + "grad_norm": 0.7615615645374336, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 40694 + }, + { + "epoch": 0.40695, + "grad_norm": 0.6215035693541652, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 40695 + }, + { + "epoch": 0.40696, + "grad_norm": 0.536911366950701, + "learning_rate": 0.003, + "loss": 3.9956, + "step": 40696 + }, + { + "epoch": 0.40697, + "grad_norm": 0.46948610830292786, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 40697 + }, + { + "epoch": 0.40698, + "grad_norm": 0.499523405221516, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 40698 + }, + { + "epoch": 0.40699, + "grad_norm": 0.4565794048328924, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 40699 + }, + { + "epoch": 0.407, + "grad_norm": 0.4484986026253502, + "learning_rate": 0.003, + "loss": 3.9948, + "step": 40700 + }, + { + "epoch": 0.40701, + "grad_norm": 0.48943116941652987, + "learning_rate": 0.003, + "loss": 4.017, + "step": 40701 + }, + { + "epoch": 0.40702, + "grad_norm": 0.5697932852621088, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 40702 + }, + { + "epoch": 0.40703, + "grad_norm": 0.7785870091881888, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 40703 + }, + { + "epoch": 0.40704, + "grad_norm": 1.1152929389848674, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 40704 + }, + { + "epoch": 0.40705, + "grad_norm": 1.1146079462026566, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 40705 + }, + { + "epoch": 0.40706, + "grad_norm": 0.7478694646844233, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 40706 + }, + { + "epoch": 0.40707, + "grad_norm": 0.7766488750637294, + "learning_rate": 0.003, + "loss": 3.9928, + "step": 40707 + }, + { + "epoch": 0.40708, + "grad_norm": 0.9487292650471946, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 40708 + }, + { + "epoch": 0.40709, + "grad_norm": 1.0633262537438446, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 40709 + }, + { + "epoch": 0.4071, + "grad_norm": 1.0353033314544273, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 40710 + }, + { + "epoch": 0.40711, + "grad_norm": 1.0298251296628032, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 40711 + }, + { + "epoch": 0.40712, + "grad_norm": 0.9090386339717332, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 40712 + }, + { + "epoch": 0.40713, + "grad_norm": 0.8419454355574181, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 40713 + }, + { + "epoch": 0.40714, + "grad_norm": 0.788246198224765, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 40714 + }, + { + "epoch": 0.40715, + "grad_norm": 0.772953306265508, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 40715 + }, + { + "epoch": 0.40716, + "grad_norm": 0.7610069920625087, + "learning_rate": 0.003, + "loss": 4.036, + "step": 40716 + }, + { + "epoch": 0.40717, + "grad_norm": 0.7588809389883687, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 40717 + }, + { + "epoch": 0.40718, + "grad_norm": 0.7538293917867337, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 40718 + }, + { + "epoch": 0.40719, + "grad_norm": 0.7700880764900533, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 40719 + }, + { + "epoch": 0.4072, + "grad_norm": 0.7158224855303892, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 40720 + }, + { + "epoch": 0.40721, + "grad_norm": 0.746453551294441, + "learning_rate": 0.003, + "loss": 4.054, + "step": 40721 + }, + { + "epoch": 0.40722, + "grad_norm": 0.7478786558937991, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 40722 + }, + { + "epoch": 0.40723, + "grad_norm": 0.9027017531612992, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 40723 + }, + { + "epoch": 0.40724, + "grad_norm": 1.2400389548070767, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 40724 + }, + { + "epoch": 0.40725, + "grad_norm": 0.8194432012546754, + "learning_rate": 0.003, + "loss": 4.0731, + "step": 40725 + }, + { + "epoch": 0.40726, + "grad_norm": 0.7153695687720698, + "learning_rate": 0.003, + "loss": 3.9987, + "step": 40726 + }, + { + "epoch": 0.40727, + "grad_norm": 0.6679007461022355, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 40727 + }, + { + "epoch": 0.40728, + "grad_norm": 0.7063755062276852, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 40728 + }, + { + "epoch": 0.40729, + "grad_norm": 0.7714256543381556, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 40729 + }, + { + "epoch": 0.4073, + "grad_norm": 0.923080094148508, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 40730 + }, + { + "epoch": 0.40731, + "grad_norm": 0.996446632540524, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 40731 + }, + { + "epoch": 0.40732, + "grad_norm": 1.015013379545939, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 40732 + }, + { + "epoch": 0.40733, + "grad_norm": 0.9804019568402763, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 40733 + }, + { + "epoch": 0.40734, + "grad_norm": 0.9875088905594354, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 40734 + }, + { + "epoch": 0.40735, + "grad_norm": 1.1135358518323395, + "learning_rate": 0.003, + "loss": 4.0675, + "step": 40735 + }, + { + "epoch": 0.40736, + "grad_norm": 0.8738212844157298, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 40736 + }, + { + "epoch": 0.40737, + "grad_norm": 0.9931736271882313, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 40737 + }, + { + "epoch": 0.40738, + "grad_norm": 1.0776389765583059, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 40738 + }, + { + "epoch": 0.40739, + "grad_norm": 0.9828839571907081, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 40739 + }, + { + "epoch": 0.4074, + "grad_norm": 1.1042558888698764, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 40740 + }, + { + "epoch": 0.40741, + "grad_norm": 0.9575430223302421, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 40741 + }, + { + "epoch": 0.40742, + "grad_norm": 0.9791789088978395, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 40742 + }, + { + "epoch": 0.40743, + "grad_norm": 0.9214793046820784, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 40743 + }, + { + "epoch": 0.40744, + "grad_norm": 0.8769708038259765, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 40744 + }, + { + "epoch": 0.40745, + "grad_norm": 0.8447939295138118, + "learning_rate": 0.003, + "loss": 4.032, + "step": 40745 + }, + { + "epoch": 0.40746, + "grad_norm": 0.9587013716766315, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 40746 + }, + { + "epoch": 0.40747, + "grad_norm": 0.9529982434138514, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 40747 + }, + { + "epoch": 0.40748, + "grad_norm": 1.0612590550307242, + "learning_rate": 0.003, + "loss": 4.042, + "step": 40748 + }, + { + "epoch": 0.40749, + "grad_norm": 1.0039843080749102, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 40749 + }, + { + "epoch": 0.4075, + "grad_norm": 0.8596216429243799, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 40750 + }, + { + "epoch": 0.40751, + "grad_norm": 0.8636548093538, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 40751 + }, + { + "epoch": 0.40752, + "grad_norm": 0.8078459091692346, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 40752 + }, + { + "epoch": 0.40753, + "grad_norm": 0.860010774094071, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 40753 + }, + { + "epoch": 0.40754, + "grad_norm": 0.8754819362209557, + "learning_rate": 0.003, + "loss": 4.0886, + "step": 40754 + }, + { + "epoch": 0.40755, + "grad_norm": 0.8203664630845963, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 40755 + }, + { + "epoch": 0.40756, + "grad_norm": 0.8480050633668286, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 40756 + }, + { + "epoch": 0.40757, + "grad_norm": 0.7854659374684668, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 40757 + }, + { + "epoch": 0.40758, + "grad_norm": 0.7971579950775693, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 40758 + }, + { + "epoch": 0.40759, + "grad_norm": 0.8685890695718128, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 40759 + }, + { + "epoch": 0.4076, + "grad_norm": 0.9693084272553204, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 40760 + }, + { + "epoch": 0.40761, + "grad_norm": 1.0762322161951616, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 40761 + }, + { + "epoch": 0.40762, + "grad_norm": 0.9066384836794253, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 40762 + }, + { + "epoch": 0.40763, + "grad_norm": 0.7774453591753624, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 40763 + }, + { + "epoch": 0.40764, + "grad_norm": 0.6037438172513471, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 40764 + }, + { + "epoch": 0.40765, + "grad_norm": 0.5256489806976825, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 40765 + }, + { + "epoch": 0.40766, + "grad_norm": 0.47735203133313897, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 40766 + }, + { + "epoch": 0.40767, + "grad_norm": 0.5304793698008422, + "learning_rate": 0.003, + "loss": 4.011, + "step": 40767 + }, + { + "epoch": 0.40768, + "grad_norm": 0.6234487160627705, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 40768 + }, + { + "epoch": 0.40769, + "grad_norm": 0.7800924413861733, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 40769 + }, + { + "epoch": 0.4077, + "grad_norm": 0.7842176743161323, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 40770 + }, + { + "epoch": 0.40771, + "grad_norm": 0.8361900243429635, + "learning_rate": 0.003, + "loss": 3.9958, + "step": 40771 + }, + { + "epoch": 0.40772, + "grad_norm": 0.8983854682485343, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 40772 + }, + { + "epoch": 0.40773, + "grad_norm": 0.871375068979319, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 40773 + }, + { + "epoch": 0.40774, + "grad_norm": 0.9997472490660584, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 40774 + }, + { + "epoch": 0.40775, + "grad_norm": 0.9658971327077182, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 40775 + }, + { + "epoch": 0.40776, + "grad_norm": 0.6907891534000522, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 40776 + }, + { + "epoch": 0.40777, + "grad_norm": 0.7108130205554818, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 40777 + }, + { + "epoch": 0.40778, + "grad_norm": 0.8073088056866252, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 40778 + }, + { + "epoch": 0.40779, + "grad_norm": 0.85053713656124, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 40779 + }, + { + "epoch": 0.4078, + "grad_norm": 1.0057565995294278, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 40780 + }, + { + "epoch": 0.40781, + "grad_norm": 1.0050099631146916, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 40781 + }, + { + "epoch": 0.40782, + "grad_norm": 0.8352610969788128, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 40782 + }, + { + "epoch": 0.40783, + "grad_norm": 0.6455520638626876, + "learning_rate": 0.003, + "loss": 4.036, + "step": 40783 + }, + { + "epoch": 0.40784, + "grad_norm": 0.6801885269829244, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 40784 + }, + { + "epoch": 0.40785, + "grad_norm": 0.7184108729576679, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 40785 + }, + { + "epoch": 0.40786, + "grad_norm": 0.7818205023620498, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 40786 + }, + { + "epoch": 0.40787, + "grad_norm": 0.8546638635632696, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 40787 + }, + { + "epoch": 0.40788, + "grad_norm": 0.8876988140329735, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 40788 + }, + { + "epoch": 0.40789, + "grad_norm": 0.9123653743957798, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 40789 + }, + { + "epoch": 0.4079, + "grad_norm": 0.8766981865236388, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 40790 + }, + { + "epoch": 0.40791, + "grad_norm": 0.8586079651578068, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 40791 + }, + { + "epoch": 0.40792, + "grad_norm": 0.787164544320404, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 40792 + }, + { + "epoch": 0.40793, + "grad_norm": 0.9342748325390651, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 40793 + }, + { + "epoch": 0.40794, + "grad_norm": 1.1236592085622887, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 40794 + }, + { + "epoch": 0.40795, + "grad_norm": 0.9077161089923367, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 40795 + }, + { + "epoch": 0.40796, + "grad_norm": 0.8667983010783505, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 40796 + }, + { + "epoch": 0.40797, + "grad_norm": 0.9240196999317687, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 40797 + }, + { + "epoch": 0.40798, + "grad_norm": 1.033175091945612, + "learning_rate": 0.003, + "loss": 4.037, + "step": 40798 + }, + { + "epoch": 0.40799, + "grad_norm": 0.9169769415427773, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 40799 + }, + { + "epoch": 0.408, + "grad_norm": 0.7685390105085074, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 40800 + }, + { + "epoch": 0.40801, + "grad_norm": 0.5728425509983573, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 40801 + }, + { + "epoch": 0.40802, + "grad_norm": 0.5684913854298054, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 40802 + }, + { + "epoch": 0.40803, + "grad_norm": 0.48637170227239473, + "learning_rate": 0.003, + "loss": 4.015, + "step": 40803 + }, + { + "epoch": 0.40804, + "grad_norm": 0.5343918096720105, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 40804 + }, + { + "epoch": 0.40805, + "grad_norm": 0.5288483180937478, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 40805 + }, + { + "epoch": 0.40806, + "grad_norm": 0.6460984289937696, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 40806 + }, + { + "epoch": 0.40807, + "grad_norm": 0.7002561267715006, + "learning_rate": 0.003, + "loss": 3.9763, + "step": 40807 + }, + { + "epoch": 0.40808, + "grad_norm": 0.6923095185201152, + "learning_rate": 0.003, + "loss": 4.006, + "step": 40808 + }, + { + "epoch": 0.40809, + "grad_norm": 0.7233571127668211, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 40809 + }, + { + "epoch": 0.4081, + "grad_norm": 0.8899407811624944, + "learning_rate": 0.003, + "loss": 4.017, + "step": 40810 + }, + { + "epoch": 0.40811, + "grad_norm": 0.9396445121337709, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 40811 + }, + { + "epoch": 0.40812, + "grad_norm": 1.0975694746260647, + "learning_rate": 0.003, + "loss": 4.001, + "step": 40812 + }, + { + "epoch": 0.40813, + "grad_norm": 0.9846900440940011, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 40813 + }, + { + "epoch": 0.40814, + "grad_norm": 0.9042282146670075, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 40814 + }, + { + "epoch": 0.40815, + "grad_norm": 0.9054297134605257, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 40815 + }, + { + "epoch": 0.40816, + "grad_norm": 0.9398414333350806, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 40816 + }, + { + "epoch": 0.40817, + "grad_norm": 0.8982517863508153, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 40817 + }, + { + "epoch": 0.40818, + "grad_norm": 0.9073855286127029, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 40818 + }, + { + "epoch": 0.40819, + "grad_norm": 0.8692833955023627, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 40819 + }, + { + "epoch": 0.4082, + "grad_norm": 0.8178447853702111, + "learning_rate": 0.003, + "loss": 4.05, + "step": 40820 + }, + { + "epoch": 0.40821, + "grad_norm": 0.772043204882941, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 40821 + }, + { + "epoch": 0.40822, + "grad_norm": 0.7335421215698175, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 40822 + }, + { + "epoch": 0.40823, + "grad_norm": 0.6925185681495892, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 40823 + }, + { + "epoch": 0.40824, + "grad_norm": 0.6083586508830244, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 40824 + }, + { + "epoch": 0.40825, + "grad_norm": 0.6828439091803195, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 40825 + }, + { + "epoch": 0.40826, + "grad_norm": 0.872302513527167, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 40826 + }, + { + "epoch": 0.40827, + "grad_norm": 1.1613106188193976, + "learning_rate": 0.003, + "loss": 4.015, + "step": 40827 + }, + { + "epoch": 0.40828, + "grad_norm": 0.9586173701775971, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 40828 + }, + { + "epoch": 0.40829, + "grad_norm": 0.86898085395113, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 40829 + }, + { + "epoch": 0.4083, + "grad_norm": 0.7558543024219008, + "learning_rate": 0.003, + "loss": 4.044, + "step": 40830 + }, + { + "epoch": 0.40831, + "grad_norm": 0.8157511607137323, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 40831 + }, + { + "epoch": 0.40832, + "grad_norm": 1.0374005648501312, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 40832 + }, + { + "epoch": 0.40833, + "grad_norm": 1.0676765320862374, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 40833 + }, + { + "epoch": 0.40834, + "grad_norm": 0.8335568315754923, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 40834 + }, + { + "epoch": 0.40835, + "grad_norm": 0.7229595827668133, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 40835 + }, + { + "epoch": 0.40836, + "grad_norm": 0.689562843727104, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 40836 + }, + { + "epoch": 0.40837, + "grad_norm": 0.7956089516182507, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 40837 + }, + { + "epoch": 0.40838, + "grad_norm": 0.7754240461457443, + "learning_rate": 0.003, + "loss": 4.005, + "step": 40838 + }, + { + "epoch": 0.40839, + "grad_norm": 0.8044351824454331, + "learning_rate": 0.003, + "loss": 4.044, + "step": 40839 + }, + { + "epoch": 0.4084, + "grad_norm": 0.7920469052911953, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 40840 + }, + { + "epoch": 0.40841, + "grad_norm": 0.8525497769774544, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 40841 + }, + { + "epoch": 0.40842, + "grad_norm": 0.7488448962189167, + "learning_rate": 0.003, + "loss": 4.04, + "step": 40842 + }, + { + "epoch": 0.40843, + "grad_norm": 0.790726215502166, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 40843 + }, + { + "epoch": 0.40844, + "grad_norm": 0.74283019158228, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 40844 + }, + { + "epoch": 0.40845, + "grad_norm": 0.7217166827796646, + "learning_rate": 0.003, + "loss": 4.021, + "step": 40845 + }, + { + "epoch": 0.40846, + "grad_norm": 0.6873276029008326, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 40846 + }, + { + "epoch": 0.40847, + "grad_norm": 0.677575633910664, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 40847 + }, + { + "epoch": 0.40848, + "grad_norm": 0.8270433392929029, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 40848 + }, + { + "epoch": 0.40849, + "grad_norm": 1.2700307395182895, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 40849 + }, + { + "epoch": 0.4085, + "grad_norm": 1.0108062985655941, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 40850 + }, + { + "epoch": 0.40851, + "grad_norm": 0.8890623299272633, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 40851 + }, + { + "epoch": 0.40852, + "grad_norm": 0.7760910057985854, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 40852 + }, + { + "epoch": 0.40853, + "grad_norm": 0.6997054842406879, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 40853 + }, + { + "epoch": 0.40854, + "grad_norm": 0.6336491242417774, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 40854 + }, + { + "epoch": 0.40855, + "grad_norm": 0.6357086023906137, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 40855 + }, + { + "epoch": 0.40856, + "grad_norm": 0.676336452392408, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 40856 + }, + { + "epoch": 0.40857, + "grad_norm": 0.682838087777738, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 40857 + }, + { + "epoch": 0.40858, + "grad_norm": 0.8085604094482841, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 40858 + }, + { + "epoch": 0.40859, + "grad_norm": 0.8399421156207374, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 40859 + }, + { + "epoch": 0.4086, + "grad_norm": 0.7693234076905623, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 40860 + }, + { + "epoch": 0.40861, + "grad_norm": 0.8385320664734098, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 40861 + }, + { + "epoch": 0.40862, + "grad_norm": 0.7618347377928338, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 40862 + }, + { + "epoch": 0.40863, + "grad_norm": 0.8466678141981029, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 40863 + }, + { + "epoch": 0.40864, + "grad_norm": 0.9916524186535738, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 40864 + }, + { + "epoch": 0.40865, + "grad_norm": 1.1598098045448408, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 40865 + }, + { + "epoch": 0.40866, + "grad_norm": 0.7962140940107728, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 40866 + }, + { + "epoch": 0.40867, + "grad_norm": 0.6885706730153588, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 40867 + }, + { + "epoch": 0.40868, + "grad_norm": 0.7173568829092994, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 40868 + }, + { + "epoch": 0.40869, + "grad_norm": 0.7152771588755302, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 40869 + }, + { + "epoch": 0.4087, + "grad_norm": 0.6807372977907474, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 40870 + }, + { + "epoch": 0.40871, + "grad_norm": 0.813300487535765, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 40871 + }, + { + "epoch": 0.40872, + "grad_norm": 0.9804278943362238, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 40872 + }, + { + "epoch": 0.40873, + "grad_norm": 1.2510809695337584, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 40873 + }, + { + "epoch": 0.40874, + "grad_norm": 0.8730818673426651, + "learning_rate": 0.003, + "loss": 4.036, + "step": 40874 + }, + { + "epoch": 0.40875, + "grad_norm": 0.7188063241536783, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 40875 + }, + { + "epoch": 0.40876, + "grad_norm": 0.8008977919142061, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 40876 + }, + { + "epoch": 0.40877, + "grad_norm": 0.871092283026392, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 40877 + }, + { + "epoch": 0.40878, + "grad_norm": 0.9225241654376999, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 40878 + }, + { + "epoch": 0.40879, + "grad_norm": 0.9816451531532937, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 40879 + }, + { + "epoch": 0.4088, + "grad_norm": 1.063164779733969, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 40880 + }, + { + "epoch": 0.40881, + "grad_norm": 0.9380730967742764, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 40881 + }, + { + "epoch": 0.40882, + "grad_norm": 0.9553071861153347, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 40882 + }, + { + "epoch": 0.40883, + "grad_norm": 0.8618725115244718, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 40883 + }, + { + "epoch": 0.40884, + "grad_norm": 0.8068410746960385, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 40884 + }, + { + "epoch": 0.40885, + "grad_norm": 0.814150839491807, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 40885 + }, + { + "epoch": 0.40886, + "grad_norm": 0.7393973241159452, + "learning_rate": 0.003, + "loss": 4.031, + "step": 40886 + }, + { + "epoch": 0.40887, + "grad_norm": 0.8415990164280072, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 40887 + }, + { + "epoch": 0.40888, + "grad_norm": 0.8816361736155715, + "learning_rate": 0.003, + "loss": 4.0013, + "step": 40888 + }, + { + "epoch": 0.40889, + "grad_norm": 1.0268852439325324, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 40889 + }, + { + "epoch": 0.4089, + "grad_norm": 1.1166483354426966, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 40890 + }, + { + "epoch": 0.40891, + "grad_norm": 0.8791067204472717, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 40891 + }, + { + "epoch": 0.40892, + "grad_norm": 0.7331833000314836, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 40892 + }, + { + "epoch": 0.40893, + "grad_norm": 0.8115937697730363, + "learning_rate": 0.003, + "loss": 3.9954, + "step": 40893 + }, + { + "epoch": 0.40894, + "grad_norm": 0.9457753769349042, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 40894 + }, + { + "epoch": 0.40895, + "grad_norm": 0.9851575020243899, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 40895 + }, + { + "epoch": 0.40896, + "grad_norm": 0.8868708687185959, + "learning_rate": 0.003, + "loss": 4.0725, + "step": 40896 + }, + { + "epoch": 0.40897, + "grad_norm": 0.7986660081373802, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 40897 + }, + { + "epoch": 0.40898, + "grad_norm": 0.8295073697629808, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 40898 + }, + { + "epoch": 0.40899, + "grad_norm": 0.8287534881438012, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 40899 + }, + { + "epoch": 0.409, + "grad_norm": 0.7235474533107421, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 40900 + }, + { + "epoch": 0.40901, + "grad_norm": 0.7403557674485564, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 40901 + }, + { + "epoch": 0.40902, + "grad_norm": 0.7646719211484271, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 40902 + }, + { + "epoch": 0.40903, + "grad_norm": 0.7899219244180721, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 40903 + }, + { + "epoch": 0.40904, + "grad_norm": 0.764390593343992, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 40904 + }, + { + "epoch": 0.40905, + "grad_norm": 0.6815835920625857, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 40905 + }, + { + "epoch": 0.40906, + "grad_norm": 0.7793727569659127, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 40906 + }, + { + "epoch": 0.40907, + "grad_norm": 0.9036220930774713, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 40907 + }, + { + "epoch": 0.40908, + "grad_norm": 1.0300143245373437, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 40908 + }, + { + "epoch": 0.40909, + "grad_norm": 1.1355178490098536, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 40909 + }, + { + "epoch": 0.4091, + "grad_norm": 0.825242872430475, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 40910 + }, + { + "epoch": 0.40911, + "grad_norm": 0.8151850327129223, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 40911 + }, + { + "epoch": 0.40912, + "grad_norm": 0.8821718778303121, + "learning_rate": 0.003, + "loss": 4.016, + "step": 40912 + }, + { + "epoch": 0.40913, + "grad_norm": 0.8761668151612025, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 40913 + }, + { + "epoch": 0.40914, + "grad_norm": 0.8902083141017818, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 40914 + }, + { + "epoch": 0.40915, + "grad_norm": 0.8143259680548188, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 40915 + }, + { + "epoch": 0.40916, + "grad_norm": 0.7724976073395788, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 40916 + }, + { + "epoch": 0.40917, + "grad_norm": 0.7758044644534762, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 40917 + }, + { + "epoch": 0.40918, + "grad_norm": 0.8223764815706067, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 40918 + }, + { + "epoch": 0.40919, + "grad_norm": 1.0597364128617675, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 40919 + }, + { + "epoch": 0.4092, + "grad_norm": 0.9981230235250073, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 40920 + }, + { + "epoch": 0.40921, + "grad_norm": 1.0131767781150607, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 40921 + }, + { + "epoch": 0.40922, + "grad_norm": 0.9566504761982882, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 40922 + }, + { + "epoch": 0.40923, + "grad_norm": 0.8359046729360927, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 40923 + }, + { + "epoch": 0.40924, + "grad_norm": 0.7409640681161732, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 40924 + }, + { + "epoch": 0.40925, + "grad_norm": 0.6643388897014562, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 40925 + }, + { + "epoch": 0.40926, + "grad_norm": 0.6009394587520659, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 40926 + }, + { + "epoch": 0.40927, + "grad_norm": 0.6374749995534023, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 40927 + }, + { + "epoch": 0.40928, + "grad_norm": 0.6762480098377844, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 40928 + }, + { + "epoch": 0.40929, + "grad_norm": 0.6895317022057029, + "learning_rate": 0.003, + "loss": 3.981, + "step": 40929 + }, + { + "epoch": 0.4093, + "grad_norm": 0.687323215243416, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 40930 + }, + { + "epoch": 0.40931, + "grad_norm": 0.7361841862955574, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 40931 + }, + { + "epoch": 0.40932, + "grad_norm": 0.8348238341823646, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 40932 + }, + { + "epoch": 0.40933, + "grad_norm": 0.8093049101229601, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 40933 + }, + { + "epoch": 0.40934, + "grad_norm": 0.7165342128535132, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 40934 + }, + { + "epoch": 0.40935, + "grad_norm": 0.6493651526010146, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 40935 + }, + { + "epoch": 0.40936, + "grad_norm": 0.8473468140190036, + "learning_rate": 0.003, + "loss": 4.054, + "step": 40936 + }, + { + "epoch": 0.40937, + "grad_norm": 0.932034138157833, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 40937 + }, + { + "epoch": 0.40938, + "grad_norm": 1.002635317767649, + "learning_rate": 0.003, + "loss": 4.036, + "step": 40938 + }, + { + "epoch": 0.40939, + "grad_norm": 0.9080302013626591, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 40939 + }, + { + "epoch": 0.4094, + "grad_norm": 0.8440677690256484, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 40940 + }, + { + "epoch": 0.40941, + "grad_norm": 0.8933926959077878, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 40941 + }, + { + "epoch": 0.40942, + "grad_norm": 0.8996300081552084, + "learning_rate": 0.003, + "loss": 4.031, + "step": 40942 + }, + { + "epoch": 0.40943, + "grad_norm": 0.912249942182995, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 40943 + }, + { + "epoch": 0.40944, + "grad_norm": 0.9488003723722807, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 40944 + }, + { + "epoch": 0.40945, + "grad_norm": 0.9260879590238404, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 40945 + }, + { + "epoch": 0.40946, + "grad_norm": 0.996321912257887, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 40946 + }, + { + "epoch": 0.40947, + "grad_norm": 0.9606665324945317, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 40947 + }, + { + "epoch": 0.40948, + "grad_norm": 0.914864063918192, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 40948 + }, + { + "epoch": 0.40949, + "grad_norm": 0.874356609315819, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 40949 + }, + { + "epoch": 0.4095, + "grad_norm": 0.9318885940013885, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 40950 + }, + { + "epoch": 0.40951, + "grad_norm": 0.9625260228124511, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 40951 + }, + { + "epoch": 0.40952, + "grad_norm": 0.9255032855139605, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 40952 + }, + { + "epoch": 0.40953, + "grad_norm": 0.9002624487460023, + "learning_rate": 0.003, + "loss": 4.045, + "step": 40953 + }, + { + "epoch": 0.40954, + "grad_norm": 0.9077057468900199, + "learning_rate": 0.003, + "loss": 4.06, + "step": 40954 + }, + { + "epoch": 0.40955, + "grad_norm": 1.0730235340239112, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 40955 + }, + { + "epoch": 0.40956, + "grad_norm": 0.8837779089441214, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 40956 + }, + { + "epoch": 0.40957, + "grad_norm": 0.8262947796056652, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 40957 + }, + { + "epoch": 0.40958, + "grad_norm": 0.8667044414373618, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 40958 + }, + { + "epoch": 0.40959, + "grad_norm": 0.9137314584401689, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 40959 + }, + { + "epoch": 0.4096, + "grad_norm": 0.9060916313022457, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 40960 + }, + { + "epoch": 0.40961, + "grad_norm": 0.894329892740314, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 40961 + }, + { + "epoch": 0.40962, + "grad_norm": 0.7859880723344137, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 40962 + }, + { + "epoch": 0.40963, + "grad_norm": 0.6723926452368492, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 40963 + }, + { + "epoch": 0.40964, + "grad_norm": 0.6208198377619079, + "learning_rate": 0.003, + "loss": 4.034, + "step": 40964 + }, + { + "epoch": 0.40965, + "grad_norm": 0.5743738080607017, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 40965 + }, + { + "epoch": 0.40966, + "grad_norm": 0.5669184899535666, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 40966 + }, + { + "epoch": 0.40967, + "grad_norm": 0.5797936958270055, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 40967 + }, + { + "epoch": 0.40968, + "grad_norm": 0.5650655913872386, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 40968 + }, + { + "epoch": 0.40969, + "grad_norm": 0.5735464498482236, + "learning_rate": 0.003, + "loss": 4.021, + "step": 40969 + }, + { + "epoch": 0.4097, + "grad_norm": 0.5521411717213294, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 40970 + }, + { + "epoch": 0.40971, + "grad_norm": 0.6757691613468655, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 40971 + }, + { + "epoch": 0.40972, + "grad_norm": 0.7719106469444069, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 40972 + }, + { + "epoch": 0.40973, + "grad_norm": 0.8655238430718235, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 40973 + }, + { + "epoch": 0.40974, + "grad_norm": 0.8793194891819346, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 40974 + }, + { + "epoch": 0.40975, + "grad_norm": 0.8557216100212356, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 40975 + }, + { + "epoch": 0.40976, + "grad_norm": 0.8660300787524305, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 40976 + }, + { + "epoch": 0.40977, + "grad_norm": 1.0513774590457738, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 40977 + }, + { + "epoch": 0.40978, + "grad_norm": 1.1456876048197866, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 40978 + }, + { + "epoch": 0.40979, + "grad_norm": 0.8877017688746748, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 40979 + }, + { + "epoch": 0.4098, + "grad_norm": 0.8802529901885124, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 40980 + }, + { + "epoch": 0.40981, + "grad_norm": 0.9091686050156909, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 40981 + }, + { + "epoch": 0.40982, + "grad_norm": 0.8992353079442572, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 40982 + }, + { + "epoch": 0.40983, + "grad_norm": 0.8323121218711984, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 40983 + }, + { + "epoch": 0.40984, + "grad_norm": 0.8008599327577688, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 40984 + }, + { + "epoch": 0.40985, + "grad_norm": 0.84816669319281, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 40985 + }, + { + "epoch": 0.40986, + "grad_norm": 0.7496771857924835, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 40986 + }, + { + "epoch": 0.40987, + "grad_norm": 0.7335927321298061, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 40987 + }, + { + "epoch": 0.40988, + "grad_norm": 0.7728693285472573, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 40988 + }, + { + "epoch": 0.40989, + "grad_norm": 0.793359313016997, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 40989 + }, + { + "epoch": 0.4099, + "grad_norm": 0.7629360943335045, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 40990 + }, + { + "epoch": 0.40991, + "grad_norm": 0.7939361051795056, + "learning_rate": 0.003, + "loss": 4.043, + "step": 40991 + }, + { + "epoch": 0.40992, + "grad_norm": 0.940247775687133, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 40992 + }, + { + "epoch": 0.40993, + "grad_norm": 0.9621590005498153, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 40993 + }, + { + "epoch": 0.40994, + "grad_norm": 0.9514371423440585, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 40994 + }, + { + "epoch": 0.40995, + "grad_norm": 0.8652758220195738, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 40995 + }, + { + "epoch": 0.40996, + "grad_norm": 0.7324202358621188, + "learning_rate": 0.003, + "loss": 4.0002, + "step": 40996 + }, + { + "epoch": 0.40997, + "grad_norm": 0.6639163011095354, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 40997 + }, + { + "epoch": 0.40998, + "grad_norm": 0.6181817530546684, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 40998 + }, + { + "epoch": 0.40999, + "grad_norm": 0.6101356170925898, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 40999 + }, + { + "epoch": 0.41, + "grad_norm": 0.6693325039850728, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 41000 + }, + { + "epoch": 0.41001, + "grad_norm": 0.8446679387463023, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 41001 + }, + { + "epoch": 0.41002, + "grad_norm": 1.0039343062661814, + "learning_rate": 0.003, + "loss": 3.996, + "step": 41002 + }, + { + "epoch": 0.41003, + "grad_norm": 1.1438521397175117, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 41003 + }, + { + "epoch": 0.41004, + "grad_norm": 0.8703842781304816, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 41004 + }, + { + "epoch": 0.41005, + "grad_norm": 0.6684261244804749, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 41005 + }, + { + "epoch": 0.41006, + "grad_norm": 0.5707461306083469, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 41006 + }, + { + "epoch": 0.41007, + "grad_norm": 0.6362136063529341, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 41007 + }, + { + "epoch": 0.41008, + "grad_norm": 0.7260652889945468, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 41008 + }, + { + "epoch": 0.41009, + "grad_norm": 0.7503214147907497, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 41009 + }, + { + "epoch": 0.4101, + "grad_norm": 0.8135711960260188, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 41010 + }, + { + "epoch": 0.41011, + "grad_norm": 0.9494507417026326, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 41011 + }, + { + "epoch": 0.41012, + "grad_norm": 0.9749710810146849, + "learning_rate": 0.003, + "loss": 4.046, + "step": 41012 + }, + { + "epoch": 0.41013, + "grad_norm": 1.0686843722444965, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 41013 + }, + { + "epoch": 0.41014, + "grad_norm": 0.9445318404960034, + "learning_rate": 0.003, + "loss": 4.019, + "step": 41014 + }, + { + "epoch": 0.41015, + "grad_norm": 1.0400499835830779, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 41015 + }, + { + "epoch": 0.41016, + "grad_norm": 1.072566530444389, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 41016 + }, + { + "epoch": 0.41017, + "grad_norm": 0.9514393952380165, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 41017 + }, + { + "epoch": 0.41018, + "grad_norm": 0.9252943094884505, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 41018 + }, + { + "epoch": 0.41019, + "grad_norm": 0.9149268878509048, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 41019 + }, + { + "epoch": 0.4102, + "grad_norm": 0.8910848245488123, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 41020 + }, + { + "epoch": 0.41021, + "grad_norm": 0.8567990676453459, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 41021 + }, + { + "epoch": 0.41022, + "grad_norm": 0.7182456592556503, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 41022 + }, + { + "epoch": 0.41023, + "grad_norm": 0.8777157539451741, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 41023 + }, + { + "epoch": 0.41024, + "grad_norm": 1.1783571467823881, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 41024 + }, + { + "epoch": 0.41025, + "grad_norm": 1.1075398968807568, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 41025 + }, + { + "epoch": 0.41026, + "grad_norm": 1.0451592696978402, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 41026 + }, + { + "epoch": 0.41027, + "grad_norm": 0.834872196933323, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 41027 + }, + { + "epoch": 0.41028, + "grad_norm": 0.7192436309124767, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 41028 + }, + { + "epoch": 0.41029, + "grad_norm": 0.6679449136653817, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 41029 + }, + { + "epoch": 0.4103, + "grad_norm": 0.7235221206233866, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 41030 + }, + { + "epoch": 0.41031, + "grad_norm": 0.6279887456001403, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 41031 + }, + { + "epoch": 0.41032, + "grad_norm": 0.6269266432127605, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 41032 + }, + { + "epoch": 0.41033, + "grad_norm": 0.7945313480711093, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 41033 + }, + { + "epoch": 0.41034, + "grad_norm": 0.9374856473643307, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 41034 + }, + { + "epoch": 0.41035, + "grad_norm": 1.068592261197841, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 41035 + }, + { + "epoch": 0.41036, + "grad_norm": 1.096018705886443, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 41036 + }, + { + "epoch": 0.41037, + "grad_norm": 0.8811345946596936, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 41037 + }, + { + "epoch": 0.41038, + "grad_norm": 0.6601236374822609, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 41038 + }, + { + "epoch": 0.41039, + "grad_norm": 0.6080652293995255, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 41039 + }, + { + "epoch": 0.4104, + "grad_norm": 0.5948962032782898, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 41040 + }, + { + "epoch": 0.41041, + "grad_norm": 0.560282865818051, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 41041 + }, + { + "epoch": 0.41042, + "grad_norm": 0.5121153190231453, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 41042 + }, + { + "epoch": 0.41043, + "grad_norm": 0.5870067515046341, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 41043 + }, + { + "epoch": 0.41044, + "grad_norm": 0.544924532177811, + "learning_rate": 0.003, + "loss": 3.9923, + "step": 41044 + }, + { + "epoch": 0.41045, + "grad_norm": 0.578989334294794, + "learning_rate": 0.003, + "loss": 4.029, + "step": 41045 + }, + { + "epoch": 0.41046, + "grad_norm": 0.5563988574104444, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 41046 + }, + { + "epoch": 0.41047, + "grad_norm": 0.5983792788166519, + "learning_rate": 0.003, + "loss": 4.031, + "step": 41047 + }, + { + "epoch": 0.41048, + "grad_norm": 0.685758932777021, + "learning_rate": 0.003, + "loss": 3.9887, + "step": 41048 + }, + { + "epoch": 0.41049, + "grad_norm": 0.7372871956282293, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 41049 + }, + { + "epoch": 0.4105, + "grad_norm": 0.8556900810569279, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 41050 + }, + { + "epoch": 0.41051, + "grad_norm": 1.0365196902986766, + "learning_rate": 0.003, + "loss": 3.9839, + "step": 41051 + }, + { + "epoch": 0.41052, + "grad_norm": 1.1619084862595705, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 41052 + }, + { + "epoch": 0.41053, + "grad_norm": 0.8769444062429467, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 41053 + }, + { + "epoch": 0.41054, + "grad_norm": 0.8045134498179467, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 41054 + }, + { + "epoch": 0.41055, + "grad_norm": 0.686816892256383, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 41055 + }, + { + "epoch": 0.41056, + "grad_norm": 0.7679893580795613, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 41056 + }, + { + "epoch": 0.41057, + "grad_norm": 0.7427863343848925, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 41057 + }, + { + "epoch": 0.41058, + "grad_norm": 0.9507285597020331, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 41058 + }, + { + "epoch": 0.41059, + "grad_norm": 0.9288879746300116, + "learning_rate": 0.003, + "loss": 4.051, + "step": 41059 + }, + { + "epoch": 0.4106, + "grad_norm": 0.7813314337484499, + "learning_rate": 0.003, + "loss": 3.9937, + "step": 41060 + }, + { + "epoch": 0.41061, + "grad_norm": 0.7827997465222403, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 41061 + }, + { + "epoch": 0.41062, + "grad_norm": 0.8675895695020414, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 41062 + }, + { + "epoch": 0.41063, + "grad_norm": 1.0363229492704569, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 41063 + }, + { + "epoch": 0.41064, + "grad_norm": 1.0125290961698497, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 41064 + }, + { + "epoch": 0.41065, + "grad_norm": 0.9131268694145436, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 41065 + }, + { + "epoch": 0.41066, + "grad_norm": 0.8859343794997379, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 41066 + }, + { + "epoch": 0.41067, + "grad_norm": 0.8915879232995241, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 41067 + }, + { + "epoch": 0.41068, + "grad_norm": 0.9312430013317544, + "learning_rate": 0.003, + "loss": 4.055, + "step": 41068 + }, + { + "epoch": 0.41069, + "grad_norm": 1.084532727177219, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 41069 + }, + { + "epoch": 0.4107, + "grad_norm": 1.0656511515726224, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 41070 + }, + { + "epoch": 0.41071, + "grad_norm": 1.0229379355013704, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 41071 + }, + { + "epoch": 0.41072, + "grad_norm": 1.0552666834326951, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 41072 + }, + { + "epoch": 0.41073, + "grad_norm": 0.7378894260417789, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 41073 + }, + { + "epoch": 0.41074, + "grad_norm": 0.6750966019133275, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 41074 + }, + { + "epoch": 0.41075, + "grad_norm": 0.7222422868892602, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 41075 + }, + { + "epoch": 0.41076, + "grad_norm": 0.7701486742833423, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 41076 + }, + { + "epoch": 0.41077, + "grad_norm": 0.7828937899073711, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 41077 + }, + { + "epoch": 0.41078, + "grad_norm": 0.7740662142500963, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 41078 + }, + { + "epoch": 0.41079, + "grad_norm": 0.8872406608693411, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 41079 + }, + { + "epoch": 0.4108, + "grad_norm": 1.034538306925756, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 41080 + }, + { + "epoch": 0.41081, + "grad_norm": 1.039177317304755, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 41081 + }, + { + "epoch": 0.41082, + "grad_norm": 0.975181536967628, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 41082 + }, + { + "epoch": 0.41083, + "grad_norm": 0.8798514707604411, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 41083 + }, + { + "epoch": 0.41084, + "grad_norm": 0.8897622137523187, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 41084 + }, + { + "epoch": 0.41085, + "grad_norm": 0.9132223808597066, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 41085 + }, + { + "epoch": 0.41086, + "grad_norm": 0.903939095468428, + "learning_rate": 0.003, + "loss": 4.023, + "step": 41086 + }, + { + "epoch": 0.41087, + "grad_norm": 0.7992537935001205, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 41087 + }, + { + "epoch": 0.41088, + "grad_norm": 0.6964515626885265, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 41088 + }, + { + "epoch": 0.41089, + "grad_norm": 0.671442156141881, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 41089 + }, + { + "epoch": 0.4109, + "grad_norm": 0.6788840244078225, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 41090 + }, + { + "epoch": 0.41091, + "grad_norm": 0.7382498182945886, + "learning_rate": 0.003, + "loss": 4.031, + "step": 41091 + }, + { + "epoch": 0.41092, + "grad_norm": 0.7811064862498814, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 41092 + }, + { + "epoch": 0.41093, + "grad_norm": 0.8254623807243268, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 41093 + }, + { + "epoch": 0.41094, + "grad_norm": 1.0358142357291067, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 41094 + }, + { + "epoch": 0.41095, + "grad_norm": 1.0792006548328472, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 41095 + }, + { + "epoch": 0.41096, + "grad_norm": 0.7788729027443094, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 41096 + }, + { + "epoch": 0.41097, + "grad_norm": 0.7855951372098939, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 41097 + }, + { + "epoch": 0.41098, + "grad_norm": 0.8052350806202465, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 41098 + }, + { + "epoch": 0.41099, + "grad_norm": 0.8693420717717928, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 41099 + }, + { + "epoch": 0.411, + "grad_norm": 0.7496148384163789, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 41100 + }, + { + "epoch": 0.41101, + "grad_norm": 0.7362610558396911, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 41101 + }, + { + "epoch": 0.41102, + "grad_norm": 0.8771234919429561, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 41102 + }, + { + "epoch": 0.41103, + "grad_norm": 1.0641535473835884, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 41103 + }, + { + "epoch": 0.41104, + "grad_norm": 0.993820039323082, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 41104 + }, + { + "epoch": 0.41105, + "grad_norm": 0.9150722621712903, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 41105 + }, + { + "epoch": 0.41106, + "grad_norm": 0.8452328915210968, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 41106 + }, + { + "epoch": 0.41107, + "grad_norm": 0.8338553341009218, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 41107 + }, + { + "epoch": 0.41108, + "grad_norm": 0.7741187416869341, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 41108 + }, + { + "epoch": 0.41109, + "grad_norm": 0.6837964886109514, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 41109 + }, + { + "epoch": 0.4111, + "grad_norm": 0.6665228149050002, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 41110 + }, + { + "epoch": 0.41111, + "grad_norm": 0.6410787192789139, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 41111 + }, + { + "epoch": 0.41112, + "grad_norm": 0.7401684043469544, + "learning_rate": 0.003, + "loss": 4.018, + "step": 41112 + }, + { + "epoch": 0.41113, + "grad_norm": 0.7662754290510961, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 41113 + }, + { + "epoch": 0.41114, + "grad_norm": 0.7770792082486152, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 41114 + }, + { + "epoch": 0.41115, + "grad_norm": 0.8090316189513748, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 41115 + }, + { + "epoch": 0.41116, + "grad_norm": 1.0210305609080228, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 41116 + }, + { + "epoch": 0.41117, + "grad_norm": 1.0034692851525102, + "learning_rate": 0.003, + "loss": 4.014, + "step": 41117 + }, + { + "epoch": 0.41118, + "grad_norm": 0.9121600672115379, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 41118 + }, + { + "epoch": 0.41119, + "grad_norm": 0.7652561150698047, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 41119 + }, + { + "epoch": 0.4112, + "grad_norm": 0.7556506045138908, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 41120 + }, + { + "epoch": 0.41121, + "grad_norm": 0.7113393138275675, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 41121 + }, + { + "epoch": 0.41122, + "grad_norm": 0.6710466804002306, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 41122 + }, + { + "epoch": 0.41123, + "grad_norm": 0.6893960072153362, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 41123 + }, + { + "epoch": 0.41124, + "grad_norm": 0.776771258407106, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 41124 + }, + { + "epoch": 0.41125, + "grad_norm": 0.8586182474207341, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 41125 + }, + { + "epoch": 0.41126, + "grad_norm": 0.8806278862188475, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 41126 + }, + { + "epoch": 0.41127, + "grad_norm": 0.8586650238439855, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 41127 + }, + { + "epoch": 0.41128, + "grad_norm": 0.8206669548462521, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 41128 + }, + { + "epoch": 0.41129, + "grad_norm": 0.950337208420504, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 41129 + }, + { + "epoch": 0.4113, + "grad_norm": 1.072752117487651, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 41130 + }, + { + "epoch": 0.41131, + "grad_norm": 1.1686201693141345, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 41131 + }, + { + "epoch": 0.41132, + "grad_norm": 0.7899868459780172, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 41132 + }, + { + "epoch": 0.41133, + "grad_norm": 0.7529383269353697, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 41133 + }, + { + "epoch": 0.41134, + "grad_norm": 0.7963543765833762, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 41134 + }, + { + "epoch": 0.41135, + "grad_norm": 0.9360995775168529, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 41135 + }, + { + "epoch": 0.41136, + "grad_norm": 1.1518479321244457, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 41136 + }, + { + "epoch": 0.41137, + "grad_norm": 0.8593987213080494, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 41137 + }, + { + "epoch": 0.41138, + "grad_norm": 0.9867944719080448, + "learning_rate": 0.003, + "loss": 4.0835, + "step": 41138 + }, + { + "epoch": 0.41139, + "grad_norm": 1.1112437454843087, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 41139 + }, + { + "epoch": 0.4114, + "grad_norm": 0.8854542668879101, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 41140 + }, + { + "epoch": 0.41141, + "grad_norm": 0.8666281686160724, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 41141 + }, + { + "epoch": 0.41142, + "grad_norm": 0.7801757584997883, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 41142 + }, + { + "epoch": 0.41143, + "grad_norm": 0.8421022576050862, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 41143 + }, + { + "epoch": 0.41144, + "grad_norm": 0.766108407443986, + "learning_rate": 0.003, + "loss": 3.9981, + "step": 41144 + }, + { + "epoch": 0.41145, + "grad_norm": 0.6500606437207092, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 41145 + }, + { + "epoch": 0.41146, + "grad_norm": 0.6100038990953279, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 41146 + }, + { + "epoch": 0.41147, + "grad_norm": 0.6370419591335661, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 41147 + }, + { + "epoch": 0.41148, + "grad_norm": 0.6646866949119945, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 41148 + }, + { + "epoch": 0.41149, + "grad_norm": 0.7199732940164044, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 41149 + }, + { + "epoch": 0.4115, + "grad_norm": 0.73688093188137, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 41150 + }, + { + "epoch": 0.41151, + "grad_norm": 0.7772695812603754, + "learning_rate": 0.003, + "loss": 4.033, + "step": 41151 + }, + { + "epoch": 0.41152, + "grad_norm": 0.8096550200658884, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 41152 + }, + { + "epoch": 0.41153, + "grad_norm": 0.954868647582412, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 41153 + }, + { + "epoch": 0.41154, + "grad_norm": 1.0914419954059096, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 41154 + }, + { + "epoch": 0.41155, + "grad_norm": 0.8292708969635442, + "learning_rate": 0.003, + "loss": 3.9843, + "step": 41155 + }, + { + "epoch": 0.41156, + "grad_norm": 0.6809461041840613, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 41156 + }, + { + "epoch": 0.41157, + "grad_norm": 0.6418225170873598, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 41157 + }, + { + "epoch": 0.41158, + "grad_norm": 0.7543353818283901, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 41158 + }, + { + "epoch": 0.41159, + "grad_norm": 0.8066478467820705, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 41159 + }, + { + "epoch": 0.4116, + "grad_norm": 0.9842478187260744, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 41160 + }, + { + "epoch": 0.41161, + "grad_norm": 1.1038711751876455, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 41161 + }, + { + "epoch": 0.41162, + "grad_norm": 0.8554913512796519, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 41162 + }, + { + "epoch": 0.41163, + "grad_norm": 0.837903935898597, + "learning_rate": 0.003, + "loss": 4.037, + "step": 41163 + }, + { + "epoch": 0.41164, + "grad_norm": 0.8233195842583741, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 41164 + }, + { + "epoch": 0.41165, + "grad_norm": 0.9244784258706805, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 41165 + }, + { + "epoch": 0.41166, + "grad_norm": 0.950247479337941, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 41166 + }, + { + "epoch": 0.41167, + "grad_norm": 0.968044453069044, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 41167 + }, + { + "epoch": 0.41168, + "grad_norm": 1.0655139330258447, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 41168 + }, + { + "epoch": 0.41169, + "grad_norm": 1.0082436548887015, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 41169 + }, + { + "epoch": 0.4117, + "grad_norm": 0.9940111053506717, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 41170 + }, + { + "epoch": 0.41171, + "grad_norm": 0.9686104600543033, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 41171 + }, + { + "epoch": 0.41172, + "grad_norm": 0.9372734210547266, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 41172 + }, + { + "epoch": 0.41173, + "grad_norm": 0.8978722464450931, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 41173 + }, + { + "epoch": 0.41174, + "grad_norm": 0.8589601793445771, + "learning_rate": 0.003, + "loss": 4.081, + "step": 41174 + }, + { + "epoch": 0.41175, + "grad_norm": 0.8384489936082264, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 41175 + }, + { + "epoch": 0.41176, + "grad_norm": 0.9004554173907618, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 41176 + }, + { + "epoch": 0.41177, + "grad_norm": 1.1377851722520873, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 41177 + }, + { + "epoch": 0.41178, + "grad_norm": 0.9878120471560768, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 41178 + }, + { + "epoch": 0.41179, + "grad_norm": 0.9391305154792782, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 41179 + }, + { + "epoch": 0.4118, + "grad_norm": 0.9609281175939257, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 41180 + }, + { + "epoch": 0.41181, + "grad_norm": 0.9123549931747948, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 41181 + }, + { + "epoch": 0.41182, + "grad_norm": 0.8662734947012057, + "learning_rate": 0.003, + "loss": 4.063, + "step": 41182 + }, + { + "epoch": 0.41183, + "grad_norm": 0.8985426495491737, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 41183 + }, + { + "epoch": 0.41184, + "grad_norm": 0.8905859217680677, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 41184 + }, + { + "epoch": 0.41185, + "grad_norm": 0.825180504082727, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 41185 + }, + { + "epoch": 0.41186, + "grad_norm": 0.74727394937028, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 41186 + }, + { + "epoch": 0.41187, + "grad_norm": 0.6454068427790368, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 41187 + }, + { + "epoch": 0.41188, + "grad_norm": 0.6110580063505217, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 41188 + }, + { + "epoch": 0.41189, + "grad_norm": 0.5205542432973683, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 41189 + }, + { + "epoch": 0.4119, + "grad_norm": 0.5187216544671066, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 41190 + }, + { + "epoch": 0.41191, + "grad_norm": 0.4880578552291109, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 41191 + }, + { + "epoch": 0.41192, + "grad_norm": 0.513328997907413, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 41192 + }, + { + "epoch": 0.41193, + "grad_norm": 0.48104854291875776, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 41193 + }, + { + "epoch": 0.41194, + "grad_norm": 0.4756983499216302, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 41194 + }, + { + "epoch": 0.41195, + "grad_norm": 0.5025079178558753, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 41195 + }, + { + "epoch": 0.41196, + "grad_norm": 0.48499747762828954, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 41196 + }, + { + "epoch": 0.41197, + "grad_norm": 0.5451622587693593, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 41197 + }, + { + "epoch": 0.41198, + "grad_norm": 0.584609073703484, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 41198 + }, + { + "epoch": 0.41199, + "grad_norm": 0.741261980878168, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 41199 + }, + { + "epoch": 0.412, + "grad_norm": 1.1329280011815568, + "learning_rate": 0.003, + "loss": 3.9947, + "step": 41200 + }, + { + "epoch": 0.41201, + "grad_norm": 1.0769010528912797, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 41201 + }, + { + "epoch": 0.41202, + "grad_norm": 0.8706715859969992, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 41202 + }, + { + "epoch": 0.41203, + "grad_norm": 0.8661479654814918, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 41203 + }, + { + "epoch": 0.41204, + "grad_norm": 0.8826196960242766, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 41204 + }, + { + "epoch": 0.41205, + "grad_norm": 0.9654918900645749, + "learning_rate": 0.003, + "loss": 3.993, + "step": 41205 + }, + { + "epoch": 0.41206, + "grad_norm": 0.9133709738291751, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 41206 + }, + { + "epoch": 0.41207, + "grad_norm": 0.9121300409333735, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 41207 + }, + { + "epoch": 0.41208, + "grad_norm": 1.0637843743867392, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 41208 + }, + { + "epoch": 0.41209, + "grad_norm": 0.9535375570101222, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 41209 + }, + { + "epoch": 0.4121, + "grad_norm": 1.0586725499815142, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 41210 + }, + { + "epoch": 0.41211, + "grad_norm": 1.0521436838656826, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 41211 + }, + { + "epoch": 0.41212, + "grad_norm": 0.9498715093506634, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 41212 + }, + { + "epoch": 0.41213, + "grad_norm": 0.9455137811857024, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 41213 + }, + { + "epoch": 0.41214, + "grad_norm": 0.8689945458200127, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 41214 + }, + { + "epoch": 0.41215, + "grad_norm": 0.9379067322299177, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 41215 + }, + { + "epoch": 0.41216, + "grad_norm": 0.9525587098854665, + "learning_rate": 0.003, + "loss": 4.0755, + "step": 41216 + }, + { + "epoch": 0.41217, + "grad_norm": 0.9562453744160846, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 41217 + }, + { + "epoch": 0.41218, + "grad_norm": 1.0357583105371035, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 41218 + }, + { + "epoch": 0.41219, + "grad_norm": 0.9819810455057847, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 41219 + }, + { + "epoch": 0.4122, + "grad_norm": 0.9250809940239145, + "learning_rate": 0.003, + "loss": 4.041, + "step": 41220 + }, + { + "epoch": 0.41221, + "grad_norm": 0.8400106560904625, + "learning_rate": 0.003, + "loss": 4.0941, + "step": 41221 + }, + { + "epoch": 0.41222, + "grad_norm": 0.6681574082002021, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 41222 + }, + { + "epoch": 0.41223, + "grad_norm": 0.6058477251615277, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 41223 + }, + { + "epoch": 0.41224, + "grad_norm": 0.6676846560969717, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 41224 + }, + { + "epoch": 0.41225, + "grad_norm": 0.7255089122526762, + "learning_rate": 0.003, + "loss": 4.007, + "step": 41225 + }, + { + "epoch": 0.41226, + "grad_norm": 0.729000592221114, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 41226 + }, + { + "epoch": 0.41227, + "grad_norm": 0.6914560876007164, + "learning_rate": 0.003, + "loss": 4.017, + "step": 41227 + }, + { + "epoch": 0.41228, + "grad_norm": 0.6586340751746034, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 41228 + }, + { + "epoch": 0.41229, + "grad_norm": 0.6428003521259569, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 41229 + }, + { + "epoch": 0.4123, + "grad_norm": 0.6889168272361788, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 41230 + }, + { + "epoch": 0.41231, + "grad_norm": 0.7893271434383877, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 41231 + }, + { + "epoch": 0.41232, + "grad_norm": 0.8817717203710609, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 41232 + }, + { + "epoch": 0.41233, + "grad_norm": 0.9336927708581558, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 41233 + }, + { + "epoch": 0.41234, + "grad_norm": 0.9017998061757619, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 41234 + }, + { + "epoch": 0.41235, + "grad_norm": 0.7741404421162533, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 41235 + }, + { + "epoch": 0.41236, + "grad_norm": 0.7323339978729703, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 41236 + }, + { + "epoch": 0.41237, + "grad_norm": 0.7130072237296119, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 41237 + }, + { + "epoch": 0.41238, + "grad_norm": 0.751674834302372, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 41238 + }, + { + "epoch": 0.41239, + "grad_norm": 0.7684622764004859, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 41239 + }, + { + "epoch": 0.4124, + "grad_norm": 0.7936134002118607, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 41240 + }, + { + "epoch": 0.41241, + "grad_norm": 0.8167947369620491, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 41241 + }, + { + "epoch": 0.41242, + "grad_norm": 0.8473059404839743, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 41242 + }, + { + "epoch": 0.41243, + "grad_norm": 1.0089639494122637, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 41243 + }, + { + "epoch": 0.41244, + "grad_norm": 1.1240384333476736, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 41244 + }, + { + "epoch": 0.41245, + "grad_norm": 0.9789491640329714, + "learning_rate": 0.003, + "loss": 4.025, + "step": 41245 + }, + { + "epoch": 0.41246, + "grad_norm": 0.8781129926263445, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 41246 + }, + { + "epoch": 0.41247, + "grad_norm": 0.7209381406363374, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 41247 + }, + { + "epoch": 0.41248, + "grad_norm": 0.6972246477728092, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 41248 + }, + { + "epoch": 0.41249, + "grad_norm": 0.6940285826731691, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 41249 + }, + { + "epoch": 0.4125, + "grad_norm": 0.6800942958422977, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 41250 + }, + { + "epoch": 0.41251, + "grad_norm": 0.6239179430185843, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 41251 + }, + { + "epoch": 0.41252, + "grad_norm": 0.6564936937702064, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 41252 + }, + { + "epoch": 0.41253, + "grad_norm": 0.6679894526455364, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 41253 + }, + { + "epoch": 0.41254, + "grad_norm": 0.6588254443906326, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 41254 + }, + { + "epoch": 0.41255, + "grad_norm": 0.7036882479251749, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 41255 + }, + { + "epoch": 0.41256, + "grad_norm": 0.7980643300221129, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 41256 + }, + { + "epoch": 0.41257, + "grad_norm": 0.8794523969713947, + "learning_rate": 0.003, + "loss": 3.9927, + "step": 41257 + }, + { + "epoch": 0.41258, + "grad_norm": 0.9994456114373321, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 41258 + }, + { + "epoch": 0.41259, + "grad_norm": 1.0466151160179662, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 41259 + }, + { + "epoch": 0.4126, + "grad_norm": 1.061039722393568, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 41260 + }, + { + "epoch": 0.41261, + "grad_norm": 1.0754654241669395, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 41261 + }, + { + "epoch": 0.41262, + "grad_norm": 0.9789360745023993, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 41262 + }, + { + "epoch": 0.41263, + "grad_norm": 0.9589936563127959, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 41263 + }, + { + "epoch": 0.41264, + "grad_norm": 0.8274427923852596, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 41264 + }, + { + "epoch": 0.41265, + "grad_norm": 0.8687113545001337, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 41265 + }, + { + "epoch": 0.41266, + "grad_norm": 1.0891446005019698, + "learning_rate": 0.003, + "loss": 4.052, + "step": 41266 + }, + { + "epoch": 0.41267, + "grad_norm": 0.9127473804453109, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 41267 + }, + { + "epoch": 0.41268, + "grad_norm": 0.7740909535983451, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 41268 + }, + { + "epoch": 0.41269, + "grad_norm": 0.7758378189201449, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 41269 + }, + { + "epoch": 0.4127, + "grad_norm": 0.7256950098279239, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 41270 + }, + { + "epoch": 0.41271, + "grad_norm": 0.7080212937711433, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 41271 + }, + { + "epoch": 0.41272, + "grad_norm": 0.8323016781084036, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 41272 + }, + { + "epoch": 0.41273, + "grad_norm": 0.940292425123799, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 41273 + }, + { + "epoch": 0.41274, + "grad_norm": 1.0983475864188499, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 41274 + }, + { + "epoch": 0.41275, + "grad_norm": 1.0210780657090908, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 41275 + }, + { + "epoch": 0.41276, + "grad_norm": 0.8540758719463502, + "learning_rate": 0.003, + "loss": 4.0628, + "step": 41276 + }, + { + "epoch": 0.41277, + "grad_norm": 0.673994802602468, + "learning_rate": 0.003, + "loss": 4.009, + "step": 41277 + }, + { + "epoch": 0.41278, + "grad_norm": 0.7237302897870851, + "learning_rate": 0.003, + "loss": 4.01, + "step": 41278 + }, + { + "epoch": 0.41279, + "grad_norm": 0.6668353383099072, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 41279 + }, + { + "epoch": 0.4128, + "grad_norm": 0.7198668910122107, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 41280 + }, + { + "epoch": 0.41281, + "grad_norm": 0.8330026367866864, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 41281 + }, + { + "epoch": 0.41282, + "grad_norm": 0.8810673020695089, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 41282 + }, + { + "epoch": 0.41283, + "grad_norm": 0.9680236058547859, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 41283 + }, + { + "epoch": 0.41284, + "grad_norm": 0.9770874963925197, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 41284 + }, + { + "epoch": 0.41285, + "grad_norm": 0.8855165389004482, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 41285 + }, + { + "epoch": 0.41286, + "grad_norm": 0.7950263440832258, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 41286 + }, + { + "epoch": 0.41287, + "grad_norm": 0.6711980334593222, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 41287 + }, + { + "epoch": 0.41288, + "grad_norm": 0.6537443384124156, + "learning_rate": 0.003, + "loss": 4.063, + "step": 41288 + }, + { + "epoch": 0.41289, + "grad_norm": 0.7471076036719541, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 41289 + }, + { + "epoch": 0.4129, + "grad_norm": 0.7337305737316917, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 41290 + }, + { + "epoch": 0.41291, + "grad_norm": 0.676632465230445, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 41291 + }, + { + "epoch": 0.41292, + "grad_norm": 0.6468216292565117, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 41292 + }, + { + "epoch": 0.41293, + "grad_norm": 0.7352644306382258, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 41293 + }, + { + "epoch": 0.41294, + "grad_norm": 0.7776649255787152, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 41294 + }, + { + "epoch": 0.41295, + "grad_norm": 0.7884955642521218, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 41295 + }, + { + "epoch": 0.41296, + "grad_norm": 0.6509971771002545, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 41296 + }, + { + "epoch": 0.41297, + "grad_norm": 0.5558002691046035, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 41297 + }, + { + "epoch": 0.41298, + "grad_norm": 0.6310705944348001, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 41298 + }, + { + "epoch": 0.41299, + "grad_norm": 0.7041623851523033, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 41299 + }, + { + "epoch": 0.413, + "grad_norm": 0.7907735706507346, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 41300 + }, + { + "epoch": 0.41301, + "grad_norm": 1.0417358546877853, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 41301 + }, + { + "epoch": 0.41302, + "grad_norm": 1.3193939890003257, + "learning_rate": 0.003, + "loss": 4.018, + "step": 41302 + }, + { + "epoch": 0.41303, + "grad_norm": 0.6940079240108473, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 41303 + }, + { + "epoch": 0.41304, + "grad_norm": 0.7490979599383891, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 41304 + }, + { + "epoch": 0.41305, + "grad_norm": 0.8143645874417587, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 41305 + }, + { + "epoch": 0.41306, + "grad_norm": 0.8279990621327098, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 41306 + }, + { + "epoch": 0.41307, + "grad_norm": 1.0378579505295202, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 41307 + }, + { + "epoch": 0.41308, + "grad_norm": 0.9910898510542322, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 41308 + }, + { + "epoch": 0.41309, + "grad_norm": 0.8738488651790394, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 41309 + }, + { + "epoch": 0.4131, + "grad_norm": 0.7693741003272365, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 41310 + }, + { + "epoch": 0.41311, + "grad_norm": 0.7688245965536484, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 41311 + }, + { + "epoch": 0.41312, + "grad_norm": 0.7443084606890495, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 41312 + }, + { + "epoch": 0.41313, + "grad_norm": 0.6911778074378494, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 41313 + }, + { + "epoch": 0.41314, + "grad_norm": 0.6373261653012805, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 41314 + }, + { + "epoch": 0.41315, + "grad_norm": 0.7978014136170231, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 41315 + }, + { + "epoch": 0.41316, + "grad_norm": 1.0402195595230663, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 41316 + }, + { + "epoch": 0.41317, + "grad_norm": 1.083027574665018, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 41317 + }, + { + "epoch": 0.41318, + "grad_norm": 0.9182980116661682, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 41318 + }, + { + "epoch": 0.41319, + "grad_norm": 0.8143445402735738, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 41319 + }, + { + "epoch": 0.4132, + "grad_norm": 0.7926637930801441, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 41320 + }, + { + "epoch": 0.41321, + "grad_norm": 0.7703931486830056, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 41321 + }, + { + "epoch": 0.41322, + "grad_norm": 0.7991446887266185, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 41322 + }, + { + "epoch": 0.41323, + "grad_norm": 0.8616788279582379, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 41323 + }, + { + "epoch": 0.41324, + "grad_norm": 0.9018580748326572, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 41324 + }, + { + "epoch": 0.41325, + "grad_norm": 0.9453974894552397, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 41325 + }, + { + "epoch": 0.41326, + "grad_norm": 0.9148267893477126, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 41326 + }, + { + "epoch": 0.41327, + "grad_norm": 0.9470399885572501, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 41327 + }, + { + "epoch": 0.41328, + "grad_norm": 1.0850215578236762, + "learning_rate": 0.003, + "loss": 4.0717, + "step": 41328 + }, + { + "epoch": 0.41329, + "grad_norm": 0.8871677122717544, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 41329 + }, + { + "epoch": 0.4133, + "grad_norm": 0.834973126878877, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 41330 + }, + { + "epoch": 0.41331, + "grad_norm": 0.7959029675214334, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 41331 + }, + { + "epoch": 0.41332, + "grad_norm": 0.8895681785956633, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 41332 + }, + { + "epoch": 0.41333, + "grad_norm": 1.151558607026426, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 41333 + }, + { + "epoch": 0.41334, + "grad_norm": 0.997539022376843, + "learning_rate": 0.003, + "loss": 4.03, + "step": 41334 + }, + { + "epoch": 0.41335, + "grad_norm": 1.0502918932525993, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 41335 + }, + { + "epoch": 0.41336, + "grad_norm": 0.9094377184385034, + "learning_rate": 0.003, + "loss": 4.045, + "step": 41336 + }, + { + "epoch": 0.41337, + "grad_norm": 0.937454105925035, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 41337 + }, + { + "epoch": 0.41338, + "grad_norm": 0.930580903448127, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 41338 + }, + { + "epoch": 0.41339, + "grad_norm": 0.999562778890824, + "learning_rate": 0.003, + "loss": 4.053, + "step": 41339 + }, + { + "epoch": 0.4134, + "grad_norm": 0.9284296027651691, + "learning_rate": 0.003, + "loss": 4.053, + "step": 41340 + }, + { + "epoch": 0.41341, + "grad_norm": 0.8709001914815626, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 41341 + }, + { + "epoch": 0.41342, + "grad_norm": 0.8206479836059507, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 41342 + }, + { + "epoch": 0.41343, + "grad_norm": 0.8236615842509429, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 41343 + }, + { + "epoch": 0.41344, + "grad_norm": 0.9925175423169267, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 41344 + }, + { + "epoch": 0.41345, + "grad_norm": 1.1698041175827745, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 41345 + }, + { + "epoch": 0.41346, + "grad_norm": 0.8265568875075868, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 41346 + }, + { + "epoch": 0.41347, + "grad_norm": 0.8199170229830811, + "learning_rate": 0.003, + "loss": 4.051, + "step": 41347 + }, + { + "epoch": 0.41348, + "grad_norm": 0.9829900971384059, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 41348 + }, + { + "epoch": 0.41349, + "grad_norm": 1.0880162504354471, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 41349 + }, + { + "epoch": 0.4135, + "grad_norm": 0.8005528068085147, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 41350 + }, + { + "epoch": 0.41351, + "grad_norm": 0.7206247713934685, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 41351 + }, + { + "epoch": 0.41352, + "grad_norm": 0.619188932294807, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 41352 + }, + { + "epoch": 0.41353, + "grad_norm": 0.6827542415942438, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 41353 + }, + { + "epoch": 0.41354, + "grad_norm": 0.7068033855264231, + "learning_rate": 0.003, + "loss": 4.048, + "step": 41354 + }, + { + "epoch": 0.41355, + "grad_norm": 0.6433627868830774, + "learning_rate": 0.003, + "loss": 4.034, + "step": 41355 + }, + { + "epoch": 0.41356, + "grad_norm": 0.5773718122245163, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 41356 + }, + { + "epoch": 0.41357, + "grad_norm": 0.4159590995483678, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 41357 + }, + { + "epoch": 0.41358, + "grad_norm": 0.4523264361274608, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 41358 + }, + { + "epoch": 0.41359, + "grad_norm": 0.45452591202403786, + "learning_rate": 0.003, + "loss": 3.9939, + "step": 41359 + }, + { + "epoch": 0.4136, + "grad_norm": 0.47780655381671067, + "learning_rate": 0.003, + "loss": 4.02, + "step": 41360 + }, + { + "epoch": 0.41361, + "grad_norm": 0.4392174452709152, + "learning_rate": 0.003, + "loss": 3.9944, + "step": 41361 + }, + { + "epoch": 0.41362, + "grad_norm": 0.4725685325568285, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 41362 + }, + { + "epoch": 0.41363, + "grad_norm": 0.52163604867902, + "learning_rate": 0.003, + "loss": 3.9999, + "step": 41363 + }, + { + "epoch": 0.41364, + "grad_norm": 0.6037313566035155, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 41364 + }, + { + "epoch": 0.41365, + "grad_norm": 0.7519846649507631, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 41365 + }, + { + "epoch": 0.41366, + "grad_norm": 1.0130563622314288, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 41366 + }, + { + "epoch": 0.41367, + "grad_norm": 1.3910916384124363, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 41367 + }, + { + "epoch": 0.41368, + "grad_norm": 0.7320372806271568, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 41368 + }, + { + "epoch": 0.41369, + "grad_norm": 0.7390122891820392, + "learning_rate": 0.003, + "loss": 4.058, + "step": 41369 + }, + { + "epoch": 0.4137, + "grad_norm": 0.7397419198075607, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 41370 + }, + { + "epoch": 0.41371, + "grad_norm": 0.665491876221156, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 41371 + }, + { + "epoch": 0.41372, + "grad_norm": 0.7276719856841055, + "learning_rate": 0.003, + "loss": 4.035, + "step": 41372 + }, + { + "epoch": 0.41373, + "grad_norm": 0.6972597199028336, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 41373 + }, + { + "epoch": 0.41374, + "grad_norm": 0.6408751068037991, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 41374 + }, + { + "epoch": 0.41375, + "grad_norm": 0.6228193017204248, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 41375 + }, + { + "epoch": 0.41376, + "grad_norm": 0.7200008603565924, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 41376 + }, + { + "epoch": 0.41377, + "grad_norm": 0.9188443197027665, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 41377 + }, + { + "epoch": 0.41378, + "grad_norm": 1.3228165230045725, + "learning_rate": 0.003, + "loss": 3.9898, + "step": 41378 + }, + { + "epoch": 0.41379, + "grad_norm": 0.7142846153264542, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 41379 + }, + { + "epoch": 0.4138, + "grad_norm": 0.7133459172924008, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 41380 + }, + { + "epoch": 0.41381, + "grad_norm": 0.7359537063034953, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 41381 + }, + { + "epoch": 0.41382, + "grad_norm": 0.9140592243369695, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 41382 + }, + { + "epoch": 0.41383, + "grad_norm": 1.1438959252815026, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 41383 + }, + { + "epoch": 0.41384, + "grad_norm": 0.8518943135076035, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 41384 + }, + { + "epoch": 0.41385, + "grad_norm": 0.8781888851958203, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 41385 + }, + { + "epoch": 0.41386, + "grad_norm": 0.8755363909607266, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 41386 + }, + { + "epoch": 0.41387, + "grad_norm": 0.8371442544427347, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 41387 + }, + { + "epoch": 0.41388, + "grad_norm": 0.9328804965224667, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 41388 + }, + { + "epoch": 0.41389, + "grad_norm": 0.8880443335186358, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 41389 + }, + { + "epoch": 0.4139, + "grad_norm": 0.6604881242953033, + "learning_rate": 0.003, + "loss": 3.9946, + "step": 41390 + }, + { + "epoch": 0.41391, + "grad_norm": 0.6922583453007157, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 41391 + }, + { + "epoch": 0.41392, + "grad_norm": 0.8717097687357922, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 41392 + }, + { + "epoch": 0.41393, + "grad_norm": 1.2398304182136561, + "learning_rate": 0.003, + "loss": 4.017, + "step": 41393 + }, + { + "epoch": 0.41394, + "grad_norm": 1.03372771183718, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 41394 + }, + { + "epoch": 0.41395, + "grad_norm": 0.9105054426112892, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 41395 + }, + { + "epoch": 0.41396, + "grad_norm": 0.7700163557305122, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 41396 + }, + { + "epoch": 0.41397, + "grad_norm": 0.7364745940526092, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 41397 + }, + { + "epoch": 0.41398, + "grad_norm": 0.7409647187602777, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 41398 + }, + { + "epoch": 0.41399, + "grad_norm": 0.7228210321951648, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 41399 + }, + { + "epoch": 0.414, + "grad_norm": 0.7091282863737521, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 41400 + }, + { + "epoch": 0.41401, + "grad_norm": 0.7232169629828961, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 41401 + }, + { + "epoch": 0.41402, + "grad_norm": 0.694181076951566, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 41402 + }, + { + "epoch": 0.41403, + "grad_norm": 0.6770361841265067, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 41403 + }, + { + "epoch": 0.41404, + "grad_norm": 0.6925460334678695, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 41404 + }, + { + "epoch": 0.41405, + "grad_norm": 0.7604479849775193, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 41405 + }, + { + "epoch": 0.41406, + "grad_norm": 0.8731837543797286, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 41406 + }, + { + "epoch": 0.41407, + "grad_norm": 0.9094280094225926, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 41407 + }, + { + "epoch": 0.41408, + "grad_norm": 0.9680788727095944, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 41408 + }, + { + "epoch": 0.41409, + "grad_norm": 1.0232220397184133, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 41409 + }, + { + "epoch": 0.4141, + "grad_norm": 1.009489757822601, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 41410 + }, + { + "epoch": 0.41411, + "grad_norm": 0.9409780518340604, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 41411 + }, + { + "epoch": 0.41412, + "grad_norm": 0.8489514520451032, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 41412 + }, + { + "epoch": 0.41413, + "grad_norm": 0.8965872109404871, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 41413 + }, + { + "epoch": 0.41414, + "grad_norm": 0.8526929882654279, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 41414 + }, + { + "epoch": 0.41415, + "grad_norm": 0.8836484899261374, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 41415 + }, + { + "epoch": 0.41416, + "grad_norm": 0.8166540159632327, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 41416 + }, + { + "epoch": 0.41417, + "grad_norm": 0.8244373654514362, + "learning_rate": 0.003, + "loss": 3.9939, + "step": 41417 + }, + { + "epoch": 0.41418, + "grad_norm": 0.9738483174624533, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 41418 + }, + { + "epoch": 0.41419, + "grad_norm": 1.0053258338272903, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 41419 + }, + { + "epoch": 0.4142, + "grad_norm": 0.8571733937326094, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 41420 + }, + { + "epoch": 0.41421, + "grad_norm": 0.751462824888374, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 41421 + }, + { + "epoch": 0.41422, + "grad_norm": 0.725904801709252, + "learning_rate": 0.003, + "loss": 4.05, + "step": 41422 + }, + { + "epoch": 0.41423, + "grad_norm": 0.7788169081975169, + "learning_rate": 0.003, + "loss": 4.034, + "step": 41423 + }, + { + "epoch": 0.41424, + "grad_norm": 0.854104155919683, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 41424 + }, + { + "epoch": 0.41425, + "grad_norm": 0.8723708681926248, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 41425 + }, + { + "epoch": 0.41426, + "grad_norm": 0.9145669663482778, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 41426 + }, + { + "epoch": 0.41427, + "grad_norm": 0.949187431128801, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 41427 + }, + { + "epoch": 0.41428, + "grad_norm": 0.9877791326416386, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 41428 + }, + { + "epoch": 0.41429, + "grad_norm": 1.0992298181493134, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 41429 + }, + { + "epoch": 0.4143, + "grad_norm": 0.9372625339056244, + "learning_rate": 0.003, + "loss": 4.047, + "step": 41430 + }, + { + "epoch": 0.41431, + "grad_norm": 0.8331157837865366, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 41431 + }, + { + "epoch": 0.41432, + "grad_norm": 0.8712198274964325, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 41432 + }, + { + "epoch": 0.41433, + "grad_norm": 0.9110725586543761, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 41433 + }, + { + "epoch": 0.41434, + "grad_norm": 0.9695695174419623, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 41434 + }, + { + "epoch": 0.41435, + "grad_norm": 1.163328898035114, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 41435 + }, + { + "epoch": 0.41436, + "grad_norm": 0.9365547145591686, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 41436 + }, + { + "epoch": 0.41437, + "grad_norm": 0.8454390097759044, + "learning_rate": 0.003, + "loss": 3.9858, + "step": 41437 + }, + { + "epoch": 0.41438, + "grad_norm": 0.838599782245428, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 41438 + }, + { + "epoch": 0.41439, + "grad_norm": 0.7911197917172949, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 41439 + }, + { + "epoch": 0.4144, + "grad_norm": 0.9914025852080218, + "learning_rate": 0.003, + "loss": 4.028, + "step": 41440 + }, + { + "epoch": 0.41441, + "grad_norm": 1.0772392138550606, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 41441 + }, + { + "epoch": 0.41442, + "grad_norm": 0.7826180690515561, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 41442 + }, + { + "epoch": 0.41443, + "grad_norm": 0.9565150276894334, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 41443 + }, + { + "epoch": 0.41444, + "grad_norm": 1.0901982409454976, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 41444 + }, + { + "epoch": 0.41445, + "grad_norm": 0.9240811053827725, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 41445 + }, + { + "epoch": 0.41446, + "grad_norm": 1.0383891719176181, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 41446 + }, + { + "epoch": 0.41447, + "grad_norm": 0.9744863881652345, + "learning_rate": 0.003, + "loss": 4.014, + "step": 41447 + }, + { + "epoch": 0.41448, + "grad_norm": 1.074477343208737, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 41448 + }, + { + "epoch": 0.41449, + "grad_norm": 0.9376749632011739, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 41449 + }, + { + "epoch": 0.4145, + "grad_norm": 0.9246847346396289, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 41450 + }, + { + "epoch": 0.41451, + "grad_norm": 0.7968405842223963, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 41451 + }, + { + "epoch": 0.41452, + "grad_norm": 0.7532826152775823, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 41452 + }, + { + "epoch": 0.41453, + "grad_norm": 0.7589782960922948, + "learning_rate": 0.003, + "loss": 4.045, + "step": 41453 + }, + { + "epoch": 0.41454, + "grad_norm": 0.7327159916346836, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 41454 + }, + { + "epoch": 0.41455, + "grad_norm": 0.8074038211708423, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 41455 + }, + { + "epoch": 0.41456, + "grad_norm": 0.8659352372014212, + "learning_rate": 0.003, + "loss": 4.017, + "step": 41456 + }, + { + "epoch": 0.41457, + "grad_norm": 0.8810144353956164, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 41457 + }, + { + "epoch": 0.41458, + "grad_norm": 0.8096588351295235, + "learning_rate": 0.003, + "loss": 4.055, + "step": 41458 + }, + { + "epoch": 0.41459, + "grad_norm": 0.6562540235775565, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 41459 + }, + { + "epoch": 0.4146, + "grad_norm": 0.5965056147527612, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 41460 + }, + { + "epoch": 0.41461, + "grad_norm": 0.579814711376661, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 41461 + }, + { + "epoch": 0.41462, + "grad_norm": 0.5806039272005842, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 41462 + }, + { + "epoch": 0.41463, + "grad_norm": 0.48199125263490017, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 41463 + }, + { + "epoch": 0.41464, + "grad_norm": 0.4590663804974862, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 41464 + }, + { + "epoch": 0.41465, + "grad_norm": 0.49979184381555675, + "learning_rate": 0.003, + "loss": 4.033, + "step": 41465 + }, + { + "epoch": 0.41466, + "grad_norm": 0.6323536413580602, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 41466 + }, + { + "epoch": 0.41467, + "grad_norm": 0.9188186925843489, + "learning_rate": 0.003, + "loss": 4.012, + "step": 41467 + }, + { + "epoch": 0.41468, + "grad_norm": 1.3596705324268519, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 41468 + }, + { + "epoch": 0.41469, + "grad_norm": 0.6925802542896621, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 41469 + }, + { + "epoch": 0.4147, + "grad_norm": 0.6271733315108431, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 41470 + }, + { + "epoch": 0.41471, + "grad_norm": 0.6726279816670403, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 41471 + }, + { + "epoch": 0.41472, + "grad_norm": 0.6221141814175618, + "learning_rate": 0.003, + "loss": 4.056, + "step": 41472 + }, + { + "epoch": 0.41473, + "grad_norm": 0.632109509095257, + "learning_rate": 0.003, + "loss": 4.031, + "step": 41473 + }, + { + "epoch": 0.41474, + "grad_norm": 0.657191363705597, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 41474 + }, + { + "epoch": 0.41475, + "grad_norm": 0.8078498868085618, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 41475 + }, + { + "epoch": 0.41476, + "grad_norm": 1.1878358040116714, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 41476 + }, + { + "epoch": 0.41477, + "grad_norm": 1.0727409508030163, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 41477 + }, + { + "epoch": 0.41478, + "grad_norm": 0.9015109160901638, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 41478 + }, + { + "epoch": 0.41479, + "grad_norm": 0.8456982627778736, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 41479 + }, + { + "epoch": 0.4148, + "grad_norm": 0.76473117759736, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 41480 + }, + { + "epoch": 0.41481, + "grad_norm": 0.875391382331769, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 41481 + }, + { + "epoch": 0.41482, + "grad_norm": 0.9529710405119013, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 41482 + }, + { + "epoch": 0.41483, + "grad_norm": 0.969978656703324, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 41483 + }, + { + "epoch": 0.41484, + "grad_norm": 0.9466457529132982, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 41484 + }, + { + "epoch": 0.41485, + "grad_norm": 0.9784250229919834, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 41485 + }, + { + "epoch": 0.41486, + "grad_norm": 0.9654206059505329, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 41486 + }, + { + "epoch": 0.41487, + "grad_norm": 0.8905018881096002, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 41487 + }, + { + "epoch": 0.41488, + "grad_norm": 0.8921685494017483, + "learning_rate": 0.003, + "loss": 4.0579, + "step": 41488 + }, + { + "epoch": 0.41489, + "grad_norm": 0.9707967393972621, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 41489 + }, + { + "epoch": 0.4149, + "grad_norm": 1.010944863955838, + "learning_rate": 0.003, + "loss": 3.9967, + "step": 41490 + }, + { + "epoch": 0.41491, + "grad_norm": 0.8283364303151833, + "learning_rate": 0.003, + "loss": 4.0848, + "step": 41491 + }, + { + "epoch": 0.41492, + "grad_norm": 0.8338631592742319, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 41492 + }, + { + "epoch": 0.41493, + "grad_norm": 0.8048208339768631, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 41493 + }, + { + "epoch": 0.41494, + "grad_norm": 0.6605879800198348, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 41494 + }, + { + "epoch": 0.41495, + "grad_norm": 0.6652392868149544, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 41495 + }, + { + "epoch": 0.41496, + "grad_norm": 0.7018306400569204, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 41496 + }, + { + "epoch": 0.41497, + "grad_norm": 0.857592952051622, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 41497 + }, + { + "epoch": 0.41498, + "grad_norm": 1.0429639351591264, + "learning_rate": 0.003, + "loss": 4.0692, + "step": 41498 + }, + { + "epoch": 0.41499, + "grad_norm": 0.9927107533826309, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 41499 + }, + { + "epoch": 0.415, + "grad_norm": 0.9713968499290242, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 41500 + }, + { + "epoch": 0.41501, + "grad_norm": 0.9561055120679415, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 41501 + }, + { + "epoch": 0.41502, + "grad_norm": 1.1479659386953494, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 41502 + }, + { + "epoch": 0.41503, + "grad_norm": 1.150773678178684, + "learning_rate": 0.003, + "loss": 4.0723, + "step": 41503 + }, + { + "epoch": 0.41504, + "grad_norm": 0.8298429686043235, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 41504 + }, + { + "epoch": 0.41505, + "grad_norm": 0.718453832519022, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 41505 + }, + { + "epoch": 0.41506, + "grad_norm": 0.6898908821534957, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 41506 + }, + { + "epoch": 0.41507, + "grad_norm": 0.6181262414835715, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 41507 + }, + { + "epoch": 0.41508, + "grad_norm": 0.6571673820084992, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 41508 + }, + { + "epoch": 0.41509, + "grad_norm": 0.6066963851205339, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 41509 + }, + { + "epoch": 0.4151, + "grad_norm": 0.5843090478516818, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 41510 + }, + { + "epoch": 0.41511, + "grad_norm": 0.5824025251627233, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 41511 + }, + { + "epoch": 0.41512, + "grad_norm": 0.5944311433206649, + "learning_rate": 0.003, + "loss": 3.9971, + "step": 41512 + }, + { + "epoch": 0.41513, + "grad_norm": 0.5572400825537737, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 41513 + }, + { + "epoch": 0.41514, + "grad_norm": 0.6625620953929705, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 41514 + }, + { + "epoch": 0.41515, + "grad_norm": 0.6994200899231047, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 41515 + }, + { + "epoch": 0.41516, + "grad_norm": 0.6229136001858052, + "learning_rate": 0.003, + "loss": 3.996, + "step": 41516 + }, + { + "epoch": 0.41517, + "grad_norm": 0.6672778597744438, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 41517 + }, + { + "epoch": 0.41518, + "grad_norm": 0.7973635092930972, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 41518 + }, + { + "epoch": 0.41519, + "grad_norm": 1.1584103720113446, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 41519 + }, + { + "epoch": 0.4152, + "grad_norm": 1.1128887299946162, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 41520 + }, + { + "epoch": 0.41521, + "grad_norm": 1.004508243774309, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 41521 + }, + { + "epoch": 0.41522, + "grad_norm": 0.9189550683861722, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 41522 + }, + { + "epoch": 0.41523, + "grad_norm": 0.8551354596611043, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 41523 + }, + { + "epoch": 0.41524, + "grad_norm": 0.7419475158682767, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 41524 + }, + { + "epoch": 0.41525, + "grad_norm": 0.822126425651961, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 41525 + }, + { + "epoch": 0.41526, + "grad_norm": 0.8098636258001561, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 41526 + }, + { + "epoch": 0.41527, + "grad_norm": 0.7301710893330292, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 41527 + }, + { + "epoch": 0.41528, + "grad_norm": 0.8381775104523683, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 41528 + }, + { + "epoch": 0.41529, + "grad_norm": 0.9426755471192173, + "learning_rate": 0.003, + "loss": 3.9938, + "step": 41529 + }, + { + "epoch": 0.4153, + "grad_norm": 1.052677916964077, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 41530 + }, + { + "epoch": 0.41531, + "grad_norm": 0.9293614262256498, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 41531 + }, + { + "epoch": 0.41532, + "grad_norm": 0.8172754297992537, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 41532 + }, + { + "epoch": 0.41533, + "grad_norm": 0.7704171494205732, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 41533 + }, + { + "epoch": 0.41534, + "grad_norm": 0.7716447494953421, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 41534 + }, + { + "epoch": 0.41535, + "grad_norm": 0.7883970444158729, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 41535 + }, + { + "epoch": 0.41536, + "grad_norm": 0.7545487304916593, + "learning_rate": 0.003, + "loss": 4.037, + "step": 41536 + }, + { + "epoch": 0.41537, + "grad_norm": 0.7208968167838282, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 41537 + }, + { + "epoch": 0.41538, + "grad_norm": 0.7181474623613547, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 41538 + }, + { + "epoch": 0.41539, + "grad_norm": 0.8231043318515314, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 41539 + }, + { + "epoch": 0.4154, + "grad_norm": 1.1209933553629352, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 41540 + }, + { + "epoch": 0.41541, + "grad_norm": 1.070018383052819, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 41541 + }, + { + "epoch": 0.41542, + "grad_norm": 0.9332563535267573, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 41542 + }, + { + "epoch": 0.41543, + "grad_norm": 0.9570696213598638, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 41543 + }, + { + "epoch": 0.41544, + "grad_norm": 0.9426220880353559, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 41544 + }, + { + "epoch": 0.41545, + "grad_norm": 0.9767358261471377, + "learning_rate": 0.003, + "loss": 4.064, + "step": 41545 + }, + { + "epoch": 0.41546, + "grad_norm": 1.040662206984659, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 41546 + }, + { + "epoch": 0.41547, + "grad_norm": 0.9296786736972716, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 41547 + }, + { + "epoch": 0.41548, + "grad_norm": 0.948889789847003, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 41548 + }, + { + "epoch": 0.41549, + "grad_norm": 0.9682805966235953, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 41549 + }, + { + "epoch": 0.4155, + "grad_norm": 0.8976749853319338, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 41550 + }, + { + "epoch": 0.41551, + "grad_norm": 0.840827487456752, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 41551 + }, + { + "epoch": 0.41552, + "grad_norm": 0.7827845774377521, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 41552 + }, + { + "epoch": 0.41553, + "grad_norm": 0.8033391531202515, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 41553 + }, + { + "epoch": 0.41554, + "grad_norm": 0.8407659321305536, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 41554 + }, + { + "epoch": 0.41555, + "grad_norm": 0.8130549357911102, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 41555 + }, + { + "epoch": 0.41556, + "grad_norm": 0.8587923345719383, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 41556 + }, + { + "epoch": 0.41557, + "grad_norm": 0.8455363436087271, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 41557 + }, + { + "epoch": 0.41558, + "grad_norm": 1.0234199446002277, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 41558 + }, + { + "epoch": 0.41559, + "grad_norm": 1.1632573759080393, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 41559 + }, + { + "epoch": 0.4156, + "grad_norm": 0.8410517357270777, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 41560 + }, + { + "epoch": 0.41561, + "grad_norm": 0.82848844874279, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 41561 + }, + { + "epoch": 0.41562, + "grad_norm": 0.864429258518874, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 41562 + }, + { + "epoch": 0.41563, + "grad_norm": 0.7317608559490932, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 41563 + }, + { + "epoch": 0.41564, + "grad_norm": 0.7549020236915609, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 41564 + }, + { + "epoch": 0.41565, + "grad_norm": 0.7561023581847864, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 41565 + }, + { + "epoch": 0.41566, + "grad_norm": 0.9020883164612737, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 41566 + }, + { + "epoch": 0.41567, + "grad_norm": 1.138102724499249, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 41567 + }, + { + "epoch": 0.41568, + "grad_norm": 0.9035544844130589, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 41568 + }, + { + "epoch": 0.41569, + "grad_norm": 0.7925015005940669, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 41569 + }, + { + "epoch": 0.4157, + "grad_norm": 0.7520176541833588, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 41570 + }, + { + "epoch": 0.41571, + "grad_norm": 0.7893158284208207, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 41571 + }, + { + "epoch": 0.41572, + "grad_norm": 0.6777820163663162, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 41572 + }, + { + "epoch": 0.41573, + "grad_norm": 0.66839452730195, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 41573 + }, + { + "epoch": 0.41574, + "grad_norm": 0.7160088428701203, + "learning_rate": 0.003, + "loss": 4.022, + "step": 41574 + }, + { + "epoch": 0.41575, + "grad_norm": 0.7747564302029627, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 41575 + }, + { + "epoch": 0.41576, + "grad_norm": 0.7441472537011281, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 41576 + }, + { + "epoch": 0.41577, + "grad_norm": 0.7791061563120977, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 41577 + }, + { + "epoch": 0.41578, + "grad_norm": 0.8098426083598712, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 41578 + }, + { + "epoch": 0.41579, + "grad_norm": 0.8795372001520972, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 41579 + }, + { + "epoch": 0.4158, + "grad_norm": 0.9701783477877963, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 41580 + }, + { + "epoch": 0.41581, + "grad_norm": 1.0421676400292237, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 41581 + }, + { + "epoch": 0.41582, + "grad_norm": 1.029584574850979, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 41582 + }, + { + "epoch": 0.41583, + "grad_norm": 0.8708256695002158, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 41583 + }, + { + "epoch": 0.41584, + "grad_norm": 0.7800915964734141, + "learning_rate": 0.003, + "loss": 4.0046, + "step": 41584 + }, + { + "epoch": 0.41585, + "grad_norm": 0.8184565598433605, + "learning_rate": 0.003, + "loss": 3.998, + "step": 41585 + }, + { + "epoch": 0.41586, + "grad_norm": 0.8414307612723624, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 41586 + }, + { + "epoch": 0.41587, + "grad_norm": 0.8171070860478958, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 41587 + }, + { + "epoch": 0.41588, + "grad_norm": 0.7414525065771986, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 41588 + }, + { + "epoch": 0.41589, + "grad_norm": 0.6125279204385754, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 41589 + }, + { + "epoch": 0.4159, + "grad_norm": 0.5953054682023008, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 41590 + }, + { + "epoch": 0.41591, + "grad_norm": 0.561470161142654, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 41591 + }, + { + "epoch": 0.41592, + "grad_norm": 0.5588874962259135, + "learning_rate": 0.003, + "loss": 3.999, + "step": 41592 + }, + { + "epoch": 0.41593, + "grad_norm": 0.5171296842965291, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 41593 + }, + { + "epoch": 0.41594, + "grad_norm": 0.5230465116415102, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 41594 + }, + { + "epoch": 0.41595, + "grad_norm": 0.5295021888365528, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 41595 + }, + { + "epoch": 0.41596, + "grad_norm": 0.49061321225754945, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 41596 + }, + { + "epoch": 0.41597, + "grad_norm": 0.4121598983979464, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 41597 + }, + { + "epoch": 0.41598, + "grad_norm": 0.4819731950110841, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 41598 + }, + { + "epoch": 0.41599, + "grad_norm": 0.5839414170151052, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 41599 + }, + { + "epoch": 0.416, + "grad_norm": 0.8211332370504912, + "learning_rate": 0.003, + "loss": 3.9922, + "step": 41600 + }, + { + "epoch": 0.41601, + "grad_norm": 1.3055565816581396, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 41601 + }, + { + "epoch": 0.41602, + "grad_norm": 0.9327631218425491, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 41602 + }, + { + "epoch": 0.41603, + "grad_norm": 0.9997736593321692, + "learning_rate": 0.003, + "loss": 4.047, + "step": 41603 + }, + { + "epoch": 0.41604, + "grad_norm": 1.0239290192960946, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 41604 + }, + { + "epoch": 0.41605, + "grad_norm": 0.9199519225710017, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 41605 + }, + { + "epoch": 0.41606, + "grad_norm": 0.8298264641373914, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 41606 + }, + { + "epoch": 0.41607, + "grad_norm": 0.7523002194625519, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 41607 + }, + { + "epoch": 0.41608, + "grad_norm": 0.7505734499411191, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 41608 + }, + { + "epoch": 0.41609, + "grad_norm": 0.7505221288021017, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 41609 + }, + { + "epoch": 0.4161, + "grad_norm": 0.7243370040708468, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 41610 + }, + { + "epoch": 0.41611, + "grad_norm": 0.7450204397399396, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 41611 + }, + { + "epoch": 0.41612, + "grad_norm": 0.8447434917639617, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 41612 + }, + { + "epoch": 0.41613, + "grad_norm": 1.164080902989671, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 41613 + }, + { + "epoch": 0.41614, + "grad_norm": 1.0481076573102766, + "learning_rate": 0.003, + "loss": 4.045, + "step": 41614 + }, + { + "epoch": 0.41615, + "grad_norm": 0.8920661967126868, + "learning_rate": 0.003, + "loss": 4.003, + "step": 41615 + }, + { + "epoch": 0.41616, + "grad_norm": 0.8529054227868282, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 41616 + }, + { + "epoch": 0.41617, + "grad_norm": 0.9034786088295973, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 41617 + }, + { + "epoch": 0.41618, + "grad_norm": 0.8999076304654938, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 41618 + }, + { + "epoch": 0.41619, + "grad_norm": 0.9345934060873413, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 41619 + }, + { + "epoch": 0.4162, + "grad_norm": 0.9342300654740657, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 41620 + }, + { + "epoch": 0.41621, + "grad_norm": 0.9361434392472288, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 41621 + }, + { + "epoch": 0.41622, + "grad_norm": 1.1145817143130505, + "learning_rate": 0.003, + "loss": 4.016, + "step": 41622 + }, + { + "epoch": 0.41623, + "grad_norm": 0.9932802313077418, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 41623 + }, + { + "epoch": 0.41624, + "grad_norm": 0.8002907752395756, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 41624 + }, + { + "epoch": 0.41625, + "grad_norm": 0.6603815068282628, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 41625 + }, + { + "epoch": 0.41626, + "grad_norm": 0.6866156444811555, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 41626 + }, + { + "epoch": 0.41627, + "grad_norm": 0.7935943848744715, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 41627 + }, + { + "epoch": 0.41628, + "grad_norm": 0.8728583950930645, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 41628 + }, + { + "epoch": 0.41629, + "grad_norm": 0.8910601643038727, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 41629 + }, + { + "epoch": 0.4163, + "grad_norm": 0.9466729708847269, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 41630 + }, + { + "epoch": 0.41631, + "grad_norm": 0.9135837862945329, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 41631 + }, + { + "epoch": 0.41632, + "grad_norm": 0.8385091580476539, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 41632 + }, + { + "epoch": 0.41633, + "grad_norm": 0.8985127394208975, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 41633 + }, + { + "epoch": 0.41634, + "grad_norm": 0.9659744348126164, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 41634 + }, + { + "epoch": 0.41635, + "grad_norm": 1.040740135535971, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 41635 + }, + { + "epoch": 0.41636, + "grad_norm": 1.0425942772026506, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 41636 + }, + { + "epoch": 0.41637, + "grad_norm": 1.0841120654029799, + "learning_rate": 0.003, + "loss": 4.026, + "step": 41637 + }, + { + "epoch": 0.41638, + "grad_norm": 0.9862890500645249, + "learning_rate": 0.003, + "loss": 4.0764, + "step": 41638 + }, + { + "epoch": 0.41639, + "grad_norm": 0.9891662315650358, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 41639 + }, + { + "epoch": 0.4164, + "grad_norm": 0.8966604398567886, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 41640 + }, + { + "epoch": 0.41641, + "grad_norm": 0.885681769238867, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 41641 + }, + { + "epoch": 0.41642, + "grad_norm": 0.8682133571569973, + "learning_rate": 0.003, + "loss": 4.032, + "step": 41642 + }, + { + "epoch": 0.41643, + "grad_norm": 0.7908626879324446, + "learning_rate": 0.003, + "loss": 4.0621, + "step": 41643 + }, + { + "epoch": 0.41644, + "grad_norm": 0.8419959426412547, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 41644 + }, + { + "epoch": 0.41645, + "grad_norm": 0.8823715310071533, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 41645 + }, + { + "epoch": 0.41646, + "grad_norm": 0.9387040278312548, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 41646 + }, + { + "epoch": 0.41647, + "grad_norm": 0.8563704141086951, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 41647 + }, + { + "epoch": 0.41648, + "grad_norm": 0.829678607034557, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 41648 + }, + { + "epoch": 0.41649, + "grad_norm": 0.8461070160840798, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 41649 + }, + { + "epoch": 0.4165, + "grad_norm": 0.8193138492591616, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 41650 + }, + { + "epoch": 0.41651, + "grad_norm": 0.8728905255530686, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 41651 + }, + { + "epoch": 0.41652, + "grad_norm": 0.939274888872712, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 41652 + }, + { + "epoch": 0.41653, + "grad_norm": 0.838989558426603, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 41653 + }, + { + "epoch": 0.41654, + "grad_norm": 0.7504237242495893, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 41654 + }, + { + "epoch": 0.41655, + "grad_norm": 0.7119355525881398, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 41655 + }, + { + "epoch": 0.41656, + "grad_norm": 0.7429071312198379, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 41656 + }, + { + "epoch": 0.41657, + "grad_norm": 0.8186990768098257, + "learning_rate": 0.003, + "loss": 4.047, + "step": 41657 + }, + { + "epoch": 0.41658, + "grad_norm": 0.9336372277762367, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 41658 + }, + { + "epoch": 0.41659, + "grad_norm": 1.174987567757826, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 41659 + }, + { + "epoch": 0.4166, + "grad_norm": 1.0070726474511549, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 41660 + }, + { + "epoch": 0.41661, + "grad_norm": 0.9211259409474325, + "learning_rate": 0.003, + "loss": 3.9778, + "step": 41661 + }, + { + "epoch": 0.41662, + "grad_norm": 0.7634882283774561, + "learning_rate": 0.003, + "loss": 4.0016, + "step": 41662 + }, + { + "epoch": 0.41663, + "grad_norm": 0.6983479274160134, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 41663 + }, + { + "epoch": 0.41664, + "grad_norm": 0.5796454184058165, + "learning_rate": 0.003, + "loss": 4.02, + "step": 41664 + }, + { + "epoch": 0.41665, + "grad_norm": 0.5921723416259796, + "learning_rate": 0.003, + "loss": 3.9869, + "step": 41665 + }, + { + "epoch": 0.41666, + "grad_norm": 0.6585969718191118, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 41666 + }, + { + "epoch": 0.41667, + "grad_norm": 0.7292971463147138, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 41667 + }, + { + "epoch": 0.41668, + "grad_norm": 0.795732101863224, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 41668 + }, + { + "epoch": 0.41669, + "grad_norm": 0.8086109609685839, + "learning_rate": 0.003, + "loss": 4.053, + "step": 41669 + }, + { + "epoch": 0.4167, + "grad_norm": 0.7777055772466963, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 41670 + }, + { + "epoch": 0.41671, + "grad_norm": 0.6432224914796916, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 41671 + }, + { + "epoch": 0.41672, + "grad_norm": 0.6113296707316782, + "learning_rate": 0.003, + "loss": 4.003, + "step": 41672 + }, + { + "epoch": 0.41673, + "grad_norm": 0.6541684403112763, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 41673 + }, + { + "epoch": 0.41674, + "grad_norm": 0.7295448931224449, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 41674 + }, + { + "epoch": 0.41675, + "grad_norm": 0.8901356351302094, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 41675 + }, + { + "epoch": 0.41676, + "grad_norm": 1.0633184932444588, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 41676 + }, + { + "epoch": 0.41677, + "grad_norm": 0.871083588644609, + "learning_rate": 0.003, + "loss": 4.029, + "step": 41677 + }, + { + "epoch": 0.41678, + "grad_norm": 0.7271367147607154, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 41678 + }, + { + "epoch": 0.41679, + "grad_norm": 0.9158245860807955, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 41679 + }, + { + "epoch": 0.4168, + "grad_norm": 1.0011261779360074, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 41680 + }, + { + "epoch": 0.41681, + "grad_norm": 1.0657301091204765, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 41681 + }, + { + "epoch": 0.41682, + "grad_norm": 0.7628359323113456, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 41682 + }, + { + "epoch": 0.41683, + "grad_norm": 0.8035214700682868, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 41683 + }, + { + "epoch": 0.41684, + "grad_norm": 0.7572532284211864, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 41684 + }, + { + "epoch": 0.41685, + "grad_norm": 0.7292224954465065, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 41685 + }, + { + "epoch": 0.41686, + "grad_norm": 0.7775079430147799, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 41686 + }, + { + "epoch": 0.41687, + "grad_norm": 0.8070370132550354, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 41687 + }, + { + "epoch": 0.41688, + "grad_norm": 0.8878543960281905, + "learning_rate": 0.003, + "loss": 3.9961, + "step": 41688 + }, + { + "epoch": 0.41689, + "grad_norm": 1.0579859201795776, + "learning_rate": 0.003, + "loss": 4.055, + "step": 41689 + }, + { + "epoch": 0.4169, + "grad_norm": 1.049171442731083, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 41690 + }, + { + "epoch": 0.41691, + "grad_norm": 1.0997221206829622, + "learning_rate": 0.003, + "loss": 4.0695, + "step": 41691 + }, + { + "epoch": 0.41692, + "grad_norm": 0.9988171701709235, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 41692 + }, + { + "epoch": 0.41693, + "grad_norm": 1.087510355237598, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 41693 + }, + { + "epoch": 0.41694, + "grad_norm": 0.890381077162634, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 41694 + }, + { + "epoch": 0.41695, + "grad_norm": 0.7454904744509152, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 41695 + }, + { + "epoch": 0.41696, + "grad_norm": 0.6709002645475265, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 41696 + }, + { + "epoch": 0.41697, + "grad_norm": 0.6391240767460119, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 41697 + }, + { + "epoch": 0.41698, + "grad_norm": 0.6119834186215335, + "learning_rate": 0.003, + "loss": 4.005, + "step": 41698 + }, + { + "epoch": 0.41699, + "grad_norm": 0.6325666649132798, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 41699 + }, + { + "epoch": 0.417, + "grad_norm": 0.646788723909405, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 41700 + }, + { + "epoch": 0.41701, + "grad_norm": 0.6270330633762642, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 41701 + }, + { + "epoch": 0.41702, + "grad_norm": 0.6414596126165196, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 41702 + }, + { + "epoch": 0.41703, + "grad_norm": 0.6798893358027917, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 41703 + }, + { + "epoch": 0.41704, + "grad_norm": 0.681146891275249, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 41704 + }, + { + "epoch": 0.41705, + "grad_norm": 0.894118006176076, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 41705 + }, + { + "epoch": 0.41706, + "grad_norm": 1.1763685518941163, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 41706 + }, + { + "epoch": 0.41707, + "grad_norm": 1.0418357551395372, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 41707 + }, + { + "epoch": 0.41708, + "grad_norm": 0.7819208838061271, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 41708 + }, + { + "epoch": 0.41709, + "grad_norm": 0.6091588900165696, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 41709 + }, + { + "epoch": 0.4171, + "grad_norm": 0.6992743102835609, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 41710 + }, + { + "epoch": 0.41711, + "grad_norm": 0.7521293270451662, + "learning_rate": 0.003, + "loss": 4.03, + "step": 41711 + }, + { + "epoch": 0.41712, + "grad_norm": 0.6875616146597439, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 41712 + }, + { + "epoch": 0.41713, + "grad_norm": 0.7446532465691443, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 41713 + }, + { + "epoch": 0.41714, + "grad_norm": 0.7457465303396997, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 41714 + }, + { + "epoch": 0.41715, + "grad_norm": 0.808655261795599, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 41715 + }, + { + "epoch": 0.41716, + "grad_norm": 0.6669911661409678, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 41716 + }, + { + "epoch": 0.41717, + "grad_norm": 0.677870261406607, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 41717 + }, + { + "epoch": 0.41718, + "grad_norm": 0.7450091363810676, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 41718 + }, + { + "epoch": 0.41719, + "grad_norm": 0.8929332715515811, + "learning_rate": 0.003, + "loss": 4.048, + "step": 41719 + }, + { + "epoch": 0.4172, + "grad_norm": 1.1509738402213476, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 41720 + }, + { + "epoch": 0.41721, + "grad_norm": 0.8773897227381047, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 41721 + }, + { + "epoch": 0.41722, + "grad_norm": 0.8335264750480783, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 41722 + }, + { + "epoch": 0.41723, + "grad_norm": 0.8492324315217358, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 41723 + }, + { + "epoch": 0.41724, + "grad_norm": 0.872962570441549, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 41724 + }, + { + "epoch": 0.41725, + "grad_norm": 0.9403437252808954, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 41725 + }, + { + "epoch": 0.41726, + "grad_norm": 0.7845865819571174, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 41726 + }, + { + "epoch": 0.41727, + "grad_norm": 0.8096134225252243, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 41727 + }, + { + "epoch": 0.41728, + "grad_norm": 0.7861668790346207, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 41728 + }, + { + "epoch": 0.41729, + "grad_norm": 0.9697100568920919, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 41729 + }, + { + "epoch": 0.4173, + "grad_norm": 0.9497977548733966, + "learning_rate": 0.003, + "loss": 4.033, + "step": 41730 + }, + { + "epoch": 0.41731, + "grad_norm": 1.1131227713400642, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 41731 + }, + { + "epoch": 0.41732, + "grad_norm": 0.9873688145761969, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 41732 + }, + { + "epoch": 0.41733, + "grad_norm": 1.0175581275296777, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 41733 + }, + { + "epoch": 0.41734, + "grad_norm": 1.0128276943300156, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 41734 + }, + { + "epoch": 0.41735, + "grad_norm": 1.195974804574915, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 41735 + }, + { + "epoch": 0.41736, + "grad_norm": 1.2854746568492923, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 41736 + }, + { + "epoch": 0.41737, + "grad_norm": 0.8046948096782253, + "learning_rate": 0.003, + "loss": 4.073, + "step": 41737 + }, + { + "epoch": 0.41738, + "grad_norm": 0.7001190731009119, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 41738 + }, + { + "epoch": 0.41739, + "grad_norm": 0.6345751812395191, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 41739 + }, + { + "epoch": 0.4174, + "grad_norm": 0.667247710354922, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 41740 + }, + { + "epoch": 0.41741, + "grad_norm": 0.6444777680314953, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 41741 + }, + { + "epoch": 0.41742, + "grad_norm": 0.6539144424771953, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 41742 + }, + { + "epoch": 0.41743, + "grad_norm": 0.8810279863684699, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 41743 + }, + { + "epoch": 0.41744, + "grad_norm": 1.181225067334946, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 41744 + }, + { + "epoch": 0.41745, + "grad_norm": 1.078803610380891, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 41745 + }, + { + "epoch": 0.41746, + "grad_norm": 0.9276217789194365, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 41746 + }, + { + "epoch": 0.41747, + "grad_norm": 0.8326736036762159, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 41747 + }, + { + "epoch": 0.41748, + "grad_norm": 0.7803678478170287, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 41748 + }, + { + "epoch": 0.41749, + "grad_norm": 0.848734593327102, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 41749 + }, + { + "epoch": 0.4175, + "grad_norm": 0.9787981446294325, + "learning_rate": 0.003, + "loss": 4.0649, + "step": 41750 + }, + { + "epoch": 0.41751, + "grad_norm": 1.0792215749410814, + "learning_rate": 0.003, + "loss": 4.053, + "step": 41751 + }, + { + "epoch": 0.41752, + "grad_norm": 0.988045769832171, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 41752 + }, + { + "epoch": 0.41753, + "grad_norm": 0.868219425219256, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 41753 + }, + { + "epoch": 0.41754, + "grad_norm": 0.6640184598220698, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 41754 + }, + { + "epoch": 0.41755, + "grad_norm": 0.6067587881100425, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 41755 + }, + { + "epoch": 0.41756, + "grad_norm": 0.672416082816623, + "learning_rate": 0.003, + "loss": 4.033, + "step": 41756 + }, + { + "epoch": 0.41757, + "grad_norm": 0.5903543401567858, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 41757 + }, + { + "epoch": 0.41758, + "grad_norm": 0.5338594892799929, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 41758 + }, + { + "epoch": 0.41759, + "grad_norm": 0.49297068469398125, + "learning_rate": 0.003, + "loss": 4.004, + "step": 41759 + }, + { + "epoch": 0.4176, + "grad_norm": 0.610281271562211, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 41760 + }, + { + "epoch": 0.41761, + "grad_norm": 0.6369448204968046, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 41761 + }, + { + "epoch": 0.41762, + "grad_norm": 0.5786826942424448, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 41762 + }, + { + "epoch": 0.41763, + "grad_norm": 0.6748106221075933, + "learning_rate": 0.003, + "loss": 4.0002, + "step": 41763 + }, + { + "epoch": 0.41764, + "grad_norm": 0.8974209717137639, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 41764 + }, + { + "epoch": 0.41765, + "grad_norm": 1.0884397042735785, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 41765 + }, + { + "epoch": 0.41766, + "grad_norm": 1.0003415163330487, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 41766 + }, + { + "epoch": 0.41767, + "grad_norm": 0.9336558871832432, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 41767 + }, + { + "epoch": 0.41768, + "grad_norm": 0.9215997161961577, + "learning_rate": 0.003, + "loss": 3.9952, + "step": 41768 + }, + { + "epoch": 0.41769, + "grad_norm": 0.908877235218154, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 41769 + }, + { + "epoch": 0.4177, + "grad_norm": 0.8391990202963338, + "learning_rate": 0.003, + "loss": 4.0001, + "step": 41770 + }, + { + "epoch": 0.41771, + "grad_norm": 0.8000624281835509, + "learning_rate": 0.003, + "loss": 4.042, + "step": 41771 + }, + { + "epoch": 0.41772, + "grad_norm": 0.806939476033626, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 41772 + }, + { + "epoch": 0.41773, + "grad_norm": 0.9649537777096232, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 41773 + }, + { + "epoch": 0.41774, + "grad_norm": 1.0953406263785965, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 41774 + }, + { + "epoch": 0.41775, + "grad_norm": 0.7086872110219957, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 41775 + }, + { + "epoch": 0.41776, + "grad_norm": 0.716689625827094, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 41776 + }, + { + "epoch": 0.41777, + "grad_norm": 0.6664569769936947, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 41777 + }, + { + "epoch": 0.41778, + "grad_norm": 0.6966755907940546, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 41778 + }, + { + "epoch": 0.41779, + "grad_norm": 0.7976827236121777, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 41779 + }, + { + "epoch": 0.4178, + "grad_norm": 0.9536287895369628, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 41780 + }, + { + "epoch": 0.41781, + "grad_norm": 1.0965934594251885, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 41781 + }, + { + "epoch": 0.41782, + "grad_norm": 0.8289033488714086, + "learning_rate": 0.003, + "loss": 4.021, + "step": 41782 + }, + { + "epoch": 0.41783, + "grad_norm": 0.7568881935016586, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 41783 + }, + { + "epoch": 0.41784, + "grad_norm": 0.7506387006644486, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 41784 + }, + { + "epoch": 0.41785, + "grad_norm": 0.8040785351607733, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 41785 + }, + { + "epoch": 0.41786, + "grad_norm": 0.8933217129293253, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 41786 + }, + { + "epoch": 0.41787, + "grad_norm": 0.8473221551700084, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 41787 + }, + { + "epoch": 0.41788, + "grad_norm": 0.8360647278017732, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 41788 + }, + { + "epoch": 0.41789, + "grad_norm": 0.7922661219373075, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 41789 + }, + { + "epoch": 0.4179, + "grad_norm": 0.7051762210130941, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 41790 + }, + { + "epoch": 0.41791, + "grad_norm": 0.8230399743228503, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 41791 + }, + { + "epoch": 0.41792, + "grad_norm": 1.0811383114810835, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 41792 + }, + { + "epoch": 0.41793, + "grad_norm": 1.0684966794268171, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 41793 + }, + { + "epoch": 0.41794, + "grad_norm": 1.0696345121218196, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 41794 + }, + { + "epoch": 0.41795, + "grad_norm": 0.8767900616381594, + "learning_rate": 0.003, + "loss": 4.0036, + "step": 41795 + }, + { + "epoch": 0.41796, + "grad_norm": 0.7643821528323332, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 41796 + }, + { + "epoch": 0.41797, + "grad_norm": 0.8103219512686555, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 41797 + }, + { + "epoch": 0.41798, + "grad_norm": 0.8212774314588479, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 41798 + }, + { + "epoch": 0.41799, + "grad_norm": 0.8023999055162192, + "learning_rate": 0.003, + "loss": 4.0633, + "step": 41799 + }, + { + "epoch": 0.418, + "grad_norm": 0.8924459705960093, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 41800 + }, + { + "epoch": 0.41801, + "grad_norm": 0.961721446979145, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 41801 + }, + { + "epoch": 0.41802, + "grad_norm": 1.050231965524075, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 41802 + }, + { + "epoch": 0.41803, + "grad_norm": 1.0063374132577576, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 41803 + }, + { + "epoch": 0.41804, + "grad_norm": 0.8599011937435839, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 41804 + }, + { + "epoch": 0.41805, + "grad_norm": 0.6563594576129467, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 41805 + }, + { + "epoch": 0.41806, + "grad_norm": 0.6933713838570927, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 41806 + }, + { + "epoch": 0.41807, + "grad_norm": 0.6819540539423808, + "learning_rate": 0.003, + "loss": 4.012, + "step": 41807 + }, + { + "epoch": 0.41808, + "grad_norm": 0.6463813140453396, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 41808 + }, + { + "epoch": 0.41809, + "grad_norm": 0.6230014982645746, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 41809 + }, + { + "epoch": 0.4181, + "grad_norm": 0.587368997722808, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 41810 + }, + { + "epoch": 0.41811, + "grad_norm": 0.6798571366836191, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 41811 + }, + { + "epoch": 0.41812, + "grad_norm": 0.8258633103007862, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 41812 + }, + { + "epoch": 0.41813, + "grad_norm": 0.8936326751794123, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 41813 + }, + { + "epoch": 0.41814, + "grad_norm": 0.9469022617163186, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 41814 + }, + { + "epoch": 0.41815, + "grad_norm": 1.0083348318867342, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 41815 + }, + { + "epoch": 0.41816, + "grad_norm": 0.9965651952992377, + "learning_rate": 0.003, + "loss": 4.051, + "step": 41816 + }, + { + "epoch": 0.41817, + "grad_norm": 0.9319293344457844, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 41817 + }, + { + "epoch": 0.41818, + "grad_norm": 0.8378374010931392, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 41818 + }, + { + "epoch": 0.41819, + "grad_norm": 0.8318602680087464, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 41819 + }, + { + "epoch": 0.4182, + "grad_norm": 0.8241256258324375, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 41820 + }, + { + "epoch": 0.41821, + "grad_norm": 0.9417714895633582, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 41821 + }, + { + "epoch": 0.41822, + "grad_norm": 1.030155651031654, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 41822 + }, + { + "epoch": 0.41823, + "grad_norm": 0.8889640639638249, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 41823 + }, + { + "epoch": 0.41824, + "grad_norm": 0.8775204285818284, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 41824 + }, + { + "epoch": 0.41825, + "grad_norm": 0.838557887715883, + "learning_rate": 0.003, + "loss": 4.015, + "step": 41825 + }, + { + "epoch": 0.41826, + "grad_norm": 0.6945513147635556, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 41826 + }, + { + "epoch": 0.41827, + "grad_norm": 0.6857155662732731, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 41827 + }, + { + "epoch": 0.41828, + "grad_norm": 0.7355587108338679, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 41828 + }, + { + "epoch": 0.41829, + "grad_norm": 0.87685492651005, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 41829 + }, + { + "epoch": 0.4183, + "grad_norm": 0.8838639288647833, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 41830 + }, + { + "epoch": 0.41831, + "grad_norm": 0.9205467403272327, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 41831 + }, + { + "epoch": 0.41832, + "grad_norm": 1.0061251525314776, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 41832 + }, + { + "epoch": 0.41833, + "grad_norm": 1.0723338181826572, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 41833 + }, + { + "epoch": 0.41834, + "grad_norm": 0.8777889760220016, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 41834 + }, + { + "epoch": 0.41835, + "grad_norm": 0.8990531439054493, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 41835 + }, + { + "epoch": 0.41836, + "grad_norm": 0.8240083942932461, + "learning_rate": 0.003, + "loss": 4.045, + "step": 41836 + }, + { + "epoch": 0.41837, + "grad_norm": 0.968694963688097, + "learning_rate": 0.003, + "loss": 4.054, + "step": 41837 + }, + { + "epoch": 0.41838, + "grad_norm": 1.0375461180299472, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 41838 + }, + { + "epoch": 0.41839, + "grad_norm": 0.9832625149820325, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 41839 + }, + { + "epoch": 0.4184, + "grad_norm": 1.0019796584270255, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 41840 + }, + { + "epoch": 0.41841, + "grad_norm": 1.0204075771031211, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 41841 + }, + { + "epoch": 0.41842, + "grad_norm": 1.084518595812644, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 41842 + }, + { + "epoch": 0.41843, + "grad_norm": 0.8877534618981299, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 41843 + }, + { + "epoch": 0.41844, + "grad_norm": 0.7990701069413986, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 41844 + }, + { + "epoch": 0.41845, + "grad_norm": 0.759541596223383, + "learning_rate": 0.003, + "loss": 4.061, + "step": 41845 + }, + { + "epoch": 0.41846, + "grad_norm": 0.7873555029837068, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 41846 + }, + { + "epoch": 0.41847, + "grad_norm": 0.7233215969888693, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 41847 + }, + { + "epoch": 0.41848, + "grad_norm": 0.6309060077537969, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 41848 + }, + { + "epoch": 0.41849, + "grad_norm": 0.724794949584867, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 41849 + }, + { + "epoch": 0.4185, + "grad_norm": 0.7870084717107348, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 41850 + }, + { + "epoch": 0.41851, + "grad_norm": 0.7235719380224364, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 41851 + }, + { + "epoch": 0.41852, + "grad_norm": 0.6699217126823162, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 41852 + }, + { + "epoch": 0.41853, + "grad_norm": 0.5942100985786215, + "learning_rate": 0.003, + "loss": 4.042, + "step": 41853 + }, + { + "epoch": 0.41854, + "grad_norm": 0.6014941721188909, + "learning_rate": 0.003, + "loss": 4.037, + "step": 41854 + }, + { + "epoch": 0.41855, + "grad_norm": 0.7776655461127068, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 41855 + }, + { + "epoch": 0.41856, + "grad_norm": 0.9823109757834566, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 41856 + }, + { + "epoch": 0.41857, + "grad_norm": 1.0290580681452688, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 41857 + }, + { + "epoch": 0.41858, + "grad_norm": 1.008821201417201, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 41858 + }, + { + "epoch": 0.41859, + "grad_norm": 0.8576611412272571, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 41859 + }, + { + "epoch": 0.4186, + "grad_norm": 0.7904384961952324, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 41860 + }, + { + "epoch": 0.41861, + "grad_norm": 0.7396380292399165, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 41861 + }, + { + "epoch": 0.41862, + "grad_norm": 0.6921626589703336, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 41862 + }, + { + "epoch": 0.41863, + "grad_norm": 0.8408513783910044, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 41863 + }, + { + "epoch": 0.41864, + "grad_norm": 0.8373799746202127, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 41864 + }, + { + "epoch": 0.41865, + "grad_norm": 0.6866668559142732, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 41865 + }, + { + "epoch": 0.41866, + "grad_norm": 0.5334508860134023, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 41866 + }, + { + "epoch": 0.41867, + "grad_norm": 0.5614560888324148, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 41867 + }, + { + "epoch": 0.41868, + "grad_norm": 0.6430033319947199, + "learning_rate": 0.003, + "loss": 4.029, + "step": 41868 + }, + { + "epoch": 0.41869, + "grad_norm": 0.7145272194219755, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 41869 + }, + { + "epoch": 0.4187, + "grad_norm": 0.7046981448955424, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 41870 + }, + { + "epoch": 0.41871, + "grad_norm": 0.7331738904401949, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 41871 + }, + { + "epoch": 0.41872, + "grad_norm": 0.6441555125305539, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 41872 + }, + { + "epoch": 0.41873, + "grad_norm": 0.7700697752688779, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 41873 + }, + { + "epoch": 0.41874, + "grad_norm": 0.9670388947586054, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 41874 + }, + { + "epoch": 0.41875, + "grad_norm": 1.1166008854544072, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 41875 + }, + { + "epoch": 0.41876, + "grad_norm": 0.8072750350363207, + "learning_rate": 0.003, + "loss": 4.046, + "step": 41876 + }, + { + "epoch": 0.41877, + "grad_norm": 0.7296370913429107, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 41877 + }, + { + "epoch": 0.41878, + "grad_norm": 0.7458450383819837, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 41878 + }, + { + "epoch": 0.41879, + "grad_norm": 0.6926081348038691, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 41879 + }, + { + "epoch": 0.4188, + "grad_norm": 0.671950107425444, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 41880 + }, + { + "epoch": 0.41881, + "grad_norm": 0.6292416181163417, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 41881 + }, + { + "epoch": 0.41882, + "grad_norm": 0.6316669874271531, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 41882 + }, + { + "epoch": 0.41883, + "grad_norm": 0.6985661277124255, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 41883 + }, + { + "epoch": 0.41884, + "grad_norm": 0.9283797600766927, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 41884 + }, + { + "epoch": 0.41885, + "grad_norm": 1.1920473715930768, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 41885 + }, + { + "epoch": 0.41886, + "grad_norm": 0.8227590995570735, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 41886 + }, + { + "epoch": 0.41887, + "grad_norm": 1.0242485925824325, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 41887 + }, + { + "epoch": 0.41888, + "grad_norm": 1.2011736236470043, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 41888 + }, + { + "epoch": 0.41889, + "grad_norm": 0.7757046576717853, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 41889 + }, + { + "epoch": 0.4189, + "grad_norm": 0.6569661644567644, + "learning_rate": 0.003, + "loss": 3.9944, + "step": 41890 + }, + { + "epoch": 0.41891, + "grad_norm": 0.738212546881269, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 41891 + }, + { + "epoch": 0.41892, + "grad_norm": 0.7489510633078512, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 41892 + }, + { + "epoch": 0.41893, + "grad_norm": 0.7008726300702248, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 41893 + }, + { + "epoch": 0.41894, + "grad_norm": 0.7536335725165481, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 41894 + }, + { + "epoch": 0.41895, + "grad_norm": 0.8797335360673173, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 41895 + }, + { + "epoch": 0.41896, + "grad_norm": 1.0390471515341242, + "learning_rate": 0.003, + "loss": 4.009, + "step": 41896 + }, + { + "epoch": 0.41897, + "grad_norm": 1.1625410094838908, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 41897 + }, + { + "epoch": 0.41898, + "grad_norm": 0.7121372356104232, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 41898 + }, + { + "epoch": 0.41899, + "grad_norm": 0.7420595647139816, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 41899 + }, + { + "epoch": 0.419, + "grad_norm": 0.8053167578828283, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 41900 + }, + { + "epoch": 0.41901, + "grad_norm": 0.6900988749650399, + "learning_rate": 0.003, + "loss": 4.03, + "step": 41901 + }, + { + "epoch": 0.41902, + "grad_norm": 0.7368876526742718, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 41902 + }, + { + "epoch": 0.41903, + "grad_norm": 0.9314640896111739, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 41903 + }, + { + "epoch": 0.41904, + "grad_norm": 1.057903355552666, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 41904 + }, + { + "epoch": 0.41905, + "grad_norm": 0.8913755955690298, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 41905 + }, + { + "epoch": 0.41906, + "grad_norm": 0.8480604907715621, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 41906 + }, + { + "epoch": 0.41907, + "grad_norm": 0.8188428378455288, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 41907 + }, + { + "epoch": 0.41908, + "grad_norm": 0.8664571034182496, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 41908 + }, + { + "epoch": 0.41909, + "grad_norm": 0.9677490425430147, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 41909 + }, + { + "epoch": 0.4191, + "grad_norm": 0.8595510279624902, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 41910 + }, + { + "epoch": 0.41911, + "grad_norm": 0.7522638364630586, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 41911 + }, + { + "epoch": 0.41912, + "grad_norm": 0.7067139332542369, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 41912 + }, + { + "epoch": 0.41913, + "grad_norm": 0.6747640006530338, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 41913 + }, + { + "epoch": 0.41914, + "grad_norm": 0.7823272005265649, + "learning_rate": 0.003, + "loss": 4.044, + "step": 41914 + }, + { + "epoch": 0.41915, + "grad_norm": 1.1560012048230155, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 41915 + }, + { + "epoch": 0.41916, + "grad_norm": 1.2373325869281697, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 41916 + }, + { + "epoch": 0.41917, + "grad_norm": 0.6937739929608624, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 41917 + }, + { + "epoch": 0.41918, + "grad_norm": 0.6374802222880095, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 41918 + }, + { + "epoch": 0.41919, + "grad_norm": 0.6421152829156554, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 41919 + }, + { + "epoch": 0.4192, + "grad_norm": 0.6122240622973385, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 41920 + }, + { + "epoch": 0.41921, + "grad_norm": 0.6162481860731317, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 41921 + }, + { + "epoch": 0.41922, + "grad_norm": 0.6195716397956177, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 41922 + }, + { + "epoch": 0.41923, + "grad_norm": 0.6721832352978013, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 41923 + }, + { + "epoch": 0.41924, + "grad_norm": 0.7380592203371225, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 41924 + }, + { + "epoch": 0.41925, + "grad_norm": 0.8194917668164882, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 41925 + }, + { + "epoch": 0.41926, + "grad_norm": 0.8836830696406779, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 41926 + }, + { + "epoch": 0.41927, + "grad_norm": 0.9532015620741225, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 41927 + }, + { + "epoch": 0.41928, + "grad_norm": 0.9125803846619991, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 41928 + }, + { + "epoch": 0.41929, + "grad_norm": 0.8980089159020077, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 41929 + }, + { + "epoch": 0.4193, + "grad_norm": 0.9201981914539951, + "learning_rate": 0.003, + "loss": 4.023, + "step": 41930 + }, + { + "epoch": 0.41931, + "grad_norm": 1.0950164917940879, + "learning_rate": 0.003, + "loss": 4.029, + "step": 41931 + }, + { + "epoch": 0.41932, + "grad_norm": 1.1680739454498024, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 41932 + }, + { + "epoch": 0.41933, + "grad_norm": 0.886923365351654, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 41933 + }, + { + "epoch": 0.41934, + "grad_norm": 0.9245754413984116, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 41934 + }, + { + "epoch": 0.41935, + "grad_norm": 0.99204009124921, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 41935 + }, + { + "epoch": 0.41936, + "grad_norm": 0.9472749842248008, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 41936 + }, + { + "epoch": 0.41937, + "grad_norm": 0.9701515966325731, + "learning_rate": 0.003, + "loss": 4.021, + "step": 41937 + }, + { + "epoch": 0.41938, + "grad_norm": 0.9979547593768884, + "learning_rate": 0.003, + "loss": 4.0712, + "step": 41938 + }, + { + "epoch": 0.41939, + "grad_norm": 1.068998263440927, + "learning_rate": 0.003, + "loss": 4.0681, + "step": 41939 + }, + { + "epoch": 0.4194, + "grad_norm": 0.9442304285303487, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 41940 + }, + { + "epoch": 0.41941, + "grad_norm": 1.0489864430811442, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 41941 + }, + { + "epoch": 0.41942, + "grad_norm": 1.0137946979001544, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 41942 + }, + { + "epoch": 0.41943, + "grad_norm": 1.012736558113824, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 41943 + }, + { + "epoch": 0.41944, + "grad_norm": 1.1457588998988202, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 41944 + }, + { + "epoch": 0.41945, + "grad_norm": 0.7983625794913519, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 41945 + }, + { + "epoch": 0.41946, + "grad_norm": 0.7068536638343254, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 41946 + }, + { + "epoch": 0.41947, + "grad_norm": 0.6178046158982846, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 41947 + }, + { + "epoch": 0.41948, + "grad_norm": 0.510540357498098, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 41948 + }, + { + "epoch": 0.41949, + "grad_norm": 0.46110471098134753, + "learning_rate": 0.003, + "loss": 4.036, + "step": 41949 + }, + { + "epoch": 0.4195, + "grad_norm": 0.43147078233458164, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 41950 + }, + { + "epoch": 0.41951, + "grad_norm": 0.43669324103881607, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 41951 + }, + { + "epoch": 0.41952, + "grad_norm": 0.42870924281592426, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 41952 + }, + { + "epoch": 0.41953, + "grad_norm": 0.47920560109295085, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 41953 + }, + { + "epoch": 0.41954, + "grad_norm": 0.5551905724418512, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 41954 + }, + { + "epoch": 0.41955, + "grad_norm": 0.6018050209737762, + "learning_rate": 0.003, + "loss": 3.9943, + "step": 41955 + }, + { + "epoch": 0.41956, + "grad_norm": 0.6764761875507685, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 41956 + }, + { + "epoch": 0.41957, + "grad_norm": 0.740667926229069, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 41957 + }, + { + "epoch": 0.41958, + "grad_norm": 0.9006579491028663, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 41958 + }, + { + "epoch": 0.41959, + "grad_norm": 0.9568146136671568, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 41959 + }, + { + "epoch": 0.4196, + "grad_norm": 0.9057336830990164, + "learning_rate": 0.003, + "loss": 4.0038, + "step": 41960 + }, + { + "epoch": 0.41961, + "grad_norm": 0.9873963052449148, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 41961 + }, + { + "epoch": 0.41962, + "grad_norm": 1.2249679800355944, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 41962 + }, + { + "epoch": 0.41963, + "grad_norm": 0.8623061242656561, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 41963 + }, + { + "epoch": 0.41964, + "grad_norm": 0.875643199690452, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 41964 + }, + { + "epoch": 0.41965, + "grad_norm": 1.062707011690637, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 41965 + }, + { + "epoch": 0.41966, + "grad_norm": 1.070140383146322, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 41966 + }, + { + "epoch": 0.41967, + "grad_norm": 0.9625753730377397, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 41967 + }, + { + "epoch": 0.41968, + "grad_norm": 0.8868729573938655, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 41968 + }, + { + "epoch": 0.41969, + "grad_norm": 0.8457104436185023, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 41969 + }, + { + "epoch": 0.4197, + "grad_norm": 0.7793434131054318, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 41970 + }, + { + "epoch": 0.41971, + "grad_norm": 0.6806409250980419, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 41971 + }, + { + "epoch": 0.41972, + "grad_norm": 0.801633527227838, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 41972 + }, + { + "epoch": 0.41973, + "grad_norm": 0.8259956953458057, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 41973 + }, + { + "epoch": 0.41974, + "grad_norm": 0.8817527221097698, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 41974 + }, + { + "epoch": 0.41975, + "grad_norm": 0.9361083462417511, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 41975 + }, + { + "epoch": 0.41976, + "grad_norm": 0.8428500751554211, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 41976 + }, + { + "epoch": 0.41977, + "grad_norm": 0.6881678941524536, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 41977 + }, + { + "epoch": 0.41978, + "grad_norm": 0.6875915035542749, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 41978 + }, + { + "epoch": 0.41979, + "grad_norm": 0.7018819841668035, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 41979 + }, + { + "epoch": 0.4198, + "grad_norm": 0.6993473968833854, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 41980 + }, + { + "epoch": 0.41981, + "grad_norm": 0.6333067171620967, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 41981 + }, + { + "epoch": 0.41982, + "grad_norm": 0.7107894951808577, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 41982 + }, + { + "epoch": 0.41983, + "grad_norm": 0.7776766427809282, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 41983 + }, + { + "epoch": 0.41984, + "grad_norm": 0.718712572643968, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 41984 + }, + { + "epoch": 0.41985, + "grad_norm": 0.6772972379867787, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 41985 + }, + { + "epoch": 0.41986, + "grad_norm": 0.7577240504875801, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 41986 + }, + { + "epoch": 0.41987, + "grad_norm": 1.08164062843504, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 41987 + }, + { + "epoch": 0.41988, + "grad_norm": 1.2708035601482255, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 41988 + }, + { + "epoch": 0.41989, + "grad_norm": 0.7809760601423903, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 41989 + }, + { + "epoch": 0.4199, + "grad_norm": 0.6725771543166088, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 41990 + }, + { + "epoch": 0.41991, + "grad_norm": 0.6911608087132007, + "learning_rate": 0.003, + "loss": 4.0044, + "step": 41991 + }, + { + "epoch": 0.41992, + "grad_norm": 0.7195400560451717, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 41992 + }, + { + "epoch": 0.41993, + "grad_norm": 0.7428062644645633, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 41993 + }, + { + "epoch": 0.41994, + "grad_norm": 0.7843404629898937, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 41994 + }, + { + "epoch": 0.41995, + "grad_norm": 0.8416321898765078, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 41995 + }, + { + "epoch": 0.41996, + "grad_norm": 0.8920591394007581, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 41996 + }, + { + "epoch": 0.41997, + "grad_norm": 0.9831250855412378, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 41997 + }, + { + "epoch": 0.41998, + "grad_norm": 1.134326705409468, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 41998 + }, + { + "epoch": 0.41999, + "grad_norm": 0.8181428647756868, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 41999 + }, + { + "epoch": 0.42, + "grad_norm": 0.8344948741617003, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 42000 + }, + { + "epoch": 0.42001, + "grad_norm": 0.8513024535728901, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 42001 + }, + { + "epoch": 0.42002, + "grad_norm": 0.9198319773044318, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 42002 + }, + { + "epoch": 0.42003, + "grad_norm": 1.0915695932752936, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 42003 + }, + { + "epoch": 0.42004, + "grad_norm": 0.950035950975282, + "learning_rate": 0.003, + "loss": 4.047, + "step": 42004 + }, + { + "epoch": 0.42005, + "grad_norm": 0.816490755768133, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 42005 + }, + { + "epoch": 0.42006, + "grad_norm": 0.8067449002247555, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 42006 + }, + { + "epoch": 0.42007, + "grad_norm": 0.9068492016301725, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 42007 + }, + { + "epoch": 0.42008, + "grad_norm": 1.034338720132964, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 42008 + }, + { + "epoch": 0.42009, + "grad_norm": 0.9792318996702108, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 42009 + }, + { + "epoch": 0.4201, + "grad_norm": 0.9141073964387794, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 42010 + }, + { + "epoch": 0.42011, + "grad_norm": 0.984545652164254, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 42011 + }, + { + "epoch": 0.42012, + "grad_norm": 0.9614620586932181, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 42012 + }, + { + "epoch": 0.42013, + "grad_norm": 1.005096199023985, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 42013 + }, + { + "epoch": 0.42014, + "grad_norm": 1.1103947032062083, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 42014 + }, + { + "epoch": 0.42015, + "grad_norm": 0.9914205164710246, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 42015 + }, + { + "epoch": 0.42016, + "grad_norm": 0.9935681725283437, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 42016 + }, + { + "epoch": 0.42017, + "grad_norm": 1.0148819383117236, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 42017 + }, + { + "epoch": 0.42018, + "grad_norm": 1.1206282115765327, + "learning_rate": 0.003, + "loss": 4.0879, + "step": 42018 + }, + { + "epoch": 0.42019, + "grad_norm": 1.019216571208788, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 42019 + }, + { + "epoch": 0.4202, + "grad_norm": 0.9231325308871585, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 42020 + }, + { + "epoch": 0.42021, + "grad_norm": 0.8177801234605974, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 42021 + }, + { + "epoch": 0.42022, + "grad_norm": 0.7766226974530497, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 42022 + }, + { + "epoch": 0.42023, + "grad_norm": 0.7439954907051535, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 42023 + }, + { + "epoch": 0.42024, + "grad_norm": 0.7709173806870439, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 42024 + }, + { + "epoch": 0.42025, + "grad_norm": 0.74855219981498, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 42025 + }, + { + "epoch": 0.42026, + "grad_norm": 0.689010463060341, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 42026 + }, + { + "epoch": 0.42027, + "grad_norm": 0.6023748730408909, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 42027 + }, + { + "epoch": 0.42028, + "grad_norm": 0.5420430282552462, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 42028 + }, + { + "epoch": 0.42029, + "grad_norm": 0.5004965760092419, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 42029 + }, + { + "epoch": 0.4203, + "grad_norm": 0.5075604576960918, + "learning_rate": 0.003, + "loss": 4.022, + "step": 42030 + }, + { + "epoch": 0.42031, + "grad_norm": 0.510535115242062, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 42031 + }, + { + "epoch": 0.42032, + "grad_norm": 0.6788880461194359, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 42032 + }, + { + "epoch": 0.42033, + "grad_norm": 1.013510008512796, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 42033 + }, + { + "epoch": 0.42034, + "grad_norm": 1.3896792482925084, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 42034 + }, + { + "epoch": 0.42035, + "grad_norm": 0.5378629042378291, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 42035 + }, + { + "epoch": 0.42036, + "grad_norm": 0.9553841051570785, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 42036 + }, + { + "epoch": 0.42037, + "grad_norm": 1.1188578113717955, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 42037 + }, + { + "epoch": 0.42038, + "grad_norm": 0.7285709382296578, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 42038 + }, + { + "epoch": 0.42039, + "grad_norm": 0.6476228662498374, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 42039 + }, + { + "epoch": 0.4204, + "grad_norm": 0.7151186946277315, + "learning_rate": 0.003, + "loss": 4.037, + "step": 42040 + }, + { + "epoch": 0.42041, + "grad_norm": 0.7844311552442457, + "learning_rate": 0.003, + "loss": 4.0001, + "step": 42041 + }, + { + "epoch": 0.42042, + "grad_norm": 0.8983785532867801, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 42042 + }, + { + "epoch": 0.42043, + "grad_norm": 0.9933733318705638, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 42043 + }, + { + "epoch": 0.42044, + "grad_norm": 1.0197375181900283, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 42044 + }, + { + "epoch": 0.42045, + "grad_norm": 0.8760309524803269, + "learning_rate": 0.003, + "loss": 4.022, + "step": 42045 + }, + { + "epoch": 0.42046, + "grad_norm": 0.8438446321604072, + "learning_rate": 0.003, + "loss": 3.996, + "step": 42046 + }, + { + "epoch": 0.42047, + "grad_norm": 0.82300985451618, + "learning_rate": 0.003, + "loss": 4.0618, + "step": 42047 + }, + { + "epoch": 0.42048, + "grad_norm": 0.7894917736369235, + "learning_rate": 0.003, + "loss": 3.994, + "step": 42048 + }, + { + "epoch": 0.42049, + "grad_norm": 0.6685344216147173, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 42049 + }, + { + "epoch": 0.4205, + "grad_norm": 0.724164113504283, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 42050 + }, + { + "epoch": 0.42051, + "grad_norm": 0.7835283790360806, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 42051 + }, + { + "epoch": 0.42052, + "grad_norm": 0.8012918307590602, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 42052 + }, + { + "epoch": 0.42053, + "grad_norm": 0.9020066965155493, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 42053 + }, + { + "epoch": 0.42054, + "grad_norm": 0.9111267269437391, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 42054 + }, + { + "epoch": 0.42055, + "grad_norm": 0.8598147920461698, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 42055 + }, + { + "epoch": 0.42056, + "grad_norm": 0.7848740898256867, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 42056 + }, + { + "epoch": 0.42057, + "grad_norm": 0.7074656978044334, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 42057 + }, + { + "epoch": 0.42058, + "grad_norm": 0.7211689845463382, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 42058 + }, + { + "epoch": 0.42059, + "grad_norm": 0.747055997392695, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 42059 + }, + { + "epoch": 0.4206, + "grad_norm": 0.8153334314645313, + "learning_rate": 0.003, + "loss": 4.037, + "step": 42060 + }, + { + "epoch": 0.42061, + "grad_norm": 0.7805246087719881, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 42061 + }, + { + "epoch": 0.42062, + "grad_norm": 0.7736443458585652, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 42062 + }, + { + "epoch": 0.42063, + "grad_norm": 0.8873830094766861, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 42063 + }, + { + "epoch": 0.42064, + "grad_norm": 1.05403505719642, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 42064 + }, + { + "epoch": 0.42065, + "grad_norm": 1.2037815466304198, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 42065 + }, + { + "epoch": 0.42066, + "grad_norm": 0.9606098197284312, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 42066 + }, + { + "epoch": 0.42067, + "grad_norm": 0.985698737368325, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 42067 + }, + { + "epoch": 0.42068, + "grad_norm": 0.9585107777872764, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 42068 + }, + { + "epoch": 0.42069, + "grad_norm": 1.0003603546807498, + "learning_rate": 0.003, + "loss": 4.056, + "step": 42069 + }, + { + "epoch": 0.4207, + "grad_norm": 1.083813684732228, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 42070 + }, + { + "epoch": 0.42071, + "grad_norm": 0.8261953351060758, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 42071 + }, + { + "epoch": 0.42072, + "grad_norm": 0.8754622624322653, + "learning_rate": 0.003, + "loss": 4.013, + "step": 42072 + }, + { + "epoch": 0.42073, + "grad_norm": 0.8106040240170085, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 42073 + }, + { + "epoch": 0.42074, + "grad_norm": 0.8275470633152185, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 42074 + }, + { + "epoch": 0.42075, + "grad_norm": 0.8787817316826854, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 42075 + }, + { + "epoch": 0.42076, + "grad_norm": 0.8610153752669037, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 42076 + }, + { + "epoch": 0.42077, + "grad_norm": 0.9849036774225622, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 42077 + }, + { + "epoch": 0.42078, + "grad_norm": 0.9825135617113048, + "learning_rate": 0.003, + "loss": 4.037, + "step": 42078 + }, + { + "epoch": 0.42079, + "grad_norm": 1.0524674576356057, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 42079 + }, + { + "epoch": 0.4208, + "grad_norm": 1.2457395264402704, + "learning_rate": 0.003, + "loss": 4.041, + "step": 42080 + }, + { + "epoch": 0.42081, + "grad_norm": 0.8418271665554756, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 42081 + }, + { + "epoch": 0.42082, + "grad_norm": 0.7196348767376056, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 42082 + }, + { + "epoch": 0.42083, + "grad_norm": 0.6270762207006718, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 42083 + }, + { + "epoch": 0.42084, + "grad_norm": 0.5274149923493691, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 42084 + }, + { + "epoch": 0.42085, + "grad_norm": 0.5391890115690697, + "learning_rate": 0.003, + "loss": 3.9918, + "step": 42085 + }, + { + "epoch": 0.42086, + "grad_norm": 0.5718250426752649, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 42086 + }, + { + "epoch": 0.42087, + "grad_norm": 0.6672324454466033, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 42087 + }, + { + "epoch": 0.42088, + "grad_norm": 0.9032909081923272, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 42088 + }, + { + "epoch": 0.42089, + "grad_norm": 0.9512540188103892, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 42089 + }, + { + "epoch": 0.4209, + "grad_norm": 0.8419964437388158, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 42090 + }, + { + "epoch": 0.42091, + "grad_norm": 0.854735780524881, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 42091 + }, + { + "epoch": 0.42092, + "grad_norm": 0.9397960988296157, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 42092 + }, + { + "epoch": 0.42093, + "grad_norm": 0.9049494180709922, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 42093 + }, + { + "epoch": 0.42094, + "grad_norm": 1.0527188580527402, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 42094 + }, + { + "epoch": 0.42095, + "grad_norm": 1.0000914445970517, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 42095 + }, + { + "epoch": 0.42096, + "grad_norm": 1.087981793856453, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 42096 + }, + { + "epoch": 0.42097, + "grad_norm": 0.9639876309259559, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 42097 + }, + { + "epoch": 0.42098, + "grad_norm": 0.8898749851120209, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 42098 + }, + { + "epoch": 0.42099, + "grad_norm": 0.753966835266764, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 42099 + }, + { + "epoch": 0.421, + "grad_norm": 0.6879027285663327, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 42100 + }, + { + "epoch": 0.42101, + "grad_norm": 0.5910447501978465, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 42101 + }, + { + "epoch": 0.42102, + "grad_norm": 0.5401366891852072, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 42102 + }, + { + "epoch": 0.42103, + "grad_norm": 0.6806640405801146, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 42103 + }, + { + "epoch": 0.42104, + "grad_norm": 0.8424566614583775, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 42104 + }, + { + "epoch": 0.42105, + "grad_norm": 0.8311594100212679, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 42105 + }, + { + "epoch": 0.42106, + "grad_norm": 0.8210529622447338, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 42106 + }, + { + "epoch": 0.42107, + "grad_norm": 0.8609789924661565, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 42107 + }, + { + "epoch": 0.42108, + "grad_norm": 0.8492525722682877, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 42108 + }, + { + "epoch": 0.42109, + "grad_norm": 0.6760683520675401, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 42109 + }, + { + "epoch": 0.4211, + "grad_norm": 0.5332478009961675, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 42110 + }, + { + "epoch": 0.42111, + "grad_norm": 0.5662457824466595, + "learning_rate": 0.003, + "loss": 3.9977, + "step": 42111 + }, + { + "epoch": 0.42112, + "grad_norm": 0.5639172504578692, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 42112 + }, + { + "epoch": 0.42113, + "grad_norm": 0.5873697085355022, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 42113 + }, + { + "epoch": 0.42114, + "grad_norm": 0.6338507410407453, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 42114 + }, + { + "epoch": 0.42115, + "grad_norm": 0.7506788376838254, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 42115 + }, + { + "epoch": 0.42116, + "grad_norm": 0.8024028681757185, + "learning_rate": 0.003, + "loss": 4.068, + "step": 42116 + }, + { + "epoch": 0.42117, + "grad_norm": 0.7826388111819911, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 42117 + }, + { + "epoch": 0.42118, + "grad_norm": 0.7251470106828858, + "learning_rate": 0.003, + "loss": 4.056, + "step": 42118 + }, + { + "epoch": 0.42119, + "grad_norm": 0.755134603066978, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 42119 + }, + { + "epoch": 0.4212, + "grad_norm": 0.7205120221782702, + "learning_rate": 0.003, + "loss": 4.045, + "step": 42120 + }, + { + "epoch": 0.42121, + "grad_norm": 0.7299228515289429, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 42121 + }, + { + "epoch": 0.42122, + "grad_norm": 0.6872817214672333, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 42122 + }, + { + "epoch": 0.42123, + "grad_norm": 0.6534041187233443, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 42123 + }, + { + "epoch": 0.42124, + "grad_norm": 0.6006181950535462, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 42124 + }, + { + "epoch": 0.42125, + "grad_norm": 0.6183679828490305, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 42125 + }, + { + "epoch": 0.42126, + "grad_norm": 0.5676753946109382, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 42126 + }, + { + "epoch": 0.42127, + "grad_norm": 0.4881680379066254, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 42127 + }, + { + "epoch": 0.42128, + "grad_norm": 0.6133884225446551, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 42128 + }, + { + "epoch": 0.42129, + "grad_norm": 0.8048616969421194, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 42129 + }, + { + "epoch": 0.4213, + "grad_norm": 1.0781871524699964, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 42130 + }, + { + "epoch": 0.42131, + "grad_norm": 1.0154561604172485, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 42131 + }, + { + "epoch": 0.42132, + "grad_norm": 0.9890739397382381, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 42132 + }, + { + "epoch": 0.42133, + "grad_norm": 0.9858417968879188, + "learning_rate": 0.003, + "loss": 4.0013, + "step": 42133 + }, + { + "epoch": 0.42134, + "grad_norm": 1.1274614145843902, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 42134 + }, + { + "epoch": 0.42135, + "grad_norm": 1.0686397085222563, + "learning_rate": 0.003, + "loss": 4.044, + "step": 42135 + }, + { + "epoch": 0.42136, + "grad_norm": 1.2105058781372788, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 42136 + }, + { + "epoch": 0.42137, + "grad_norm": 1.0369012187764477, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 42137 + }, + { + "epoch": 0.42138, + "grad_norm": 1.1453313659950457, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 42138 + }, + { + "epoch": 0.42139, + "grad_norm": 1.0175782506128492, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 42139 + }, + { + "epoch": 0.4214, + "grad_norm": 0.94764792166741, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 42140 + }, + { + "epoch": 0.42141, + "grad_norm": 1.0032282223902314, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 42141 + }, + { + "epoch": 0.42142, + "grad_norm": 0.9838174891374621, + "learning_rate": 0.003, + "loss": 4.075, + "step": 42142 + }, + { + "epoch": 0.42143, + "grad_norm": 1.0268555918395383, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 42143 + }, + { + "epoch": 0.42144, + "grad_norm": 0.9023303046803622, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 42144 + }, + { + "epoch": 0.42145, + "grad_norm": 0.7839003938860185, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 42145 + }, + { + "epoch": 0.42146, + "grad_norm": 0.7119822366069897, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 42146 + }, + { + "epoch": 0.42147, + "grad_norm": 0.6159694755792987, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 42147 + }, + { + "epoch": 0.42148, + "grad_norm": 0.5812471669973999, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 42148 + }, + { + "epoch": 0.42149, + "grad_norm": 0.5459911256421357, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 42149 + }, + { + "epoch": 0.4215, + "grad_norm": 0.5866048937099763, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 42150 + }, + { + "epoch": 0.42151, + "grad_norm": 0.5594778842889748, + "learning_rate": 0.003, + "loss": 3.9929, + "step": 42151 + }, + { + "epoch": 0.42152, + "grad_norm": 0.5678991313357155, + "learning_rate": 0.003, + "loss": 3.9947, + "step": 42152 + }, + { + "epoch": 0.42153, + "grad_norm": 0.6482044897899688, + "learning_rate": 0.003, + "loss": 4.001, + "step": 42153 + }, + { + "epoch": 0.42154, + "grad_norm": 0.8814761485275583, + "learning_rate": 0.003, + "loss": 4.006, + "step": 42154 + }, + { + "epoch": 0.42155, + "grad_norm": 1.0801985697380792, + "learning_rate": 0.003, + "loss": 4.0046, + "step": 42155 + }, + { + "epoch": 0.42156, + "grad_norm": 1.1587731314741805, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 42156 + }, + { + "epoch": 0.42157, + "grad_norm": 0.9805832814088576, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 42157 + }, + { + "epoch": 0.42158, + "grad_norm": 0.8981078109657122, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 42158 + }, + { + "epoch": 0.42159, + "grad_norm": 1.00904623062928, + "learning_rate": 0.003, + "loss": 4.033, + "step": 42159 + }, + { + "epoch": 0.4216, + "grad_norm": 0.9985075399470565, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 42160 + }, + { + "epoch": 0.42161, + "grad_norm": 0.8875571783020896, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 42161 + }, + { + "epoch": 0.42162, + "grad_norm": 0.7819358968612635, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 42162 + }, + { + "epoch": 0.42163, + "grad_norm": 0.6951940604737515, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 42163 + }, + { + "epoch": 0.42164, + "grad_norm": 0.6634415948300094, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 42164 + }, + { + "epoch": 0.42165, + "grad_norm": 0.6474639556368746, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 42165 + }, + { + "epoch": 0.42166, + "grad_norm": 0.7307160128656562, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 42166 + }, + { + "epoch": 0.42167, + "grad_norm": 0.8341650458233207, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 42167 + }, + { + "epoch": 0.42168, + "grad_norm": 0.8777085345035041, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 42168 + }, + { + "epoch": 0.42169, + "grad_norm": 0.9903390216833626, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 42169 + }, + { + "epoch": 0.4217, + "grad_norm": 1.1226313068728029, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 42170 + }, + { + "epoch": 0.42171, + "grad_norm": 0.8956298959142523, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 42171 + }, + { + "epoch": 0.42172, + "grad_norm": 0.753292140057193, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 42172 + }, + { + "epoch": 0.42173, + "grad_norm": 0.7521112452040204, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 42173 + }, + { + "epoch": 0.42174, + "grad_norm": 0.7850775473382092, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 42174 + }, + { + "epoch": 0.42175, + "grad_norm": 0.7883535007378688, + "learning_rate": 0.003, + "loss": 4.047, + "step": 42175 + }, + { + "epoch": 0.42176, + "grad_norm": 0.8318374753808436, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 42176 + }, + { + "epoch": 0.42177, + "grad_norm": 0.8623448711453293, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 42177 + }, + { + "epoch": 0.42178, + "grad_norm": 0.8165544121292538, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 42178 + }, + { + "epoch": 0.42179, + "grad_norm": 0.7688420372421326, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 42179 + }, + { + "epoch": 0.4218, + "grad_norm": 0.8352456251863294, + "learning_rate": 0.003, + "loss": 4.049, + "step": 42180 + }, + { + "epoch": 0.42181, + "grad_norm": 0.9130241154684524, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 42181 + }, + { + "epoch": 0.42182, + "grad_norm": 0.9589267991583234, + "learning_rate": 0.003, + "loss": 4.05, + "step": 42182 + }, + { + "epoch": 0.42183, + "grad_norm": 0.9682713147823573, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 42183 + }, + { + "epoch": 0.42184, + "grad_norm": 1.075070079797686, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 42184 + }, + { + "epoch": 0.42185, + "grad_norm": 0.9932801173650144, + "learning_rate": 0.003, + "loss": 4.0808, + "step": 42185 + }, + { + "epoch": 0.42186, + "grad_norm": 0.8799854142159389, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 42186 + }, + { + "epoch": 0.42187, + "grad_norm": 0.8462843292039401, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 42187 + }, + { + "epoch": 0.42188, + "grad_norm": 0.7736721253727248, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 42188 + }, + { + "epoch": 0.42189, + "grad_norm": 0.85804636771316, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 42189 + }, + { + "epoch": 0.4219, + "grad_norm": 0.7402399082828026, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 42190 + }, + { + "epoch": 0.42191, + "grad_norm": 0.6753163052122857, + "learning_rate": 0.003, + "loss": 4.023, + "step": 42191 + }, + { + "epoch": 0.42192, + "grad_norm": 0.6632331687448274, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 42192 + }, + { + "epoch": 0.42193, + "grad_norm": 0.6979039897040211, + "learning_rate": 0.003, + "loss": 3.9875, + "step": 42193 + }, + { + "epoch": 0.42194, + "grad_norm": 0.7646494653629285, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 42194 + }, + { + "epoch": 0.42195, + "grad_norm": 0.9061448945042567, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 42195 + }, + { + "epoch": 0.42196, + "grad_norm": 1.1054493036222408, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 42196 + }, + { + "epoch": 0.42197, + "grad_norm": 1.0178253988772874, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 42197 + }, + { + "epoch": 0.42198, + "grad_norm": 0.9120060561133299, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 42198 + }, + { + "epoch": 0.42199, + "grad_norm": 0.8259166698660698, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 42199 + }, + { + "epoch": 0.422, + "grad_norm": 0.7344165536852996, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 42200 + }, + { + "epoch": 0.42201, + "grad_norm": 0.8566790945840597, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 42201 + }, + { + "epoch": 0.42202, + "grad_norm": 1.0677188368332124, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 42202 + }, + { + "epoch": 0.42203, + "grad_norm": 0.8724280999076303, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 42203 + }, + { + "epoch": 0.42204, + "grad_norm": 0.6746931075436439, + "learning_rate": 0.003, + "loss": 4.044, + "step": 42204 + }, + { + "epoch": 0.42205, + "grad_norm": 0.7145588537981962, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 42205 + }, + { + "epoch": 0.42206, + "grad_norm": 0.8026865295592651, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 42206 + }, + { + "epoch": 0.42207, + "grad_norm": 0.8896759552818694, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 42207 + }, + { + "epoch": 0.42208, + "grad_norm": 0.8307044228174557, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 42208 + }, + { + "epoch": 0.42209, + "grad_norm": 0.6662250367256393, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 42209 + }, + { + "epoch": 0.4221, + "grad_norm": 0.6131039552752086, + "learning_rate": 0.003, + "loss": 4.035, + "step": 42210 + }, + { + "epoch": 0.42211, + "grad_norm": 0.5646849515486823, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 42211 + }, + { + "epoch": 0.42212, + "grad_norm": 0.5268336643390639, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 42212 + }, + { + "epoch": 0.42213, + "grad_norm": 0.5812577595040402, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 42213 + }, + { + "epoch": 0.42214, + "grad_norm": 0.7561475163414622, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 42214 + }, + { + "epoch": 0.42215, + "grad_norm": 1.0017668528506296, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 42215 + }, + { + "epoch": 0.42216, + "grad_norm": 1.1088294658773956, + "learning_rate": 0.003, + "loss": 3.9927, + "step": 42216 + }, + { + "epoch": 0.42217, + "grad_norm": 0.783060126054813, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 42217 + }, + { + "epoch": 0.42218, + "grad_norm": 0.7040089717917334, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 42218 + }, + { + "epoch": 0.42219, + "grad_norm": 0.6838654313704097, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 42219 + }, + { + "epoch": 0.4222, + "grad_norm": 0.7362340628028471, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 42220 + }, + { + "epoch": 0.42221, + "grad_norm": 0.7814517231720748, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 42221 + }, + { + "epoch": 0.42222, + "grad_norm": 0.7694556686842059, + "learning_rate": 0.003, + "loss": 4.009, + "step": 42222 + }, + { + "epoch": 0.42223, + "grad_norm": 0.7506857330926244, + "learning_rate": 0.003, + "loss": 4.027, + "step": 42223 + }, + { + "epoch": 0.42224, + "grad_norm": 0.7783672823043405, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 42224 + }, + { + "epoch": 0.42225, + "grad_norm": 0.7311335847987079, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 42225 + }, + { + "epoch": 0.42226, + "grad_norm": 0.6676103203776677, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 42226 + }, + { + "epoch": 0.42227, + "grad_norm": 0.8389668177842909, + "learning_rate": 0.003, + "loss": 4.005, + "step": 42227 + }, + { + "epoch": 0.42228, + "grad_norm": 0.9583960454359495, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 42228 + }, + { + "epoch": 0.42229, + "grad_norm": 1.0725316744362015, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 42229 + }, + { + "epoch": 0.4223, + "grad_norm": 1.1420564786991647, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 42230 + }, + { + "epoch": 0.42231, + "grad_norm": 0.819091560707162, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 42231 + }, + { + "epoch": 0.42232, + "grad_norm": 0.76583536784047, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 42232 + }, + { + "epoch": 0.42233, + "grad_norm": 0.9335499632279979, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 42233 + }, + { + "epoch": 0.42234, + "grad_norm": 1.021998050448154, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 42234 + }, + { + "epoch": 0.42235, + "grad_norm": 1.0255864905364047, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 42235 + }, + { + "epoch": 0.42236, + "grad_norm": 1.0309298711242425, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 42236 + }, + { + "epoch": 0.42237, + "grad_norm": 0.8993150901220263, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 42237 + }, + { + "epoch": 0.42238, + "grad_norm": 0.8706293297697484, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 42238 + }, + { + "epoch": 0.42239, + "grad_norm": 0.9471531926589151, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 42239 + }, + { + "epoch": 0.4224, + "grad_norm": 1.0865518547764912, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 42240 + }, + { + "epoch": 0.42241, + "grad_norm": 1.108210100611505, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 42241 + }, + { + "epoch": 0.42242, + "grad_norm": 0.9415822119242043, + "learning_rate": 0.003, + "loss": 4.031, + "step": 42242 + }, + { + "epoch": 0.42243, + "grad_norm": 0.8563114073479992, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 42243 + }, + { + "epoch": 0.42244, + "grad_norm": 0.7495458936988013, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 42244 + }, + { + "epoch": 0.42245, + "grad_norm": 0.7513164322799831, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 42245 + }, + { + "epoch": 0.42246, + "grad_norm": 0.7221343133125891, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 42246 + }, + { + "epoch": 0.42247, + "grad_norm": 0.7949028116243089, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 42247 + }, + { + "epoch": 0.42248, + "grad_norm": 1.0682420095930494, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 42248 + }, + { + "epoch": 0.42249, + "grad_norm": 1.147792698207752, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 42249 + }, + { + "epoch": 0.4225, + "grad_norm": 0.988874743506778, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 42250 + }, + { + "epoch": 0.42251, + "grad_norm": 1.167132812706382, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 42251 + }, + { + "epoch": 0.42252, + "grad_norm": 0.8499953386881186, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 42252 + }, + { + "epoch": 0.42253, + "grad_norm": 0.620801619629547, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 42253 + }, + { + "epoch": 0.42254, + "grad_norm": 0.6698323712062276, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 42254 + }, + { + "epoch": 0.42255, + "grad_norm": 0.7113639832960074, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 42255 + }, + { + "epoch": 0.42256, + "grad_norm": 0.839284983034719, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 42256 + }, + { + "epoch": 0.42257, + "grad_norm": 0.8544643713447424, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 42257 + }, + { + "epoch": 0.42258, + "grad_norm": 1.0089919759202381, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 42258 + }, + { + "epoch": 0.42259, + "grad_norm": 1.0704748267139466, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 42259 + }, + { + "epoch": 0.4226, + "grad_norm": 0.8951979993009543, + "learning_rate": 0.003, + "loss": 3.9957, + "step": 42260 + }, + { + "epoch": 0.42261, + "grad_norm": 0.7388134959747533, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 42261 + }, + { + "epoch": 0.42262, + "grad_norm": 0.7010699516819788, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 42262 + }, + { + "epoch": 0.42263, + "grad_norm": 0.7883253183881351, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 42263 + }, + { + "epoch": 0.42264, + "grad_norm": 0.7836446501759192, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 42264 + }, + { + "epoch": 0.42265, + "grad_norm": 0.6409124870435207, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 42265 + }, + { + "epoch": 0.42266, + "grad_norm": 0.6040722857488507, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 42266 + }, + { + "epoch": 0.42267, + "grad_norm": 0.6215599838948745, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 42267 + }, + { + "epoch": 0.42268, + "grad_norm": 0.5297157173590168, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 42268 + }, + { + "epoch": 0.42269, + "grad_norm": 0.5599125116677364, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 42269 + }, + { + "epoch": 0.4227, + "grad_norm": 0.5224502388464981, + "learning_rate": 0.003, + "loss": 3.9972, + "step": 42270 + }, + { + "epoch": 0.42271, + "grad_norm": 0.519884416991614, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 42271 + }, + { + "epoch": 0.42272, + "grad_norm": 0.5118450098113696, + "learning_rate": 0.003, + "loss": 3.997, + "step": 42272 + }, + { + "epoch": 0.42273, + "grad_norm": 0.5340526543986647, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 42273 + }, + { + "epoch": 0.42274, + "grad_norm": 0.5417726665649455, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 42274 + }, + { + "epoch": 0.42275, + "grad_norm": 0.6503576510692469, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 42275 + }, + { + "epoch": 0.42276, + "grad_norm": 0.8440594639410629, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 42276 + }, + { + "epoch": 0.42277, + "grad_norm": 1.1701797192757917, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 42277 + }, + { + "epoch": 0.42278, + "grad_norm": 0.8745885791167923, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 42278 + }, + { + "epoch": 0.42279, + "grad_norm": 0.7985748644548724, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 42279 + }, + { + "epoch": 0.4228, + "grad_norm": 0.9937873715415451, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 42280 + }, + { + "epoch": 0.42281, + "grad_norm": 1.0823672563959854, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 42281 + }, + { + "epoch": 0.42282, + "grad_norm": 0.8611897754456992, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 42282 + }, + { + "epoch": 0.42283, + "grad_norm": 0.8130608565457045, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 42283 + }, + { + "epoch": 0.42284, + "grad_norm": 0.7076822916321458, + "learning_rate": 0.003, + "loss": 3.9977, + "step": 42284 + }, + { + "epoch": 0.42285, + "grad_norm": 0.8551529752481204, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 42285 + }, + { + "epoch": 0.42286, + "grad_norm": 0.9906368535172565, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 42286 + }, + { + "epoch": 0.42287, + "grad_norm": 1.113699544331174, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 42287 + }, + { + "epoch": 0.42288, + "grad_norm": 1.015477203239415, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 42288 + }, + { + "epoch": 0.42289, + "grad_norm": 1.1415899527276494, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 42289 + }, + { + "epoch": 0.4229, + "grad_norm": 0.8085055577511707, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 42290 + }, + { + "epoch": 0.42291, + "grad_norm": 0.8181903037633804, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 42291 + }, + { + "epoch": 0.42292, + "grad_norm": 0.7633002917032319, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 42292 + }, + { + "epoch": 0.42293, + "grad_norm": 0.7237252505042606, + "learning_rate": 0.003, + "loss": 4.046, + "step": 42293 + }, + { + "epoch": 0.42294, + "grad_norm": 0.7947135558043286, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 42294 + }, + { + "epoch": 0.42295, + "grad_norm": 0.7572644611775595, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 42295 + }, + { + "epoch": 0.42296, + "grad_norm": 0.9004804722347403, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 42296 + }, + { + "epoch": 0.42297, + "grad_norm": 0.9335233116958055, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 42297 + }, + { + "epoch": 0.42298, + "grad_norm": 1.0502458929834864, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 42298 + }, + { + "epoch": 0.42299, + "grad_norm": 1.043966060039579, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 42299 + }, + { + "epoch": 0.423, + "grad_norm": 1.021170754510207, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 42300 + }, + { + "epoch": 0.42301, + "grad_norm": 0.8010639110161863, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 42301 + }, + { + "epoch": 0.42302, + "grad_norm": 0.7878751277194049, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 42302 + }, + { + "epoch": 0.42303, + "grad_norm": 0.8528408839369642, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 42303 + }, + { + "epoch": 0.42304, + "grad_norm": 0.7665702573462209, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 42304 + }, + { + "epoch": 0.42305, + "grad_norm": 0.6983895616923831, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 42305 + }, + { + "epoch": 0.42306, + "grad_norm": 0.5637986933384399, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 42306 + }, + { + "epoch": 0.42307, + "grad_norm": 0.6225703001273585, + "learning_rate": 0.003, + "loss": 4.047, + "step": 42307 + }, + { + "epoch": 0.42308, + "grad_norm": 0.6462355792031372, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 42308 + }, + { + "epoch": 0.42309, + "grad_norm": 0.674341175083342, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 42309 + }, + { + "epoch": 0.4231, + "grad_norm": 0.8051961243709318, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 42310 + }, + { + "epoch": 0.42311, + "grad_norm": 0.9681815710124625, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 42311 + }, + { + "epoch": 0.42312, + "grad_norm": 1.0444376212809865, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 42312 + }, + { + "epoch": 0.42313, + "grad_norm": 0.914826876104357, + "learning_rate": 0.003, + "loss": 4.037, + "step": 42313 + }, + { + "epoch": 0.42314, + "grad_norm": 0.9222734665407653, + "learning_rate": 0.003, + "loss": 4.062, + "step": 42314 + }, + { + "epoch": 0.42315, + "grad_norm": 0.8133590759719447, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 42315 + }, + { + "epoch": 0.42316, + "grad_norm": 0.8447205938429041, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 42316 + }, + { + "epoch": 0.42317, + "grad_norm": 0.806532094907084, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 42317 + }, + { + "epoch": 0.42318, + "grad_norm": 0.8184488297669389, + "learning_rate": 0.003, + "loss": 4.0754, + "step": 42318 + }, + { + "epoch": 0.42319, + "grad_norm": 0.8392702914495125, + "learning_rate": 0.003, + "loss": 4.026, + "step": 42319 + }, + { + "epoch": 0.4232, + "grad_norm": 0.9205055008398197, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 42320 + }, + { + "epoch": 0.42321, + "grad_norm": 1.066321170454744, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 42321 + }, + { + "epoch": 0.42322, + "grad_norm": 1.1794943244081213, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 42322 + }, + { + "epoch": 0.42323, + "grad_norm": 0.8345900750768785, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 42323 + }, + { + "epoch": 0.42324, + "grad_norm": 0.741898510688659, + "learning_rate": 0.003, + "loss": 4.04, + "step": 42324 + }, + { + "epoch": 0.42325, + "grad_norm": 0.7646370702067571, + "learning_rate": 0.003, + "loss": 4.026, + "step": 42325 + }, + { + "epoch": 0.42326, + "grad_norm": 0.6842995680724465, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 42326 + }, + { + "epoch": 0.42327, + "grad_norm": 0.6758086374101091, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 42327 + }, + { + "epoch": 0.42328, + "grad_norm": 0.633112328905323, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 42328 + }, + { + "epoch": 0.42329, + "grad_norm": 0.5696150996435171, + "learning_rate": 0.003, + "loss": 3.9949, + "step": 42329 + }, + { + "epoch": 0.4233, + "grad_norm": 0.6333107332882855, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 42330 + }, + { + "epoch": 0.42331, + "grad_norm": 0.6554287670575712, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 42331 + }, + { + "epoch": 0.42332, + "grad_norm": 0.7708146888904954, + "learning_rate": 0.003, + "loss": 4.012, + "step": 42332 + }, + { + "epoch": 0.42333, + "grad_norm": 1.028870334796542, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 42333 + }, + { + "epoch": 0.42334, + "grad_norm": 1.2458127863173136, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 42334 + }, + { + "epoch": 0.42335, + "grad_norm": 0.5847655656650821, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 42335 + }, + { + "epoch": 0.42336, + "grad_norm": 0.6750911857975583, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 42336 + }, + { + "epoch": 0.42337, + "grad_norm": 0.6917672020397732, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 42337 + }, + { + "epoch": 0.42338, + "grad_norm": 0.6525397229746995, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 42338 + }, + { + "epoch": 0.42339, + "grad_norm": 0.6023405888067972, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 42339 + }, + { + "epoch": 0.4234, + "grad_norm": 0.653706672494862, + "learning_rate": 0.003, + "loss": 3.9996, + "step": 42340 + }, + { + "epoch": 0.42341, + "grad_norm": 0.5898345018169024, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 42341 + }, + { + "epoch": 0.42342, + "grad_norm": 0.6060884791518192, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 42342 + }, + { + "epoch": 0.42343, + "grad_norm": 0.7227697980087255, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 42343 + }, + { + "epoch": 0.42344, + "grad_norm": 0.8937768475283104, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 42344 + }, + { + "epoch": 0.42345, + "grad_norm": 1.1453088049643991, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 42345 + }, + { + "epoch": 0.42346, + "grad_norm": 1.083841415903102, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 42346 + }, + { + "epoch": 0.42347, + "grad_norm": 1.1647297410749287, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 42347 + }, + { + "epoch": 0.42348, + "grad_norm": 0.8704534188148777, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 42348 + }, + { + "epoch": 0.42349, + "grad_norm": 0.7694806380006943, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 42349 + }, + { + "epoch": 0.4235, + "grad_norm": 0.7474465036919293, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 42350 + }, + { + "epoch": 0.42351, + "grad_norm": 0.7796023698240027, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 42351 + }, + { + "epoch": 0.42352, + "grad_norm": 0.8327998878935159, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 42352 + }, + { + "epoch": 0.42353, + "grad_norm": 0.8674031299839134, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 42353 + }, + { + "epoch": 0.42354, + "grad_norm": 0.8870433671889779, + "learning_rate": 0.003, + "loss": 4.024, + "step": 42354 + }, + { + "epoch": 0.42355, + "grad_norm": 1.0578295300016982, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 42355 + }, + { + "epoch": 0.42356, + "grad_norm": 1.2904333086398054, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 42356 + }, + { + "epoch": 0.42357, + "grad_norm": 0.815283444736957, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 42357 + }, + { + "epoch": 0.42358, + "grad_norm": 0.8147785816624067, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 42358 + }, + { + "epoch": 0.42359, + "grad_norm": 0.8453720986876486, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 42359 + }, + { + "epoch": 0.4236, + "grad_norm": 0.8681646575675772, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 42360 + }, + { + "epoch": 0.42361, + "grad_norm": 0.8488254284782161, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 42361 + }, + { + "epoch": 0.42362, + "grad_norm": 0.8166340048922558, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 42362 + }, + { + "epoch": 0.42363, + "grad_norm": 0.8485851473388039, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 42363 + }, + { + "epoch": 0.42364, + "grad_norm": 0.808746975046945, + "learning_rate": 0.003, + "loss": 4.0002, + "step": 42364 + }, + { + "epoch": 0.42365, + "grad_norm": 0.9277820618426524, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 42365 + }, + { + "epoch": 0.42366, + "grad_norm": 1.2249265116374641, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 42366 + }, + { + "epoch": 0.42367, + "grad_norm": 0.8893706601844622, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 42367 + }, + { + "epoch": 0.42368, + "grad_norm": 0.7848237899155894, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 42368 + }, + { + "epoch": 0.42369, + "grad_norm": 0.722160906427499, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 42369 + }, + { + "epoch": 0.4237, + "grad_norm": 0.6690481077978656, + "learning_rate": 0.003, + "loss": 3.9888, + "step": 42370 + }, + { + "epoch": 0.42371, + "grad_norm": 0.6835115896435219, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 42371 + }, + { + "epoch": 0.42372, + "grad_norm": 0.7756990148438306, + "learning_rate": 0.003, + "loss": 4.027, + "step": 42372 + }, + { + "epoch": 0.42373, + "grad_norm": 0.7505062568618761, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 42373 + }, + { + "epoch": 0.42374, + "grad_norm": 0.6381586959881276, + "learning_rate": 0.003, + "loss": 4.028, + "step": 42374 + }, + { + "epoch": 0.42375, + "grad_norm": 0.7117086077134295, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 42375 + }, + { + "epoch": 0.42376, + "grad_norm": 0.741137634308175, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 42376 + }, + { + "epoch": 0.42377, + "grad_norm": 0.7394754032148616, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 42377 + }, + { + "epoch": 0.42378, + "grad_norm": 0.7657384219612025, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 42378 + }, + { + "epoch": 0.42379, + "grad_norm": 0.8029628859728521, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 42379 + }, + { + "epoch": 0.4238, + "grad_norm": 0.8558267971467539, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 42380 + }, + { + "epoch": 0.42381, + "grad_norm": 0.9556022880165641, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 42381 + }, + { + "epoch": 0.42382, + "grad_norm": 1.0947452814028473, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 42382 + }, + { + "epoch": 0.42383, + "grad_norm": 1.052656055378761, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 42383 + }, + { + "epoch": 0.42384, + "grad_norm": 0.808074408540637, + "learning_rate": 0.003, + "loss": 4.064, + "step": 42384 + }, + { + "epoch": 0.42385, + "grad_norm": 0.7109935992049183, + "learning_rate": 0.003, + "loss": 3.9934, + "step": 42385 + }, + { + "epoch": 0.42386, + "grad_norm": 0.69673866263149, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 42386 + }, + { + "epoch": 0.42387, + "grad_norm": 0.6862318030266384, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 42387 + }, + { + "epoch": 0.42388, + "grad_norm": 0.772500022246715, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 42388 + }, + { + "epoch": 0.42389, + "grad_norm": 0.9115020928237012, + "learning_rate": 0.003, + "loss": 4.05, + "step": 42389 + }, + { + "epoch": 0.4239, + "grad_norm": 0.9342693083369471, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 42390 + }, + { + "epoch": 0.42391, + "grad_norm": 0.8836949406269209, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 42391 + }, + { + "epoch": 0.42392, + "grad_norm": 0.8249282490293781, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 42392 + }, + { + "epoch": 0.42393, + "grad_norm": 0.8854629193266067, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 42393 + }, + { + "epoch": 0.42394, + "grad_norm": 0.8625560206929512, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 42394 + }, + { + "epoch": 0.42395, + "grad_norm": 0.9109553580138733, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 42395 + }, + { + "epoch": 0.42396, + "grad_norm": 0.9105667676056414, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 42396 + }, + { + "epoch": 0.42397, + "grad_norm": 1.0781581129002575, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 42397 + }, + { + "epoch": 0.42398, + "grad_norm": 1.0500706020111528, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 42398 + }, + { + "epoch": 0.42399, + "grad_norm": 0.9341129950106157, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 42399 + }, + { + "epoch": 0.424, + "grad_norm": 0.9287945010577797, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 42400 + }, + { + "epoch": 0.42401, + "grad_norm": 0.8866770931885222, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 42401 + }, + { + "epoch": 0.42402, + "grad_norm": 0.8224138880082594, + "learning_rate": 0.003, + "loss": 4.03, + "step": 42402 + }, + { + "epoch": 0.42403, + "grad_norm": 0.7558340377951348, + "learning_rate": 0.003, + "loss": 3.9987, + "step": 42403 + }, + { + "epoch": 0.42404, + "grad_norm": 0.7807539795050799, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 42404 + }, + { + "epoch": 0.42405, + "grad_norm": 0.8658759177329493, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 42405 + }, + { + "epoch": 0.42406, + "grad_norm": 0.9939116795383045, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 42406 + }, + { + "epoch": 0.42407, + "grad_norm": 0.9557079977784808, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 42407 + }, + { + "epoch": 0.42408, + "grad_norm": 1.1215818718258939, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 42408 + }, + { + "epoch": 0.42409, + "grad_norm": 1.0802188934401589, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 42409 + }, + { + "epoch": 0.4241, + "grad_norm": 0.8049327911566865, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 42410 + }, + { + "epoch": 0.42411, + "grad_norm": 0.7453627053760342, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 42411 + }, + { + "epoch": 0.42412, + "grad_norm": 0.6435179702000821, + "learning_rate": 0.003, + "loss": 4.0711, + "step": 42412 + }, + { + "epoch": 0.42413, + "grad_norm": 0.5742554694371295, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 42413 + }, + { + "epoch": 0.42414, + "grad_norm": 0.6081662608084828, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 42414 + }, + { + "epoch": 0.42415, + "grad_norm": 0.6376446460016288, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 42415 + }, + { + "epoch": 0.42416, + "grad_norm": 0.7110061468153748, + "learning_rate": 0.003, + "loss": 3.9829, + "step": 42416 + }, + { + "epoch": 0.42417, + "grad_norm": 0.8589959276884837, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 42417 + }, + { + "epoch": 0.42418, + "grad_norm": 1.0161539903596435, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 42418 + }, + { + "epoch": 0.42419, + "grad_norm": 1.0054635975952788, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 42419 + }, + { + "epoch": 0.4242, + "grad_norm": 0.840281369529539, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 42420 + }, + { + "epoch": 0.42421, + "grad_norm": 0.7927193356618347, + "learning_rate": 0.003, + "loss": 4.03, + "step": 42421 + }, + { + "epoch": 0.42422, + "grad_norm": 0.7389062115099317, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 42422 + }, + { + "epoch": 0.42423, + "grad_norm": 0.7635202066899166, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 42423 + }, + { + "epoch": 0.42424, + "grad_norm": 0.7782946200521446, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 42424 + }, + { + "epoch": 0.42425, + "grad_norm": 0.7184478702928991, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 42425 + }, + { + "epoch": 0.42426, + "grad_norm": 0.7391999610293079, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 42426 + }, + { + "epoch": 0.42427, + "grad_norm": 0.8815474637200408, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 42427 + }, + { + "epoch": 0.42428, + "grad_norm": 0.8154128180050997, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 42428 + }, + { + "epoch": 0.42429, + "grad_norm": 0.6777810892696817, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 42429 + }, + { + "epoch": 0.4243, + "grad_norm": 0.6688608131878817, + "learning_rate": 0.003, + "loss": 3.994, + "step": 42430 + }, + { + "epoch": 0.42431, + "grad_norm": 0.6830248879289271, + "learning_rate": 0.003, + "loss": 3.9943, + "step": 42431 + }, + { + "epoch": 0.42432, + "grad_norm": 0.6795727401453447, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 42432 + }, + { + "epoch": 0.42433, + "grad_norm": 0.7055587811240899, + "learning_rate": 0.003, + "loss": 4.039, + "step": 42433 + }, + { + "epoch": 0.42434, + "grad_norm": 0.7748377912307337, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 42434 + }, + { + "epoch": 0.42435, + "grad_norm": 0.8386626715040119, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 42435 + }, + { + "epoch": 0.42436, + "grad_norm": 1.0564230094164733, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 42436 + }, + { + "epoch": 0.42437, + "grad_norm": 1.0333274077081536, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 42437 + }, + { + "epoch": 0.42438, + "grad_norm": 1.146722308248355, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 42438 + }, + { + "epoch": 0.42439, + "grad_norm": 0.907053479324825, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 42439 + }, + { + "epoch": 0.4244, + "grad_norm": 0.8788480655648603, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 42440 + }, + { + "epoch": 0.42441, + "grad_norm": 0.8383846415934799, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 42441 + }, + { + "epoch": 0.42442, + "grad_norm": 0.9530736424454204, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 42442 + }, + { + "epoch": 0.42443, + "grad_norm": 1.1989475237205434, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 42443 + }, + { + "epoch": 0.42444, + "grad_norm": 0.8687715207610984, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 42444 + }, + { + "epoch": 0.42445, + "grad_norm": 0.7359500025214698, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 42445 + }, + { + "epoch": 0.42446, + "grad_norm": 0.8041598257679909, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 42446 + }, + { + "epoch": 0.42447, + "grad_norm": 0.8342221274416906, + "learning_rate": 0.003, + "loss": 4.027, + "step": 42447 + }, + { + "epoch": 0.42448, + "grad_norm": 0.7412667014337712, + "learning_rate": 0.003, + "loss": 4.051, + "step": 42448 + }, + { + "epoch": 0.42449, + "grad_norm": 0.7367338595338109, + "learning_rate": 0.003, + "loss": 4.026, + "step": 42449 + }, + { + "epoch": 0.4245, + "grad_norm": 0.8518400214798342, + "learning_rate": 0.003, + "loss": 4.0658, + "step": 42450 + }, + { + "epoch": 0.42451, + "grad_norm": 1.128866878415787, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 42451 + }, + { + "epoch": 0.42452, + "grad_norm": 1.0536546637125344, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 42452 + }, + { + "epoch": 0.42453, + "grad_norm": 0.9027141614917302, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 42453 + }, + { + "epoch": 0.42454, + "grad_norm": 0.8472316777437519, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 42454 + }, + { + "epoch": 0.42455, + "grad_norm": 0.875323265053182, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 42455 + }, + { + "epoch": 0.42456, + "grad_norm": 0.9776613324382517, + "learning_rate": 0.003, + "loss": 4.043, + "step": 42456 + }, + { + "epoch": 0.42457, + "grad_norm": 1.061263454691318, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 42457 + }, + { + "epoch": 0.42458, + "grad_norm": 0.826938077302727, + "learning_rate": 0.003, + "loss": 4.018, + "step": 42458 + }, + { + "epoch": 0.42459, + "grad_norm": 0.7526352846013792, + "learning_rate": 0.003, + "loss": 3.9914, + "step": 42459 + }, + { + "epoch": 0.4246, + "grad_norm": 0.821771752885125, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 42460 + }, + { + "epoch": 0.42461, + "grad_norm": 0.9864901398736609, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 42461 + }, + { + "epoch": 0.42462, + "grad_norm": 1.1218108387615677, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 42462 + }, + { + "epoch": 0.42463, + "grad_norm": 0.8245437540404643, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 42463 + }, + { + "epoch": 0.42464, + "grad_norm": 0.7757142067451304, + "learning_rate": 0.003, + "loss": 4.041, + "step": 42464 + }, + { + "epoch": 0.42465, + "grad_norm": 0.7265536953184069, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 42465 + }, + { + "epoch": 0.42466, + "grad_norm": 0.7378414138191194, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 42466 + }, + { + "epoch": 0.42467, + "grad_norm": 0.8179389790808315, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 42467 + }, + { + "epoch": 0.42468, + "grad_norm": 0.8554092019165942, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 42468 + }, + { + "epoch": 0.42469, + "grad_norm": 0.8998975616112717, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 42469 + }, + { + "epoch": 0.4247, + "grad_norm": 0.9904012695969239, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 42470 + }, + { + "epoch": 0.42471, + "grad_norm": 0.9593368449229028, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 42471 + }, + { + "epoch": 0.42472, + "grad_norm": 0.8809643586754239, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 42472 + }, + { + "epoch": 0.42473, + "grad_norm": 0.9065394962936313, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 42473 + }, + { + "epoch": 0.42474, + "grad_norm": 0.888541148841326, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 42474 + }, + { + "epoch": 0.42475, + "grad_norm": 0.9536953273163906, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 42475 + }, + { + "epoch": 0.42476, + "grad_norm": 1.0057404242343722, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 42476 + }, + { + "epoch": 0.42477, + "grad_norm": 1.009765323211742, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 42477 + }, + { + "epoch": 0.42478, + "grad_norm": 0.9417559261550638, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 42478 + }, + { + "epoch": 0.42479, + "grad_norm": 0.8437553356488897, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 42479 + }, + { + "epoch": 0.4248, + "grad_norm": 0.8318602941616083, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 42480 + }, + { + "epoch": 0.42481, + "grad_norm": 0.7242910627548914, + "learning_rate": 0.003, + "loss": 4.042, + "step": 42481 + }, + { + "epoch": 0.42482, + "grad_norm": 0.705162752774331, + "learning_rate": 0.003, + "loss": 3.9921, + "step": 42482 + }, + { + "epoch": 0.42483, + "grad_norm": 0.643894869362214, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 42483 + }, + { + "epoch": 0.42484, + "grad_norm": 0.6640284546479762, + "learning_rate": 0.003, + "loss": 4.017, + "step": 42484 + }, + { + "epoch": 0.42485, + "grad_norm": 0.677122653157523, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 42485 + }, + { + "epoch": 0.42486, + "grad_norm": 0.6726555591097495, + "learning_rate": 0.003, + "loss": 3.9896, + "step": 42486 + }, + { + "epoch": 0.42487, + "grad_norm": 0.7185749866092458, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 42487 + }, + { + "epoch": 0.42488, + "grad_norm": 0.7553517459991094, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 42488 + }, + { + "epoch": 0.42489, + "grad_norm": 0.8231909918296073, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 42489 + }, + { + "epoch": 0.4249, + "grad_norm": 0.902167407981817, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 42490 + }, + { + "epoch": 0.42491, + "grad_norm": 1.0857384748109413, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 42491 + }, + { + "epoch": 0.42492, + "grad_norm": 1.078950523774376, + "learning_rate": 0.003, + "loss": 4.012, + "step": 42492 + }, + { + "epoch": 0.42493, + "grad_norm": 0.7761917376362019, + "learning_rate": 0.003, + "loss": 4.03, + "step": 42493 + }, + { + "epoch": 0.42494, + "grad_norm": 0.7111081245028826, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 42494 + }, + { + "epoch": 0.42495, + "grad_norm": 0.6638512259190362, + "learning_rate": 0.003, + "loss": 4.0002, + "step": 42495 + }, + { + "epoch": 0.42496, + "grad_norm": 0.7094664845124069, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 42496 + }, + { + "epoch": 0.42497, + "grad_norm": 0.7244401589757712, + "learning_rate": 0.003, + "loss": 3.9879, + "step": 42497 + }, + { + "epoch": 0.42498, + "grad_norm": 0.7042017608449219, + "learning_rate": 0.003, + "loss": 4.05, + "step": 42498 + }, + { + "epoch": 0.42499, + "grad_norm": 0.7257631167269628, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 42499 + }, + { + "epoch": 0.425, + "grad_norm": 0.807832466496927, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 42500 + }, + { + "epoch": 0.42501, + "grad_norm": 0.9307386856509132, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 42501 + }, + { + "epoch": 0.42502, + "grad_norm": 1.0075538121623253, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 42502 + }, + { + "epoch": 0.42503, + "grad_norm": 1.0712527081353984, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 42503 + }, + { + "epoch": 0.42504, + "grad_norm": 0.9350755350681651, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 42504 + }, + { + "epoch": 0.42505, + "grad_norm": 0.8576146236117854, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 42505 + }, + { + "epoch": 0.42506, + "grad_norm": 0.742110671865377, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 42506 + }, + { + "epoch": 0.42507, + "grad_norm": 0.7222514701922796, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 42507 + }, + { + "epoch": 0.42508, + "grad_norm": 0.670523004579473, + "learning_rate": 0.003, + "loss": 4.0023, + "step": 42508 + }, + { + "epoch": 0.42509, + "grad_norm": 0.6077886139941011, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 42509 + }, + { + "epoch": 0.4251, + "grad_norm": 0.5356315117597763, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 42510 + }, + { + "epoch": 0.42511, + "grad_norm": 0.5854993548303334, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 42511 + }, + { + "epoch": 0.42512, + "grad_norm": 0.6330315334421456, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 42512 + }, + { + "epoch": 0.42513, + "grad_norm": 0.7148905402912451, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 42513 + }, + { + "epoch": 0.42514, + "grad_norm": 0.8562573741885808, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 42514 + }, + { + "epoch": 0.42515, + "grad_norm": 0.9987151995800552, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 42515 + }, + { + "epoch": 0.42516, + "grad_norm": 1.1232956477243181, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 42516 + }, + { + "epoch": 0.42517, + "grad_norm": 0.8379643664981772, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 42517 + }, + { + "epoch": 0.42518, + "grad_norm": 0.7354766484930219, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 42518 + }, + { + "epoch": 0.42519, + "grad_norm": 0.6505480897755721, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 42519 + }, + { + "epoch": 0.4252, + "grad_norm": 0.71875269904861, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 42520 + }, + { + "epoch": 0.42521, + "grad_norm": 0.731639431911028, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 42521 + }, + { + "epoch": 0.42522, + "grad_norm": 0.7190849321149918, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 42522 + }, + { + "epoch": 0.42523, + "grad_norm": 0.7875639740604226, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 42523 + }, + { + "epoch": 0.42524, + "grad_norm": 0.9277923962994449, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 42524 + }, + { + "epoch": 0.42525, + "grad_norm": 1.024797373010291, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 42525 + }, + { + "epoch": 0.42526, + "grad_norm": 0.994804319109286, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 42526 + }, + { + "epoch": 0.42527, + "grad_norm": 1.017924629454799, + "learning_rate": 0.003, + "loss": 4.017, + "step": 42527 + }, + { + "epoch": 0.42528, + "grad_norm": 0.9239410981921078, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 42528 + }, + { + "epoch": 0.42529, + "grad_norm": 1.0291684832903862, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 42529 + }, + { + "epoch": 0.4253, + "grad_norm": 1.0484569585311367, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 42530 + }, + { + "epoch": 0.42531, + "grad_norm": 0.9234196278397075, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 42531 + }, + { + "epoch": 0.42532, + "grad_norm": 0.8477902667847406, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 42532 + }, + { + "epoch": 0.42533, + "grad_norm": 0.9796054843259814, + "learning_rate": 0.003, + "loss": 4.0823, + "step": 42533 + }, + { + "epoch": 0.42534, + "grad_norm": 1.183338794084466, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 42534 + }, + { + "epoch": 0.42535, + "grad_norm": 0.8879461790938281, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 42535 + }, + { + "epoch": 0.42536, + "grad_norm": 0.9117597387047259, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 42536 + }, + { + "epoch": 0.42537, + "grad_norm": 0.7320543207827446, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 42537 + }, + { + "epoch": 0.42538, + "grad_norm": 0.6995996526089057, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 42538 + }, + { + "epoch": 0.42539, + "grad_norm": 0.718766928633714, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 42539 + }, + { + "epoch": 0.4254, + "grad_norm": 0.6765293113713152, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 42540 + }, + { + "epoch": 0.42541, + "grad_norm": 0.6672139702192622, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 42541 + }, + { + "epoch": 0.42542, + "grad_norm": 0.792410817240178, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 42542 + }, + { + "epoch": 0.42543, + "grad_norm": 0.8497391367597649, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 42543 + }, + { + "epoch": 0.42544, + "grad_norm": 0.937696530326693, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 42544 + }, + { + "epoch": 0.42545, + "grad_norm": 0.9507794171587989, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 42545 + }, + { + "epoch": 0.42546, + "grad_norm": 0.8278217519611175, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 42546 + }, + { + "epoch": 0.42547, + "grad_norm": 0.7725940489343498, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 42547 + }, + { + "epoch": 0.42548, + "grad_norm": 0.737045878091949, + "learning_rate": 0.003, + "loss": 4.024, + "step": 42548 + }, + { + "epoch": 0.42549, + "grad_norm": 0.809851992498847, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 42549 + }, + { + "epoch": 0.4255, + "grad_norm": 0.9207556750241993, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 42550 + }, + { + "epoch": 0.42551, + "grad_norm": 0.8794354201140717, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 42551 + }, + { + "epoch": 0.42552, + "grad_norm": 0.8894146952817821, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 42552 + }, + { + "epoch": 0.42553, + "grad_norm": 1.0097960298458617, + "learning_rate": 0.003, + "loss": 4.0818, + "step": 42553 + }, + { + "epoch": 0.42554, + "grad_norm": 1.1984395863111497, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 42554 + }, + { + "epoch": 0.42555, + "grad_norm": 0.9809227095132949, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 42555 + }, + { + "epoch": 0.42556, + "grad_norm": 0.8920886973525839, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 42556 + }, + { + "epoch": 0.42557, + "grad_norm": 0.7963017857768876, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 42557 + }, + { + "epoch": 0.42558, + "grad_norm": 0.7036125530032696, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 42558 + }, + { + "epoch": 0.42559, + "grad_norm": 0.6793085203924341, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 42559 + }, + { + "epoch": 0.4256, + "grad_norm": 0.7882729136041949, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 42560 + }, + { + "epoch": 0.42561, + "grad_norm": 0.8830734433995181, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 42561 + }, + { + "epoch": 0.42562, + "grad_norm": 0.9114043995713877, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 42562 + }, + { + "epoch": 0.42563, + "grad_norm": 0.9152904137679057, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 42563 + }, + { + "epoch": 0.42564, + "grad_norm": 1.0021338028304807, + "learning_rate": 0.003, + "loss": 4.0839, + "step": 42564 + }, + { + "epoch": 0.42565, + "grad_norm": 1.0390596364531217, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 42565 + }, + { + "epoch": 0.42566, + "grad_norm": 0.9307779602870357, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 42566 + }, + { + "epoch": 0.42567, + "grad_norm": 0.8699803789172889, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 42567 + }, + { + "epoch": 0.42568, + "grad_norm": 0.8279622847566033, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 42568 + }, + { + "epoch": 0.42569, + "grad_norm": 0.8057887092791431, + "learning_rate": 0.003, + "loss": 4.052, + "step": 42569 + }, + { + "epoch": 0.4257, + "grad_norm": 0.7480494739121165, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 42570 + }, + { + "epoch": 0.42571, + "grad_norm": 0.736923931828387, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 42571 + }, + { + "epoch": 0.42572, + "grad_norm": 0.7331320630952104, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 42572 + }, + { + "epoch": 0.42573, + "grad_norm": 0.7404734163170503, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 42573 + }, + { + "epoch": 0.42574, + "grad_norm": 0.9480025975890042, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 42574 + }, + { + "epoch": 0.42575, + "grad_norm": 1.1302499578017664, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 42575 + }, + { + "epoch": 0.42576, + "grad_norm": 0.834659044774927, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 42576 + }, + { + "epoch": 0.42577, + "grad_norm": 0.8283759138056878, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 42577 + }, + { + "epoch": 0.42578, + "grad_norm": 0.7892446591487501, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 42578 + }, + { + "epoch": 0.42579, + "grad_norm": 0.734033096247046, + "learning_rate": 0.003, + "loss": 4.053, + "step": 42579 + }, + { + "epoch": 0.4258, + "grad_norm": 0.6699781537966589, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 42580 + }, + { + "epoch": 0.42581, + "grad_norm": 0.6555556227509153, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 42581 + }, + { + "epoch": 0.42582, + "grad_norm": 0.6766302664588667, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 42582 + }, + { + "epoch": 0.42583, + "grad_norm": 0.6223445366072368, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 42583 + }, + { + "epoch": 0.42584, + "grad_norm": 0.5846143380887726, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 42584 + }, + { + "epoch": 0.42585, + "grad_norm": 0.6316695213676715, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 42585 + }, + { + "epoch": 0.42586, + "grad_norm": 0.6636624120421988, + "learning_rate": 0.003, + "loss": 4.005, + "step": 42586 + }, + { + "epoch": 0.42587, + "grad_norm": 0.8551153372938002, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 42587 + }, + { + "epoch": 0.42588, + "grad_norm": 1.1949617235484442, + "learning_rate": 0.003, + "loss": 4.043, + "step": 42588 + }, + { + "epoch": 0.42589, + "grad_norm": 1.0919613929392975, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 42589 + }, + { + "epoch": 0.4259, + "grad_norm": 0.8539023932982726, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 42590 + }, + { + "epoch": 0.42591, + "grad_norm": 0.6823247093792856, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 42591 + }, + { + "epoch": 0.42592, + "grad_norm": 0.5406326958988653, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 42592 + }, + { + "epoch": 0.42593, + "grad_norm": 0.6176072693998192, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 42593 + }, + { + "epoch": 0.42594, + "grad_norm": 0.6168112156597599, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 42594 + }, + { + "epoch": 0.42595, + "grad_norm": 0.7164273274648044, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 42595 + }, + { + "epoch": 0.42596, + "grad_norm": 0.8374279999063299, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 42596 + }, + { + "epoch": 0.42597, + "grad_norm": 0.9358362974298514, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 42597 + }, + { + "epoch": 0.42598, + "grad_norm": 1.0235961082887195, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 42598 + }, + { + "epoch": 0.42599, + "grad_norm": 1.0446877772749896, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 42599 + }, + { + "epoch": 0.426, + "grad_norm": 0.9642387830747033, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 42600 + }, + { + "epoch": 0.42601, + "grad_norm": 1.0977474277781143, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 42601 + }, + { + "epoch": 0.42602, + "grad_norm": 0.9780901322763407, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 42602 + }, + { + "epoch": 0.42603, + "grad_norm": 0.8496788872857897, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 42603 + }, + { + "epoch": 0.42604, + "grad_norm": 0.721628102331232, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 42604 + }, + { + "epoch": 0.42605, + "grad_norm": 0.8051803364250434, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 42605 + }, + { + "epoch": 0.42606, + "grad_norm": 0.8864091987809775, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 42606 + }, + { + "epoch": 0.42607, + "grad_norm": 0.9110282691570398, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 42607 + }, + { + "epoch": 0.42608, + "grad_norm": 0.9685306777405658, + "learning_rate": 0.003, + "loss": 4.01, + "step": 42608 + }, + { + "epoch": 0.42609, + "grad_norm": 1.2033363749931578, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 42609 + }, + { + "epoch": 0.4261, + "grad_norm": 0.7975666112054354, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 42610 + }, + { + "epoch": 0.42611, + "grad_norm": 0.7389843321093853, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 42611 + }, + { + "epoch": 0.42612, + "grad_norm": 0.9180048591611965, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 42612 + }, + { + "epoch": 0.42613, + "grad_norm": 1.030433691266867, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 42613 + }, + { + "epoch": 0.42614, + "grad_norm": 0.9096305798604375, + "learning_rate": 0.003, + "loss": 4.0699, + "step": 42614 + }, + { + "epoch": 0.42615, + "grad_norm": 1.0138878680936447, + "learning_rate": 0.003, + "loss": 4.0744, + "step": 42615 + }, + { + "epoch": 0.42616, + "grad_norm": 0.9185965953899813, + "learning_rate": 0.003, + "loss": 4.026, + "step": 42616 + }, + { + "epoch": 0.42617, + "grad_norm": 0.9975220722122112, + "learning_rate": 0.003, + "loss": 4.0679, + "step": 42617 + }, + { + "epoch": 0.42618, + "grad_norm": 1.038438270531355, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 42618 + }, + { + "epoch": 0.42619, + "grad_norm": 0.8994585216840275, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 42619 + }, + { + "epoch": 0.4262, + "grad_norm": 0.8649834750905592, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 42620 + }, + { + "epoch": 0.42621, + "grad_norm": 0.9625435990944322, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 42621 + }, + { + "epoch": 0.42622, + "grad_norm": 0.9406035229354652, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 42622 + }, + { + "epoch": 0.42623, + "grad_norm": 0.8589584134853765, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 42623 + }, + { + "epoch": 0.42624, + "grad_norm": 0.7665482625173082, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 42624 + }, + { + "epoch": 0.42625, + "grad_norm": 0.6798953742964756, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 42625 + }, + { + "epoch": 0.42626, + "grad_norm": 0.7838201483664096, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 42626 + }, + { + "epoch": 0.42627, + "grad_norm": 0.867550534745915, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 42627 + }, + { + "epoch": 0.42628, + "grad_norm": 1.07328356154215, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 42628 + }, + { + "epoch": 0.42629, + "grad_norm": 1.0097377664676004, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 42629 + }, + { + "epoch": 0.4263, + "grad_norm": 0.9558397772206765, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 42630 + }, + { + "epoch": 0.42631, + "grad_norm": 0.8016786277279435, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 42631 + }, + { + "epoch": 0.42632, + "grad_norm": 0.7118009163923787, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 42632 + }, + { + "epoch": 0.42633, + "grad_norm": 0.7684769370194038, + "learning_rate": 0.003, + "loss": 4.028, + "step": 42633 + }, + { + "epoch": 0.42634, + "grad_norm": 0.7266037702415592, + "learning_rate": 0.003, + "loss": 4.0666, + "step": 42634 + }, + { + "epoch": 0.42635, + "grad_norm": 0.6616359368196439, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 42635 + }, + { + "epoch": 0.42636, + "grad_norm": 0.662844015728914, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 42636 + }, + { + "epoch": 0.42637, + "grad_norm": 0.6960392843567709, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 42637 + }, + { + "epoch": 0.42638, + "grad_norm": 0.6241300448146089, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 42638 + }, + { + "epoch": 0.42639, + "grad_norm": 0.6066828366988999, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 42639 + }, + { + "epoch": 0.4264, + "grad_norm": 0.6450910071317147, + "learning_rate": 0.003, + "loss": 3.9999, + "step": 42640 + }, + { + "epoch": 0.42641, + "grad_norm": 0.6098373466018302, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 42641 + }, + { + "epoch": 0.42642, + "grad_norm": 0.5669807121829601, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 42642 + }, + { + "epoch": 0.42643, + "grad_norm": 0.6483714183936954, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 42643 + }, + { + "epoch": 0.42644, + "grad_norm": 0.852920617079802, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 42644 + }, + { + "epoch": 0.42645, + "grad_norm": 0.967019473625909, + "learning_rate": 0.003, + "loss": 4.029, + "step": 42645 + }, + { + "epoch": 0.42646, + "grad_norm": 1.0796512429214025, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 42646 + }, + { + "epoch": 0.42647, + "grad_norm": 0.8176851622222003, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 42647 + }, + { + "epoch": 0.42648, + "grad_norm": 0.6803951174852874, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 42648 + }, + { + "epoch": 0.42649, + "grad_norm": 0.633629113707592, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 42649 + }, + { + "epoch": 0.4265, + "grad_norm": 0.631705179255241, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 42650 + }, + { + "epoch": 0.42651, + "grad_norm": 0.7255445255787005, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 42651 + }, + { + "epoch": 0.42652, + "grad_norm": 0.8118358859439251, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 42652 + }, + { + "epoch": 0.42653, + "grad_norm": 0.8422431923373623, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 42653 + }, + { + "epoch": 0.42654, + "grad_norm": 0.783496788343619, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 42654 + }, + { + "epoch": 0.42655, + "grad_norm": 0.8023945306212411, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 42655 + }, + { + "epoch": 0.42656, + "grad_norm": 0.7673991306949444, + "learning_rate": 0.003, + "loss": 4.045, + "step": 42656 + }, + { + "epoch": 0.42657, + "grad_norm": 0.697092096213959, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 42657 + }, + { + "epoch": 0.42658, + "grad_norm": 0.6567018933036555, + "learning_rate": 0.003, + "loss": 4.03, + "step": 42658 + }, + { + "epoch": 0.42659, + "grad_norm": 0.6093319885960488, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 42659 + }, + { + "epoch": 0.4266, + "grad_norm": 0.6417021840918756, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 42660 + }, + { + "epoch": 0.42661, + "grad_norm": 0.7784842412944183, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 42661 + }, + { + "epoch": 0.42662, + "grad_norm": 1.0464231927088938, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 42662 + }, + { + "epoch": 0.42663, + "grad_norm": 1.2557362787427064, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 42663 + }, + { + "epoch": 0.42664, + "grad_norm": 0.7942581635682041, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 42664 + }, + { + "epoch": 0.42665, + "grad_norm": 0.7012191594828874, + "learning_rate": 0.003, + "loss": 3.9994, + "step": 42665 + }, + { + "epoch": 0.42666, + "grad_norm": 0.7310150817929181, + "learning_rate": 0.003, + "loss": 4.037, + "step": 42666 + }, + { + "epoch": 0.42667, + "grad_norm": 0.7328879674480991, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 42667 + }, + { + "epoch": 0.42668, + "grad_norm": 0.8135868474097913, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 42668 + }, + { + "epoch": 0.42669, + "grad_norm": 0.8879694765400722, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 42669 + }, + { + "epoch": 0.4267, + "grad_norm": 0.870197564436561, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 42670 + }, + { + "epoch": 0.42671, + "grad_norm": 0.731200461906429, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 42671 + }, + { + "epoch": 0.42672, + "grad_norm": 0.7281175217467858, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 42672 + }, + { + "epoch": 0.42673, + "grad_norm": 0.7510546797252674, + "learning_rate": 0.003, + "loss": 4.048, + "step": 42673 + }, + { + "epoch": 0.42674, + "grad_norm": 0.8454059084302727, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 42674 + }, + { + "epoch": 0.42675, + "grad_norm": 1.067193164988639, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 42675 + }, + { + "epoch": 0.42676, + "grad_norm": 1.0341680759941319, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 42676 + }, + { + "epoch": 0.42677, + "grad_norm": 1.1919785324257304, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 42677 + }, + { + "epoch": 0.42678, + "grad_norm": 0.9289500894768679, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 42678 + }, + { + "epoch": 0.42679, + "grad_norm": 0.8854783064215159, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 42679 + }, + { + "epoch": 0.4268, + "grad_norm": 0.9392131367528267, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 42680 + }, + { + "epoch": 0.42681, + "grad_norm": 1.0648595983656262, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 42681 + }, + { + "epoch": 0.42682, + "grad_norm": 0.9647443770747872, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 42682 + }, + { + "epoch": 0.42683, + "grad_norm": 0.8968056355909007, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 42683 + }, + { + "epoch": 0.42684, + "grad_norm": 0.8564886264629251, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 42684 + }, + { + "epoch": 0.42685, + "grad_norm": 0.9252569143033049, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 42685 + }, + { + "epoch": 0.42686, + "grad_norm": 1.0204714340043293, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 42686 + }, + { + "epoch": 0.42687, + "grad_norm": 1.036081002855637, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 42687 + }, + { + "epoch": 0.42688, + "grad_norm": 1.1030429228080796, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 42688 + }, + { + "epoch": 0.42689, + "grad_norm": 0.9885634432524733, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 42689 + }, + { + "epoch": 0.4269, + "grad_norm": 0.8272942232571736, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 42690 + }, + { + "epoch": 0.42691, + "grad_norm": 0.7845123394424212, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 42691 + }, + { + "epoch": 0.42692, + "grad_norm": 0.7343925316563357, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 42692 + }, + { + "epoch": 0.42693, + "grad_norm": 0.7578946746690132, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 42693 + }, + { + "epoch": 0.42694, + "grad_norm": 0.8209558357031864, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 42694 + }, + { + "epoch": 0.42695, + "grad_norm": 0.7501785360997173, + "learning_rate": 0.003, + "loss": 4.0684, + "step": 42695 + }, + { + "epoch": 0.42696, + "grad_norm": 0.7248335101092103, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 42696 + }, + { + "epoch": 0.42697, + "grad_norm": 0.6488489517193773, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 42697 + }, + { + "epoch": 0.42698, + "grad_norm": 0.6841868397021944, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 42698 + }, + { + "epoch": 0.42699, + "grad_norm": 0.6984268931700136, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 42699 + }, + { + "epoch": 0.427, + "grad_norm": 0.7117302985512288, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 42700 + }, + { + "epoch": 0.42701, + "grad_norm": 0.8039999854934987, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 42701 + }, + { + "epoch": 0.42702, + "grad_norm": 1.0230908436543342, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 42702 + }, + { + "epoch": 0.42703, + "grad_norm": 1.073400204482124, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 42703 + }, + { + "epoch": 0.42704, + "grad_norm": 0.8803903619166789, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 42704 + }, + { + "epoch": 0.42705, + "grad_norm": 0.8281149610194348, + "learning_rate": 0.003, + "loss": 4.046, + "step": 42705 + }, + { + "epoch": 0.42706, + "grad_norm": 0.7664931130144521, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 42706 + }, + { + "epoch": 0.42707, + "grad_norm": 0.7202975721755356, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 42707 + }, + { + "epoch": 0.42708, + "grad_norm": 0.6458405454559136, + "learning_rate": 0.003, + "loss": 4.037, + "step": 42708 + }, + { + "epoch": 0.42709, + "grad_norm": 0.7350194410378006, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 42709 + }, + { + "epoch": 0.4271, + "grad_norm": 0.8224797279894046, + "learning_rate": 0.003, + "loss": 4.038, + "step": 42710 + }, + { + "epoch": 0.42711, + "grad_norm": 0.7992944682398073, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 42711 + }, + { + "epoch": 0.42712, + "grad_norm": 0.793988552652129, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 42712 + }, + { + "epoch": 0.42713, + "grad_norm": 0.8570949560604508, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 42713 + }, + { + "epoch": 0.42714, + "grad_norm": 0.88411544230939, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 42714 + }, + { + "epoch": 0.42715, + "grad_norm": 0.981834016225561, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 42715 + }, + { + "epoch": 0.42716, + "grad_norm": 1.1764441302560056, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 42716 + }, + { + "epoch": 0.42717, + "grad_norm": 0.8244672367388438, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 42717 + }, + { + "epoch": 0.42718, + "grad_norm": 0.7608411231844766, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 42718 + }, + { + "epoch": 0.42719, + "grad_norm": 0.8297032642378066, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 42719 + }, + { + "epoch": 0.4272, + "grad_norm": 0.8690238812587268, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 42720 + }, + { + "epoch": 0.42721, + "grad_norm": 0.8553658695539315, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 42721 + }, + { + "epoch": 0.42722, + "grad_norm": 0.8458099609036026, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 42722 + }, + { + "epoch": 0.42723, + "grad_norm": 0.962720971478166, + "learning_rate": 0.003, + "loss": 4.0824, + "step": 42723 + }, + { + "epoch": 0.42724, + "grad_norm": 1.0189617717394677, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 42724 + }, + { + "epoch": 0.42725, + "grad_norm": 1.1562141673655386, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 42725 + }, + { + "epoch": 0.42726, + "grad_norm": 0.9326661934703212, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 42726 + }, + { + "epoch": 0.42727, + "grad_norm": 0.8320634550096673, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 42727 + }, + { + "epoch": 0.42728, + "grad_norm": 0.9299024668877167, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 42728 + }, + { + "epoch": 0.42729, + "grad_norm": 1.184452866353322, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 42729 + }, + { + "epoch": 0.4273, + "grad_norm": 0.9142899228009045, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 42730 + }, + { + "epoch": 0.42731, + "grad_norm": 0.7971412198962701, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 42731 + }, + { + "epoch": 0.42732, + "grad_norm": 0.6759427943430611, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 42732 + }, + { + "epoch": 0.42733, + "grad_norm": 0.6173728208099312, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 42733 + }, + { + "epoch": 0.42734, + "grad_norm": 0.5671976492142026, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 42734 + }, + { + "epoch": 0.42735, + "grad_norm": 0.5532938938778865, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 42735 + }, + { + "epoch": 0.42736, + "grad_norm": 0.5267218505527387, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 42736 + }, + { + "epoch": 0.42737, + "grad_norm": 0.5949862817288111, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 42737 + }, + { + "epoch": 0.42738, + "grad_norm": 0.6417743264656374, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 42738 + }, + { + "epoch": 0.42739, + "grad_norm": 0.663290640687811, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 42739 + }, + { + "epoch": 0.4274, + "grad_norm": 0.7121721097751713, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 42740 + }, + { + "epoch": 0.42741, + "grad_norm": 0.821771371022849, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 42741 + }, + { + "epoch": 0.42742, + "grad_norm": 0.8439062756364515, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 42742 + }, + { + "epoch": 0.42743, + "grad_norm": 0.8414628507300079, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 42743 + }, + { + "epoch": 0.42744, + "grad_norm": 0.9376330391451329, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 42744 + }, + { + "epoch": 0.42745, + "grad_norm": 0.9842432996522463, + "learning_rate": 0.003, + "loss": 4.09, + "step": 42745 + }, + { + "epoch": 0.42746, + "grad_norm": 1.051349082985549, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 42746 + }, + { + "epoch": 0.42747, + "grad_norm": 0.8654939445250115, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 42747 + }, + { + "epoch": 0.42748, + "grad_norm": 0.7570087312753411, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 42748 + }, + { + "epoch": 0.42749, + "grad_norm": 0.6210355588177751, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 42749 + }, + { + "epoch": 0.4275, + "grad_norm": 0.6428596286804116, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 42750 + }, + { + "epoch": 0.42751, + "grad_norm": 0.6289371232799743, + "learning_rate": 0.003, + "loss": 3.9989, + "step": 42751 + }, + { + "epoch": 0.42752, + "grad_norm": 0.6927554664000072, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 42752 + }, + { + "epoch": 0.42753, + "grad_norm": 0.6930290050585142, + "learning_rate": 0.003, + "loss": 4.035, + "step": 42753 + }, + { + "epoch": 0.42754, + "grad_norm": 0.7371429560712147, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 42754 + }, + { + "epoch": 0.42755, + "grad_norm": 0.8214642925703105, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 42755 + }, + { + "epoch": 0.42756, + "grad_norm": 0.8707190534071196, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 42756 + }, + { + "epoch": 0.42757, + "grad_norm": 1.015063744811903, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 42757 + }, + { + "epoch": 0.42758, + "grad_norm": 1.2192876509401982, + "learning_rate": 0.003, + "loss": 4.037, + "step": 42758 + }, + { + "epoch": 0.42759, + "grad_norm": 0.759446915913801, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 42759 + }, + { + "epoch": 0.4276, + "grad_norm": 0.6602980136912655, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 42760 + }, + { + "epoch": 0.42761, + "grad_norm": 0.7874897022932389, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 42761 + }, + { + "epoch": 0.42762, + "grad_norm": 0.7992117451482816, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 42762 + }, + { + "epoch": 0.42763, + "grad_norm": 0.8609413922799958, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 42763 + }, + { + "epoch": 0.42764, + "grad_norm": 0.9887680321383335, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 42764 + }, + { + "epoch": 0.42765, + "grad_norm": 1.0779313040435152, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 42765 + }, + { + "epoch": 0.42766, + "grad_norm": 0.9306535351746418, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 42766 + }, + { + "epoch": 0.42767, + "grad_norm": 0.9682087460181299, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 42767 + }, + { + "epoch": 0.42768, + "grad_norm": 0.8794180266538728, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 42768 + }, + { + "epoch": 0.42769, + "grad_norm": 0.8274059352777339, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 42769 + }, + { + "epoch": 0.4277, + "grad_norm": 0.9379613540083231, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 42770 + }, + { + "epoch": 0.42771, + "grad_norm": 1.0917492067533203, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 42771 + }, + { + "epoch": 0.42772, + "grad_norm": 0.9299746888248042, + "learning_rate": 0.003, + "loss": 4.047, + "step": 42772 + }, + { + "epoch": 0.42773, + "grad_norm": 0.9238752343808974, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 42773 + }, + { + "epoch": 0.42774, + "grad_norm": 0.926276452501581, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 42774 + }, + { + "epoch": 0.42775, + "grad_norm": 0.9427064119558032, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 42775 + }, + { + "epoch": 0.42776, + "grad_norm": 0.8456309348058608, + "learning_rate": 0.003, + "loss": 4.0023, + "step": 42776 + }, + { + "epoch": 0.42777, + "grad_norm": 0.9086813832486972, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 42777 + }, + { + "epoch": 0.42778, + "grad_norm": 0.8953664467219095, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 42778 + }, + { + "epoch": 0.42779, + "grad_norm": 0.8861257216996972, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 42779 + }, + { + "epoch": 0.4278, + "grad_norm": 0.8887349169608929, + "learning_rate": 0.003, + "loss": 4.044, + "step": 42780 + }, + { + "epoch": 0.42781, + "grad_norm": 0.9216701350139749, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 42781 + }, + { + "epoch": 0.42782, + "grad_norm": 1.022700876304174, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 42782 + }, + { + "epoch": 0.42783, + "grad_norm": 1.0980649014372588, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 42783 + }, + { + "epoch": 0.42784, + "grad_norm": 0.9556992460754941, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 42784 + }, + { + "epoch": 0.42785, + "grad_norm": 1.045846175318265, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 42785 + }, + { + "epoch": 0.42786, + "grad_norm": 0.8803174604316113, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 42786 + }, + { + "epoch": 0.42787, + "grad_norm": 0.8072045029785705, + "learning_rate": 0.003, + "loss": 4.0893, + "step": 42787 + }, + { + "epoch": 0.42788, + "grad_norm": 0.8738944321482348, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 42788 + }, + { + "epoch": 0.42789, + "grad_norm": 0.7803021073500885, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 42789 + }, + { + "epoch": 0.4279, + "grad_norm": 0.7439623345252465, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 42790 + }, + { + "epoch": 0.42791, + "grad_norm": 0.7293832211611228, + "learning_rate": 0.003, + "loss": 3.9963, + "step": 42791 + }, + { + "epoch": 0.42792, + "grad_norm": 0.660614461703193, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 42792 + }, + { + "epoch": 0.42793, + "grad_norm": 0.614969108736722, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 42793 + }, + { + "epoch": 0.42794, + "grad_norm": 0.7089063199258047, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 42794 + }, + { + "epoch": 0.42795, + "grad_norm": 0.8068396667550003, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 42795 + }, + { + "epoch": 0.42796, + "grad_norm": 1.1313530526178985, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 42796 + }, + { + "epoch": 0.42797, + "grad_norm": 1.105444196887904, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 42797 + }, + { + "epoch": 0.42798, + "grad_norm": 0.6934926630852445, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 42798 + }, + { + "epoch": 0.42799, + "grad_norm": 0.6739557651452046, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 42799 + }, + { + "epoch": 0.428, + "grad_norm": 0.763872216594981, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 42800 + }, + { + "epoch": 0.42801, + "grad_norm": 0.7104856110941147, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 42801 + }, + { + "epoch": 0.42802, + "grad_norm": 0.5807685836550108, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 42802 + }, + { + "epoch": 0.42803, + "grad_norm": 0.5653544503722234, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 42803 + }, + { + "epoch": 0.42804, + "grad_norm": 0.5522797951494046, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 42804 + }, + { + "epoch": 0.42805, + "grad_norm": 0.6050808977778971, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 42805 + }, + { + "epoch": 0.42806, + "grad_norm": 0.6522426681449475, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 42806 + }, + { + "epoch": 0.42807, + "grad_norm": 0.6609464285720915, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 42807 + }, + { + "epoch": 0.42808, + "grad_norm": 0.7024535041259282, + "learning_rate": 0.003, + "loss": 4.0001, + "step": 42808 + }, + { + "epoch": 0.42809, + "grad_norm": 0.6832021338285433, + "learning_rate": 0.003, + "loss": 3.9907, + "step": 42809 + }, + { + "epoch": 0.4281, + "grad_norm": 0.8108829395006338, + "learning_rate": 0.003, + "loss": 4.023, + "step": 42810 + }, + { + "epoch": 0.42811, + "grad_norm": 1.0589321934848128, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 42811 + }, + { + "epoch": 0.42812, + "grad_norm": 1.1671728137518596, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 42812 + }, + { + "epoch": 0.42813, + "grad_norm": 0.6666171803922004, + "learning_rate": 0.003, + "loss": 4.016, + "step": 42813 + }, + { + "epoch": 0.42814, + "grad_norm": 0.8500475321887739, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 42814 + }, + { + "epoch": 0.42815, + "grad_norm": 0.9221248479353051, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 42815 + }, + { + "epoch": 0.42816, + "grad_norm": 0.9050361698061212, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 42816 + }, + { + "epoch": 0.42817, + "grad_norm": 0.928544095435907, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 42817 + }, + { + "epoch": 0.42818, + "grad_norm": 0.9393722718457972, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 42818 + }, + { + "epoch": 0.42819, + "grad_norm": 0.9453980522156484, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 42819 + }, + { + "epoch": 0.4282, + "grad_norm": 0.9305576546085558, + "learning_rate": 0.003, + "loss": 4.065, + "step": 42820 + }, + { + "epoch": 0.42821, + "grad_norm": 0.8149259935263258, + "learning_rate": 0.003, + "loss": 3.9819, + "step": 42821 + }, + { + "epoch": 0.42822, + "grad_norm": 0.7891760300202628, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 42822 + }, + { + "epoch": 0.42823, + "grad_norm": 0.7979975323370267, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 42823 + }, + { + "epoch": 0.42824, + "grad_norm": 0.7386634687347919, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 42824 + }, + { + "epoch": 0.42825, + "grad_norm": 0.7961025897689233, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 42825 + }, + { + "epoch": 0.42826, + "grad_norm": 0.8506400092673109, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 42826 + }, + { + "epoch": 0.42827, + "grad_norm": 0.8796698216576397, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 42827 + }, + { + "epoch": 0.42828, + "grad_norm": 0.8573121402344165, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 42828 + }, + { + "epoch": 0.42829, + "grad_norm": 0.7607636454019984, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 42829 + }, + { + "epoch": 0.4283, + "grad_norm": 0.7366125122426347, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 42830 + }, + { + "epoch": 0.42831, + "grad_norm": 0.7790510190930323, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 42831 + }, + { + "epoch": 0.42832, + "grad_norm": 0.793921832377938, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 42832 + }, + { + "epoch": 0.42833, + "grad_norm": 0.8778452598217034, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 42833 + }, + { + "epoch": 0.42834, + "grad_norm": 0.8140848383242972, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 42834 + }, + { + "epoch": 0.42835, + "grad_norm": 0.7607009454998269, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 42835 + }, + { + "epoch": 0.42836, + "grad_norm": 0.8221046590151707, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 42836 + }, + { + "epoch": 0.42837, + "grad_norm": 1.0021223702820237, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 42837 + }, + { + "epoch": 0.42838, + "grad_norm": 1.144809505965571, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 42838 + }, + { + "epoch": 0.42839, + "grad_norm": 0.7815177226809604, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 42839 + }, + { + "epoch": 0.4284, + "grad_norm": 0.7286320861452925, + "learning_rate": 0.003, + "loss": 4.005, + "step": 42840 + }, + { + "epoch": 0.42841, + "grad_norm": 0.7850567752855319, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 42841 + }, + { + "epoch": 0.42842, + "grad_norm": 0.9473257358813177, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 42842 + }, + { + "epoch": 0.42843, + "grad_norm": 0.9954806049522215, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 42843 + }, + { + "epoch": 0.42844, + "grad_norm": 0.991214393756752, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 42844 + }, + { + "epoch": 0.42845, + "grad_norm": 0.8711726384581755, + "learning_rate": 0.003, + "loss": 4.018, + "step": 42845 + }, + { + "epoch": 0.42846, + "grad_norm": 0.7771803302265342, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 42846 + }, + { + "epoch": 0.42847, + "grad_norm": 0.6948046409196699, + "learning_rate": 0.003, + "loss": 4.035, + "step": 42847 + }, + { + "epoch": 0.42848, + "grad_norm": 0.7152739366120671, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 42848 + }, + { + "epoch": 0.42849, + "grad_norm": 0.6210223893067466, + "learning_rate": 0.003, + "loss": 4.0061, + "step": 42849 + }, + { + "epoch": 0.4285, + "grad_norm": 0.5765074279192287, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 42850 + }, + { + "epoch": 0.42851, + "grad_norm": 0.6259444929393158, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 42851 + }, + { + "epoch": 0.42852, + "grad_norm": 0.6384488989594038, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 42852 + }, + { + "epoch": 0.42853, + "grad_norm": 0.6000590187138914, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 42853 + }, + { + "epoch": 0.42854, + "grad_norm": 0.6900571751355684, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 42854 + }, + { + "epoch": 0.42855, + "grad_norm": 0.8857618534110807, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 42855 + }, + { + "epoch": 0.42856, + "grad_norm": 1.2013828836294147, + "learning_rate": 0.003, + "loss": 4.013, + "step": 42856 + }, + { + "epoch": 0.42857, + "grad_norm": 1.142724948170031, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 42857 + }, + { + "epoch": 0.42858, + "grad_norm": 0.902810281337086, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 42858 + }, + { + "epoch": 0.42859, + "grad_norm": 0.8274052762685433, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 42859 + }, + { + "epoch": 0.4286, + "grad_norm": 0.7283753162046657, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 42860 + }, + { + "epoch": 0.42861, + "grad_norm": 0.8218045902092574, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 42861 + }, + { + "epoch": 0.42862, + "grad_norm": 0.9007434271955033, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 42862 + }, + { + "epoch": 0.42863, + "grad_norm": 0.9671415408726928, + "learning_rate": 0.003, + "loss": 3.9902, + "step": 42863 + }, + { + "epoch": 0.42864, + "grad_norm": 1.0415333087576732, + "learning_rate": 0.003, + "loss": 4.027, + "step": 42864 + }, + { + "epoch": 0.42865, + "grad_norm": 0.9435982112700124, + "learning_rate": 0.003, + "loss": 4.013, + "step": 42865 + }, + { + "epoch": 0.42866, + "grad_norm": 0.9857481754030644, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 42866 + }, + { + "epoch": 0.42867, + "grad_norm": 1.0226886446775216, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 42867 + }, + { + "epoch": 0.42868, + "grad_norm": 0.9747185562991542, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 42868 + }, + { + "epoch": 0.42869, + "grad_norm": 0.8964813490500059, + "learning_rate": 0.003, + "loss": 4.015, + "step": 42869 + }, + { + "epoch": 0.4287, + "grad_norm": 0.8355467738854072, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 42870 + }, + { + "epoch": 0.42871, + "grad_norm": 0.7793794175296205, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 42871 + }, + { + "epoch": 0.42872, + "grad_norm": 0.7363691014497533, + "learning_rate": 0.003, + "loss": 4.051, + "step": 42872 + }, + { + "epoch": 0.42873, + "grad_norm": 0.8950826499700826, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 42873 + }, + { + "epoch": 0.42874, + "grad_norm": 0.9833645194904524, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 42874 + }, + { + "epoch": 0.42875, + "grad_norm": 0.8588463347049592, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 42875 + }, + { + "epoch": 0.42876, + "grad_norm": 0.7985837820250082, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 42876 + }, + { + "epoch": 0.42877, + "grad_norm": 0.9026070382937714, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 42877 + }, + { + "epoch": 0.42878, + "grad_norm": 1.0030108788130692, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 42878 + }, + { + "epoch": 0.42879, + "grad_norm": 0.8957897915063192, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 42879 + }, + { + "epoch": 0.4288, + "grad_norm": 0.7774828129818958, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 42880 + }, + { + "epoch": 0.42881, + "grad_norm": 0.7199328585526409, + "learning_rate": 0.003, + "loss": 4.026, + "step": 42881 + }, + { + "epoch": 0.42882, + "grad_norm": 0.7406897117011476, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 42882 + }, + { + "epoch": 0.42883, + "grad_norm": 0.9176435822832469, + "learning_rate": 0.003, + "loss": 4.006, + "step": 42883 + }, + { + "epoch": 0.42884, + "grad_norm": 1.014828056485115, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 42884 + }, + { + "epoch": 0.42885, + "grad_norm": 0.9679518842368959, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 42885 + }, + { + "epoch": 0.42886, + "grad_norm": 0.9570629037858038, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 42886 + }, + { + "epoch": 0.42887, + "grad_norm": 0.9496223473665463, + "learning_rate": 0.003, + "loss": 4.041, + "step": 42887 + }, + { + "epoch": 0.42888, + "grad_norm": 0.8673493894649308, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 42888 + }, + { + "epoch": 0.42889, + "grad_norm": 0.8088214163064418, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 42889 + }, + { + "epoch": 0.4289, + "grad_norm": 0.755079929702602, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 42890 + }, + { + "epoch": 0.42891, + "grad_norm": 0.7127434129143231, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 42891 + }, + { + "epoch": 0.42892, + "grad_norm": 0.6172938204224495, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 42892 + }, + { + "epoch": 0.42893, + "grad_norm": 0.615907858916562, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 42893 + }, + { + "epoch": 0.42894, + "grad_norm": 0.7393853924267336, + "learning_rate": 0.003, + "loss": 4.022, + "step": 42894 + }, + { + "epoch": 0.42895, + "grad_norm": 0.919474498290313, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 42895 + }, + { + "epoch": 0.42896, + "grad_norm": 0.9914495747217329, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 42896 + }, + { + "epoch": 0.42897, + "grad_norm": 1.0077693751237737, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 42897 + }, + { + "epoch": 0.42898, + "grad_norm": 0.9823142632339531, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 42898 + }, + { + "epoch": 0.42899, + "grad_norm": 0.9092473507185589, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 42899 + }, + { + "epoch": 0.429, + "grad_norm": 0.8447260892696447, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 42900 + }, + { + "epoch": 0.42901, + "grad_norm": 0.7554109048719305, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 42901 + }, + { + "epoch": 0.42902, + "grad_norm": 0.8231822201558192, + "learning_rate": 0.003, + "loss": 4.061, + "step": 42902 + }, + { + "epoch": 0.42903, + "grad_norm": 0.7821839005724903, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 42903 + }, + { + "epoch": 0.42904, + "grad_norm": 0.7917176424845305, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 42904 + }, + { + "epoch": 0.42905, + "grad_norm": 0.8038760932472439, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 42905 + }, + { + "epoch": 0.42906, + "grad_norm": 0.833741048657983, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 42906 + }, + { + "epoch": 0.42907, + "grad_norm": 0.7888548233840917, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 42907 + }, + { + "epoch": 0.42908, + "grad_norm": 0.8379681580367652, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 42908 + }, + { + "epoch": 0.42909, + "grad_norm": 0.8189169424382367, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 42909 + }, + { + "epoch": 0.4291, + "grad_norm": 0.7502635908811487, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 42910 + }, + { + "epoch": 0.42911, + "grad_norm": 0.6816556228416731, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 42911 + }, + { + "epoch": 0.42912, + "grad_norm": 0.753871019748157, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 42912 + }, + { + "epoch": 0.42913, + "grad_norm": 0.8396766779465662, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 42913 + }, + { + "epoch": 0.42914, + "grad_norm": 0.8962511324842023, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 42914 + }, + { + "epoch": 0.42915, + "grad_norm": 0.9720090148254409, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 42915 + }, + { + "epoch": 0.42916, + "grad_norm": 0.9492299842099374, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 42916 + }, + { + "epoch": 0.42917, + "grad_norm": 0.8072445258542302, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 42917 + }, + { + "epoch": 0.42918, + "grad_norm": 0.5974809547890944, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 42918 + }, + { + "epoch": 0.42919, + "grad_norm": 0.6453834750583157, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 42919 + }, + { + "epoch": 0.4292, + "grad_norm": 0.6758121061665463, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 42920 + }, + { + "epoch": 0.42921, + "grad_norm": 0.7369522011595777, + "learning_rate": 0.003, + "loss": 4.03, + "step": 42921 + }, + { + "epoch": 0.42922, + "grad_norm": 0.6898329345129248, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 42922 + }, + { + "epoch": 0.42923, + "grad_norm": 0.723175261492936, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 42923 + }, + { + "epoch": 0.42924, + "grad_norm": 0.7568251082068023, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 42924 + }, + { + "epoch": 0.42925, + "grad_norm": 0.8221692625537081, + "learning_rate": 0.003, + "loss": 4.02, + "step": 42925 + }, + { + "epoch": 0.42926, + "grad_norm": 0.6959469265492827, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 42926 + }, + { + "epoch": 0.42927, + "grad_norm": 0.629143900214102, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 42927 + }, + { + "epoch": 0.42928, + "grad_norm": 0.5568256698427162, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 42928 + }, + { + "epoch": 0.42929, + "grad_norm": 0.5455910825610675, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 42929 + }, + { + "epoch": 0.4293, + "grad_norm": 0.58740401089055, + "learning_rate": 0.003, + "loss": 3.9745, + "step": 42930 + }, + { + "epoch": 0.42931, + "grad_norm": 0.8060062357573852, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 42931 + }, + { + "epoch": 0.42932, + "grad_norm": 1.252546872026421, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 42932 + }, + { + "epoch": 0.42933, + "grad_norm": 0.9072523670593684, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 42933 + }, + { + "epoch": 0.42934, + "grad_norm": 0.8897156877197829, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 42934 + }, + { + "epoch": 0.42935, + "grad_norm": 0.8563227745311082, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 42935 + }, + { + "epoch": 0.42936, + "grad_norm": 0.8305144200915799, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 42936 + }, + { + "epoch": 0.42937, + "grad_norm": 0.7981018970969561, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 42937 + }, + { + "epoch": 0.42938, + "grad_norm": 0.7831352234719641, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 42938 + }, + { + "epoch": 0.42939, + "grad_norm": 0.7189698206953768, + "learning_rate": 0.003, + "loss": 4.0694, + "step": 42939 + }, + { + "epoch": 0.4294, + "grad_norm": 0.8188340994057502, + "learning_rate": 0.003, + "loss": 3.9915, + "step": 42940 + }, + { + "epoch": 0.42941, + "grad_norm": 0.8686438207872049, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 42941 + }, + { + "epoch": 0.42942, + "grad_norm": 1.3263355238801369, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 42942 + }, + { + "epoch": 0.42943, + "grad_norm": 0.8249311110196851, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 42943 + }, + { + "epoch": 0.42944, + "grad_norm": 0.7501002547578036, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 42944 + }, + { + "epoch": 0.42945, + "grad_norm": 0.7491867581289656, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 42945 + }, + { + "epoch": 0.42946, + "grad_norm": 0.8120726490374326, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 42946 + }, + { + "epoch": 0.42947, + "grad_norm": 0.8682990233872976, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 42947 + }, + { + "epoch": 0.42948, + "grad_norm": 1.0090038233338166, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 42948 + }, + { + "epoch": 0.42949, + "grad_norm": 1.183036556421868, + "learning_rate": 0.003, + "loss": 4.039, + "step": 42949 + }, + { + "epoch": 0.4295, + "grad_norm": 0.9653122966938769, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 42950 + }, + { + "epoch": 0.42951, + "grad_norm": 1.0605903539109836, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 42951 + }, + { + "epoch": 0.42952, + "grad_norm": 0.8711913187765596, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 42952 + }, + { + "epoch": 0.42953, + "grad_norm": 0.8413844774730299, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 42953 + }, + { + "epoch": 0.42954, + "grad_norm": 0.8438814805656113, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 42954 + }, + { + "epoch": 0.42955, + "grad_norm": 0.9234947451312542, + "learning_rate": 0.003, + "loss": 4.03, + "step": 42955 + }, + { + "epoch": 0.42956, + "grad_norm": 0.9989667831910164, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 42956 + }, + { + "epoch": 0.42957, + "grad_norm": 1.0644723267273857, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 42957 + }, + { + "epoch": 0.42958, + "grad_norm": 0.9575033970254943, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 42958 + }, + { + "epoch": 0.42959, + "grad_norm": 0.7858015731188098, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 42959 + }, + { + "epoch": 0.4296, + "grad_norm": 0.7278653252213724, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 42960 + }, + { + "epoch": 0.42961, + "grad_norm": 0.6903276566181094, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 42961 + }, + { + "epoch": 0.42962, + "grad_norm": 0.6400309685364002, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 42962 + }, + { + "epoch": 0.42963, + "grad_norm": 0.6565910652451693, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 42963 + }, + { + "epoch": 0.42964, + "grad_norm": 0.7153042072670829, + "learning_rate": 0.003, + "loss": 4.03, + "step": 42964 + }, + { + "epoch": 0.42965, + "grad_norm": 0.8243558228958203, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 42965 + }, + { + "epoch": 0.42966, + "grad_norm": 0.946281894731026, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 42966 + }, + { + "epoch": 0.42967, + "grad_norm": 1.0909782619334596, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 42967 + }, + { + "epoch": 0.42968, + "grad_norm": 1.0242697555200073, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 42968 + }, + { + "epoch": 0.42969, + "grad_norm": 1.073841094531163, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 42969 + }, + { + "epoch": 0.4297, + "grad_norm": 1.0314658420112008, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 42970 + }, + { + "epoch": 0.42971, + "grad_norm": 1.1779487935087565, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 42971 + }, + { + "epoch": 0.42972, + "grad_norm": 0.8389467372205309, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 42972 + }, + { + "epoch": 0.42973, + "grad_norm": 0.6405950510362666, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 42973 + }, + { + "epoch": 0.42974, + "grad_norm": 0.6786921463610698, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 42974 + }, + { + "epoch": 0.42975, + "grad_norm": 0.856869200983531, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 42975 + }, + { + "epoch": 0.42976, + "grad_norm": 1.0587408266975633, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 42976 + }, + { + "epoch": 0.42977, + "grad_norm": 0.9390039266209631, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 42977 + }, + { + "epoch": 0.42978, + "grad_norm": 0.782953440728126, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 42978 + }, + { + "epoch": 0.42979, + "grad_norm": 0.7462695705089123, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 42979 + }, + { + "epoch": 0.4298, + "grad_norm": 0.7252010267890024, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 42980 + }, + { + "epoch": 0.42981, + "grad_norm": 0.7423458280682445, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 42981 + }, + { + "epoch": 0.42982, + "grad_norm": 0.7948266698760309, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 42982 + }, + { + "epoch": 0.42983, + "grad_norm": 0.6873625544496853, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 42983 + }, + { + "epoch": 0.42984, + "grad_norm": 0.7107882312916126, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 42984 + }, + { + "epoch": 0.42985, + "grad_norm": 0.7597600431059311, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 42985 + }, + { + "epoch": 0.42986, + "grad_norm": 0.7177993356343406, + "learning_rate": 0.003, + "loss": 4.026, + "step": 42986 + }, + { + "epoch": 0.42987, + "grad_norm": 0.7234426866970236, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 42987 + }, + { + "epoch": 0.42988, + "grad_norm": 0.7416412105865938, + "learning_rate": 0.003, + "loss": 4.039, + "step": 42988 + }, + { + "epoch": 0.42989, + "grad_norm": 0.682703301030877, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 42989 + }, + { + "epoch": 0.4299, + "grad_norm": 0.6738748336449195, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 42990 + }, + { + "epoch": 0.42991, + "grad_norm": 0.7526526233969605, + "learning_rate": 0.003, + "loss": 3.9776, + "step": 42991 + }, + { + "epoch": 0.42992, + "grad_norm": 0.8251148038402228, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 42992 + }, + { + "epoch": 0.42993, + "grad_norm": 0.9875816845181702, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 42993 + }, + { + "epoch": 0.42994, + "grad_norm": 1.368764696711705, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 42994 + }, + { + "epoch": 0.42995, + "grad_norm": 0.6635637105689958, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 42995 + }, + { + "epoch": 0.42996, + "grad_norm": 0.6194179402010275, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 42996 + }, + { + "epoch": 0.42997, + "grad_norm": 0.6740641613185527, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 42997 + }, + { + "epoch": 0.42998, + "grad_norm": 0.7335198745715528, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 42998 + }, + { + "epoch": 0.42999, + "grad_norm": 0.812493065818367, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 42999 + }, + { + "epoch": 0.43, + "grad_norm": 0.9004761646559409, + "learning_rate": 0.003, + "loss": 4.046, + "step": 43000 + }, + { + "epoch": 0.43001, + "grad_norm": 0.9087377136342425, + "learning_rate": 0.003, + "loss": 4.033, + "step": 43001 + }, + { + "epoch": 0.43002, + "grad_norm": 0.8400862184988456, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 43002 + }, + { + "epoch": 0.43003, + "grad_norm": 0.8392783383659603, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 43003 + }, + { + "epoch": 0.43004, + "grad_norm": 0.95668672211098, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 43004 + }, + { + "epoch": 0.43005, + "grad_norm": 0.9467965162726888, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 43005 + }, + { + "epoch": 0.43006, + "grad_norm": 1.1638182392296752, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 43006 + }, + { + "epoch": 0.43007, + "grad_norm": 1.1441110737196811, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 43007 + }, + { + "epoch": 0.43008, + "grad_norm": 0.8285485065141743, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 43008 + }, + { + "epoch": 0.43009, + "grad_norm": 0.7308078701960844, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 43009 + }, + { + "epoch": 0.4301, + "grad_norm": 0.737098312611496, + "learning_rate": 0.003, + "loss": 3.996, + "step": 43010 + }, + { + "epoch": 0.43011, + "grad_norm": 0.679628690139456, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 43011 + }, + { + "epoch": 0.43012, + "grad_norm": 0.6730021377079228, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 43012 + }, + { + "epoch": 0.43013, + "grad_norm": 0.7300062228893787, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 43013 + }, + { + "epoch": 0.43014, + "grad_norm": 0.7304015788508673, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 43014 + }, + { + "epoch": 0.43015, + "grad_norm": 0.8119577907807799, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 43015 + }, + { + "epoch": 0.43016, + "grad_norm": 0.9395934831492039, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 43016 + }, + { + "epoch": 0.43017, + "grad_norm": 0.8988803078164622, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 43017 + }, + { + "epoch": 0.43018, + "grad_norm": 0.7928723376709998, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 43018 + }, + { + "epoch": 0.43019, + "grad_norm": 0.7051991466300024, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 43019 + }, + { + "epoch": 0.4302, + "grad_norm": 0.6036507996939717, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 43020 + }, + { + "epoch": 0.43021, + "grad_norm": 0.7183310491486824, + "learning_rate": 0.003, + "loss": 3.994, + "step": 43021 + }, + { + "epoch": 0.43022, + "grad_norm": 0.9918196552543211, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 43022 + }, + { + "epoch": 0.43023, + "grad_norm": 1.496204465534801, + "learning_rate": 0.003, + "loss": 4.044, + "step": 43023 + }, + { + "epoch": 0.43024, + "grad_norm": 0.5441123801497313, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 43024 + }, + { + "epoch": 0.43025, + "grad_norm": 0.7598131530268912, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 43025 + }, + { + "epoch": 0.43026, + "grad_norm": 0.8761030318701682, + "learning_rate": 0.003, + "loss": 4.046, + "step": 43026 + }, + { + "epoch": 0.43027, + "grad_norm": 0.8562896463584908, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 43027 + }, + { + "epoch": 0.43028, + "grad_norm": 0.8132209203593437, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 43028 + }, + { + "epoch": 0.43029, + "grad_norm": 0.7791616546807534, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 43029 + }, + { + "epoch": 0.4303, + "grad_norm": 0.7412511087269721, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 43030 + }, + { + "epoch": 0.43031, + "grad_norm": 0.80322395066206, + "learning_rate": 0.003, + "loss": 4.018, + "step": 43031 + }, + { + "epoch": 0.43032, + "grad_norm": 0.8553994957678819, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 43032 + }, + { + "epoch": 0.43033, + "grad_norm": 0.8237409520193805, + "learning_rate": 0.003, + "loss": 4.0039, + "step": 43033 + }, + { + "epoch": 0.43034, + "grad_norm": 0.8476422503941231, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 43034 + }, + { + "epoch": 0.43035, + "grad_norm": 0.8403633770495582, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 43035 + }, + { + "epoch": 0.43036, + "grad_norm": 0.8547964705084728, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 43036 + }, + { + "epoch": 0.43037, + "grad_norm": 0.8058484129158732, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 43037 + }, + { + "epoch": 0.43038, + "grad_norm": 0.8805068032081143, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 43038 + }, + { + "epoch": 0.43039, + "grad_norm": 0.9699016739122518, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 43039 + }, + { + "epoch": 0.4304, + "grad_norm": 1.0185120943317114, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 43040 + }, + { + "epoch": 0.43041, + "grad_norm": 1.07857482838033, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 43041 + }, + { + "epoch": 0.43042, + "grad_norm": 1.0243495820510358, + "learning_rate": 0.003, + "loss": 3.9992, + "step": 43042 + }, + { + "epoch": 0.43043, + "grad_norm": 0.821354411956832, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 43043 + }, + { + "epoch": 0.43044, + "grad_norm": 0.7333171888699406, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 43044 + }, + { + "epoch": 0.43045, + "grad_norm": 0.6116902474233004, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 43045 + }, + { + "epoch": 0.43046, + "grad_norm": 0.7027138681130872, + "learning_rate": 0.003, + "loss": 4.041, + "step": 43046 + }, + { + "epoch": 0.43047, + "grad_norm": 0.7961951605376595, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 43047 + }, + { + "epoch": 0.43048, + "grad_norm": 0.9257597967292505, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 43048 + }, + { + "epoch": 0.43049, + "grad_norm": 1.0279386927420597, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 43049 + }, + { + "epoch": 0.4305, + "grad_norm": 0.9925299846289749, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 43050 + }, + { + "epoch": 0.43051, + "grad_norm": 1.0362853902412816, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 43051 + }, + { + "epoch": 0.43052, + "grad_norm": 0.8654324097601274, + "learning_rate": 0.003, + "loss": 4.0034, + "step": 43052 + }, + { + "epoch": 0.43053, + "grad_norm": 0.8066056590090388, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 43053 + }, + { + "epoch": 0.43054, + "grad_norm": 0.8577059238499339, + "learning_rate": 0.003, + "loss": 4.02, + "step": 43054 + }, + { + "epoch": 0.43055, + "grad_norm": 0.7896916372326983, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 43055 + }, + { + "epoch": 0.43056, + "grad_norm": 0.7218631167578998, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 43056 + }, + { + "epoch": 0.43057, + "grad_norm": 0.9289262522849433, + "learning_rate": 0.003, + "loss": 3.9946, + "step": 43057 + }, + { + "epoch": 0.43058, + "grad_norm": 1.0221476137660446, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 43058 + }, + { + "epoch": 0.43059, + "grad_norm": 1.085887623136704, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 43059 + }, + { + "epoch": 0.4306, + "grad_norm": 0.9361967954320329, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 43060 + }, + { + "epoch": 0.43061, + "grad_norm": 0.8671087090901064, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 43061 + }, + { + "epoch": 0.43062, + "grad_norm": 0.7204394005100748, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 43062 + }, + { + "epoch": 0.43063, + "grad_norm": 0.5890483728039377, + "learning_rate": 0.003, + "loss": 4.06, + "step": 43063 + }, + { + "epoch": 0.43064, + "grad_norm": 0.5609873125024146, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 43064 + }, + { + "epoch": 0.43065, + "grad_norm": 0.5684855210055092, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 43065 + }, + { + "epoch": 0.43066, + "grad_norm": 0.6828119545320079, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 43066 + }, + { + "epoch": 0.43067, + "grad_norm": 0.7993428373248384, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 43067 + }, + { + "epoch": 0.43068, + "grad_norm": 0.8937548115212365, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 43068 + }, + { + "epoch": 0.43069, + "grad_norm": 1.0810603786269342, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 43069 + }, + { + "epoch": 0.4307, + "grad_norm": 0.9162573171261696, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 43070 + }, + { + "epoch": 0.43071, + "grad_norm": 0.7584721890449345, + "learning_rate": 0.003, + "loss": 4.051, + "step": 43071 + }, + { + "epoch": 0.43072, + "grad_norm": 0.7709990482427441, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 43072 + }, + { + "epoch": 0.43073, + "grad_norm": 0.7409124134299695, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 43073 + }, + { + "epoch": 0.43074, + "grad_norm": 0.6765466790598315, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 43074 + }, + { + "epoch": 0.43075, + "grad_norm": 0.7618845114897798, + "learning_rate": 0.003, + "loss": 4.032, + "step": 43075 + }, + { + "epoch": 0.43076, + "grad_norm": 0.7265106420028865, + "learning_rate": 0.003, + "loss": 4.0003, + "step": 43076 + }, + { + "epoch": 0.43077, + "grad_norm": 0.6034194292757089, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 43077 + }, + { + "epoch": 0.43078, + "grad_norm": 0.6605256494350519, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 43078 + }, + { + "epoch": 0.43079, + "grad_norm": 0.7005567511008143, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 43079 + }, + { + "epoch": 0.4308, + "grad_norm": 0.7736339367074909, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 43080 + }, + { + "epoch": 0.43081, + "grad_norm": 0.8688352943828914, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 43081 + }, + { + "epoch": 0.43082, + "grad_norm": 0.9876631251814303, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 43082 + }, + { + "epoch": 0.43083, + "grad_norm": 1.0870989415355916, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 43083 + }, + { + "epoch": 0.43084, + "grad_norm": 0.9406500464815529, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 43084 + }, + { + "epoch": 0.43085, + "grad_norm": 0.9567840193775283, + "learning_rate": 0.003, + "loss": 4.057, + "step": 43085 + }, + { + "epoch": 0.43086, + "grad_norm": 0.900588124798482, + "learning_rate": 0.003, + "loss": 4.06, + "step": 43086 + }, + { + "epoch": 0.43087, + "grad_norm": 0.9720368689213774, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 43087 + }, + { + "epoch": 0.43088, + "grad_norm": 1.018487958524551, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 43088 + }, + { + "epoch": 0.43089, + "grad_norm": 1.1159658528029257, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 43089 + }, + { + "epoch": 0.4309, + "grad_norm": 0.859427332069627, + "learning_rate": 0.003, + "loss": 4.019, + "step": 43090 + }, + { + "epoch": 0.43091, + "grad_norm": 0.8735693342783583, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 43091 + }, + { + "epoch": 0.43092, + "grad_norm": 0.9442051578242553, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 43092 + }, + { + "epoch": 0.43093, + "grad_norm": 0.994410469930728, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 43093 + }, + { + "epoch": 0.43094, + "grad_norm": 0.9949871971345375, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 43094 + }, + { + "epoch": 0.43095, + "grad_norm": 1.0571680815498516, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 43095 + }, + { + "epoch": 0.43096, + "grad_norm": 0.947691943604103, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 43096 + }, + { + "epoch": 0.43097, + "grad_norm": 0.9598095299285752, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 43097 + }, + { + "epoch": 0.43098, + "grad_norm": 1.0295161637407113, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 43098 + }, + { + "epoch": 0.43099, + "grad_norm": 1.0364358928652837, + "learning_rate": 0.003, + "loss": 4.089, + "step": 43099 + }, + { + "epoch": 0.431, + "grad_norm": 0.9726081104733273, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 43100 + }, + { + "epoch": 0.43101, + "grad_norm": 0.8856256639641341, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 43101 + }, + { + "epoch": 0.43102, + "grad_norm": 0.9080646519248673, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 43102 + }, + { + "epoch": 0.43103, + "grad_norm": 0.9313276965147587, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 43103 + }, + { + "epoch": 0.43104, + "grad_norm": 0.8333553279886826, + "learning_rate": 0.003, + "loss": 4.075, + "step": 43104 + }, + { + "epoch": 0.43105, + "grad_norm": 0.8037975820950136, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 43105 + }, + { + "epoch": 0.43106, + "grad_norm": 0.7591036192330157, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 43106 + }, + { + "epoch": 0.43107, + "grad_norm": 0.7372600750148489, + "learning_rate": 0.003, + "loss": 4.053, + "step": 43107 + }, + { + "epoch": 0.43108, + "grad_norm": 0.9096136530399129, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 43108 + }, + { + "epoch": 0.43109, + "grad_norm": 1.0318991876466035, + "learning_rate": 0.003, + "loss": 4.028, + "step": 43109 + }, + { + "epoch": 0.4311, + "grad_norm": 1.0458108131188633, + "learning_rate": 0.003, + "loss": 4.054, + "step": 43110 + }, + { + "epoch": 0.43111, + "grad_norm": 0.9683570997511419, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 43111 + }, + { + "epoch": 0.43112, + "grad_norm": 0.8825933413866107, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 43112 + }, + { + "epoch": 0.43113, + "grad_norm": 0.9247925690469551, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 43113 + }, + { + "epoch": 0.43114, + "grad_norm": 0.8060097775465568, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 43114 + }, + { + "epoch": 0.43115, + "grad_norm": 0.7795023591092682, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 43115 + }, + { + "epoch": 0.43116, + "grad_norm": 0.6998060763640703, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 43116 + }, + { + "epoch": 0.43117, + "grad_norm": 0.6631041578505551, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 43117 + }, + { + "epoch": 0.43118, + "grad_norm": 0.5789675644685903, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 43118 + }, + { + "epoch": 0.43119, + "grad_norm": 0.5333824359422714, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 43119 + }, + { + "epoch": 0.4312, + "grad_norm": 0.4949050832783218, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 43120 + }, + { + "epoch": 0.43121, + "grad_norm": 0.5298142843677568, + "learning_rate": 0.003, + "loss": 3.9833, + "step": 43121 + }, + { + "epoch": 0.43122, + "grad_norm": 0.5114178279243745, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 43122 + }, + { + "epoch": 0.43123, + "grad_norm": 0.47582262479447857, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 43123 + }, + { + "epoch": 0.43124, + "grad_norm": 0.4953139927361343, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 43124 + }, + { + "epoch": 0.43125, + "grad_norm": 0.494938426039148, + "learning_rate": 0.003, + "loss": 3.9969, + "step": 43125 + }, + { + "epoch": 0.43126, + "grad_norm": 0.5718943612719959, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 43126 + }, + { + "epoch": 0.43127, + "grad_norm": 0.7434509048833726, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 43127 + }, + { + "epoch": 0.43128, + "grad_norm": 0.9825348696454063, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 43128 + }, + { + "epoch": 0.43129, + "grad_norm": 1.1727569052715685, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 43129 + }, + { + "epoch": 0.4313, + "grad_norm": 0.801630881459023, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 43130 + }, + { + "epoch": 0.43131, + "grad_norm": 0.71902553136949, + "learning_rate": 0.003, + "loss": 4.0046, + "step": 43131 + }, + { + "epoch": 0.43132, + "grad_norm": 0.7506598198866593, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 43132 + }, + { + "epoch": 0.43133, + "grad_norm": 0.7420384176439826, + "learning_rate": 0.003, + "loss": 4.0622, + "step": 43133 + }, + { + "epoch": 0.43134, + "grad_norm": 0.9119100291537988, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 43134 + }, + { + "epoch": 0.43135, + "grad_norm": 0.9464819078026645, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 43135 + }, + { + "epoch": 0.43136, + "grad_norm": 0.8086700621927019, + "learning_rate": 0.003, + "loss": 3.9827, + "step": 43136 + }, + { + "epoch": 0.43137, + "grad_norm": 0.7426882672285937, + "learning_rate": 0.003, + "loss": 3.9997, + "step": 43137 + }, + { + "epoch": 0.43138, + "grad_norm": 0.7102106703509331, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 43138 + }, + { + "epoch": 0.43139, + "grad_norm": 0.7774283881091459, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 43139 + }, + { + "epoch": 0.4314, + "grad_norm": 0.9802711894082664, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 43140 + }, + { + "epoch": 0.43141, + "grad_norm": 1.1044285595518233, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 43141 + }, + { + "epoch": 0.43142, + "grad_norm": 1.1883940811646834, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 43142 + }, + { + "epoch": 0.43143, + "grad_norm": 0.9743882977643076, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 43143 + }, + { + "epoch": 0.43144, + "grad_norm": 1.0682226184861814, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 43144 + }, + { + "epoch": 0.43145, + "grad_norm": 1.1038631655212547, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 43145 + }, + { + "epoch": 0.43146, + "grad_norm": 0.8872358589475443, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 43146 + }, + { + "epoch": 0.43147, + "grad_norm": 0.9313826157914223, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 43147 + }, + { + "epoch": 0.43148, + "grad_norm": 0.9519911337973339, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 43148 + }, + { + "epoch": 0.43149, + "grad_norm": 0.9635765299347299, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 43149 + }, + { + "epoch": 0.4315, + "grad_norm": 1.0110530839503142, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 43150 + }, + { + "epoch": 0.43151, + "grad_norm": 1.1701505308619624, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 43151 + }, + { + "epoch": 0.43152, + "grad_norm": 0.8558923502440007, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 43152 + }, + { + "epoch": 0.43153, + "grad_norm": 0.8694733289248251, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 43153 + }, + { + "epoch": 0.43154, + "grad_norm": 0.8776659494978463, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 43154 + }, + { + "epoch": 0.43155, + "grad_norm": 0.929993525233893, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 43155 + }, + { + "epoch": 0.43156, + "grad_norm": 0.8419059790238635, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 43156 + }, + { + "epoch": 0.43157, + "grad_norm": 0.696492211822815, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 43157 + }, + { + "epoch": 0.43158, + "grad_norm": 0.671319703938562, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 43158 + }, + { + "epoch": 0.43159, + "grad_norm": 0.7296076886795706, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 43159 + }, + { + "epoch": 0.4316, + "grad_norm": 0.7602475522534748, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 43160 + }, + { + "epoch": 0.43161, + "grad_norm": 0.764248090411343, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 43161 + }, + { + "epoch": 0.43162, + "grad_norm": 0.8227075336869917, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 43162 + }, + { + "epoch": 0.43163, + "grad_norm": 0.835557300325611, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 43163 + }, + { + "epoch": 0.43164, + "grad_norm": 0.7712495110570077, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 43164 + }, + { + "epoch": 0.43165, + "grad_norm": 0.6733032020429479, + "learning_rate": 0.003, + "loss": 4.03, + "step": 43165 + }, + { + "epoch": 0.43166, + "grad_norm": 0.7218970533798204, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 43166 + }, + { + "epoch": 0.43167, + "grad_norm": 0.7368169149391764, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 43167 + }, + { + "epoch": 0.43168, + "grad_norm": 0.6395970267555775, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 43168 + }, + { + "epoch": 0.43169, + "grad_norm": 0.563310645116705, + "learning_rate": 0.003, + "loss": 3.9943, + "step": 43169 + }, + { + "epoch": 0.4317, + "grad_norm": 0.5080356461192528, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 43170 + }, + { + "epoch": 0.43171, + "grad_norm": 0.48415354969005836, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 43171 + }, + { + "epoch": 0.43172, + "grad_norm": 0.5142635326675934, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 43172 + }, + { + "epoch": 0.43173, + "grad_norm": 0.6147851002720837, + "learning_rate": 0.003, + "loss": 3.9992, + "step": 43173 + }, + { + "epoch": 0.43174, + "grad_norm": 0.7291667565469698, + "learning_rate": 0.003, + "loss": 4.039, + "step": 43174 + }, + { + "epoch": 0.43175, + "grad_norm": 0.9122251296720509, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 43175 + }, + { + "epoch": 0.43176, + "grad_norm": 1.0583036113896922, + "learning_rate": 0.003, + "loss": 4.043, + "step": 43176 + }, + { + "epoch": 0.43177, + "grad_norm": 0.9303982070970958, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 43177 + }, + { + "epoch": 0.43178, + "grad_norm": 0.8636409560430945, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 43178 + }, + { + "epoch": 0.43179, + "grad_norm": 0.8644592467126586, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 43179 + }, + { + "epoch": 0.4318, + "grad_norm": 0.7398488425619598, + "learning_rate": 0.003, + "loss": 3.9916, + "step": 43180 + }, + { + "epoch": 0.43181, + "grad_norm": 0.7458912665511671, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 43181 + }, + { + "epoch": 0.43182, + "grad_norm": 0.7705762135408569, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 43182 + }, + { + "epoch": 0.43183, + "grad_norm": 0.9334158070881978, + "learning_rate": 0.003, + "loss": 4.018, + "step": 43183 + }, + { + "epoch": 0.43184, + "grad_norm": 1.0941987804269973, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 43184 + }, + { + "epoch": 0.43185, + "grad_norm": 1.0455861395351802, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 43185 + }, + { + "epoch": 0.43186, + "grad_norm": 1.186643977147884, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 43186 + }, + { + "epoch": 0.43187, + "grad_norm": 0.8137647773533114, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 43187 + }, + { + "epoch": 0.43188, + "grad_norm": 0.792798848852523, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 43188 + }, + { + "epoch": 0.43189, + "grad_norm": 0.7954574975706306, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 43189 + }, + { + "epoch": 0.4319, + "grad_norm": 0.8246903800244811, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 43190 + }, + { + "epoch": 0.43191, + "grad_norm": 0.9515401458449625, + "learning_rate": 0.003, + "loss": 4.0528, + "step": 43191 + }, + { + "epoch": 0.43192, + "grad_norm": 0.9333952257863417, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 43192 + }, + { + "epoch": 0.43193, + "grad_norm": 0.8789199800496003, + "learning_rate": 0.003, + "loss": 4.015, + "step": 43193 + }, + { + "epoch": 0.43194, + "grad_norm": 0.9808186033116592, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 43194 + }, + { + "epoch": 0.43195, + "grad_norm": 0.961897047413435, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 43195 + }, + { + "epoch": 0.43196, + "grad_norm": 1.1135337196112636, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 43196 + }, + { + "epoch": 0.43197, + "grad_norm": 0.9094454167844767, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 43197 + }, + { + "epoch": 0.43198, + "grad_norm": 0.8578946307065131, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 43198 + }, + { + "epoch": 0.43199, + "grad_norm": 1.0339847241120128, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 43199 + }, + { + "epoch": 0.432, + "grad_norm": 1.0403988351977422, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 43200 + }, + { + "epoch": 0.43201, + "grad_norm": 1.0633643416449698, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 43201 + }, + { + "epoch": 0.43202, + "grad_norm": 0.8699175142316438, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 43202 + }, + { + "epoch": 0.43203, + "grad_norm": 0.7452898244088861, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 43203 + }, + { + "epoch": 0.43204, + "grad_norm": 0.7297535656308538, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 43204 + }, + { + "epoch": 0.43205, + "grad_norm": 0.6203266785099536, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 43205 + }, + { + "epoch": 0.43206, + "grad_norm": 0.597258300066205, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 43206 + }, + { + "epoch": 0.43207, + "grad_norm": 0.6283801322291227, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 43207 + }, + { + "epoch": 0.43208, + "grad_norm": 0.613297227755919, + "learning_rate": 0.003, + "loss": 4.018, + "step": 43208 + }, + { + "epoch": 0.43209, + "grad_norm": 0.7203291719743689, + "learning_rate": 0.003, + "loss": 4.032, + "step": 43209 + }, + { + "epoch": 0.4321, + "grad_norm": 0.866856329067245, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 43210 + }, + { + "epoch": 0.43211, + "grad_norm": 0.9109694474209094, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 43211 + }, + { + "epoch": 0.43212, + "grad_norm": 0.9824967029664309, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 43212 + }, + { + "epoch": 0.43213, + "grad_norm": 1.157358563573204, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 43213 + }, + { + "epoch": 0.43214, + "grad_norm": 0.8945068767559534, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 43214 + }, + { + "epoch": 0.43215, + "grad_norm": 0.7816851239772544, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 43215 + }, + { + "epoch": 0.43216, + "grad_norm": 0.7116306521503376, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 43216 + }, + { + "epoch": 0.43217, + "grad_norm": 0.7497263478009574, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 43217 + }, + { + "epoch": 0.43218, + "grad_norm": 0.7385971668645096, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 43218 + }, + { + "epoch": 0.43219, + "grad_norm": 0.6514441922298643, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 43219 + }, + { + "epoch": 0.4322, + "grad_norm": 0.6520029455822146, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 43220 + }, + { + "epoch": 0.43221, + "grad_norm": 0.6439022648215855, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 43221 + }, + { + "epoch": 0.43222, + "grad_norm": 0.7480876920535026, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 43222 + }, + { + "epoch": 0.43223, + "grad_norm": 0.7266071982212425, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 43223 + }, + { + "epoch": 0.43224, + "grad_norm": 0.74901448928792, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 43224 + }, + { + "epoch": 0.43225, + "grad_norm": 0.6798872814120761, + "learning_rate": 0.003, + "loss": 4.035, + "step": 43225 + }, + { + "epoch": 0.43226, + "grad_norm": 0.6215185973138431, + "learning_rate": 0.003, + "loss": 3.9959, + "step": 43226 + }, + { + "epoch": 0.43227, + "grad_norm": 0.7288121808655073, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 43227 + }, + { + "epoch": 0.43228, + "grad_norm": 0.8747341074968756, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 43228 + }, + { + "epoch": 0.43229, + "grad_norm": 0.982616010545514, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 43229 + }, + { + "epoch": 0.4323, + "grad_norm": 1.311163108752391, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 43230 + }, + { + "epoch": 0.43231, + "grad_norm": 0.6942142292934189, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 43231 + }, + { + "epoch": 0.43232, + "grad_norm": 0.6465404575884032, + "learning_rate": 0.003, + "loss": 4.023, + "step": 43232 + }, + { + "epoch": 0.43233, + "grad_norm": 0.7474519743898597, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 43233 + }, + { + "epoch": 0.43234, + "grad_norm": 0.8243241955403177, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 43234 + }, + { + "epoch": 0.43235, + "grad_norm": 0.9099815088249948, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 43235 + }, + { + "epoch": 0.43236, + "grad_norm": 0.8915968254451624, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 43236 + }, + { + "epoch": 0.43237, + "grad_norm": 0.8968644603082273, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 43237 + }, + { + "epoch": 0.43238, + "grad_norm": 0.944590984233911, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 43238 + }, + { + "epoch": 0.43239, + "grad_norm": 0.8256191503277314, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 43239 + }, + { + "epoch": 0.4324, + "grad_norm": 0.8003172895924394, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 43240 + }, + { + "epoch": 0.43241, + "grad_norm": 0.8035573147656259, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 43241 + }, + { + "epoch": 0.43242, + "grad_norm": 0.7738203712823857, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 43242 + }, + { + "epoch": 0.43243, + "grad_norm": 0.7792359775178351, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 43243 + }, + { + "epoch": 0.43244, + "grad_norm": 0.7493978121147893, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 43244 + }, + { + "epoch": 0.43245, + "grad_norm": 0.7741345725470958, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 43245 + }, + { + "epoch": 0.43246, + "grad_norm": 0.7696509889829356, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 43246 + }, + { + "epoch": 0.43247, + "grad_norm": 0.8574346502269888, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 43247 + }, + { + "epoch": 0.43248, + "grad_norm": 0.8278946137852844, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 43248 + }, + { + "epoch": 0.43249, + "grad_norm": 0.8591812581017704, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 43249 + }, + { + "epoch": 0.4325, + "grad_norm": 0.9387197297948986, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 43250 + }, + { + "epoch": 0.43251, + "grad_norm": 1.1317796772114845, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 43251 + }, + { + "epoch": 0.43252, + "grad_norm": 1.0067076556295524, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 43252 + }, + { + "epoch": 0.43253, + "grad_norm": 1.080328826319412, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 43253 + }, + { + "epoch": 0.43254, + "grad_norm": 1.0281704056572025, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 43254 + }, + { + "epoch": 0.43255, + "grad_norm": 1.0055964452307435, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 43255 + }, + { + "epoch": 0.43256, + "grad_norm": 1.0073530818683436, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 43256 + }, + { + "epoch": 0.43257, + "grad_norm": 0.9042261343952354, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 43257 + }, + { + "epoch": 0.43258, + "grad_norm": 0.89902037413967, + "learning_rate": 0.003, + "loss": 4.0873, + "step": 43258 + }, + { + "epoch": 0.43259, + "grad_norm": 0.7869056212483957, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 43259 + }, + { + "epoch": 0.4326, + "grad_norm": 0.8064931679776411, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 43260 + }, + { + "epoch": 0.43261, + "grad_norm": 0.9248804630372633, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 43261 + }, + { + "epoch": 0.43262, + "grad_norm": 0.922225357905002, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 43262 + }, + { + "epoch": 0.43263, + "grad_norm": 0.9583398944309776, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 43263 + }, + { + "epoch": 0.43264, + "grad_norm": 1.0059625708626458, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 43264 + }, + { + "epoch": 0.43265, + "grad_norm": 0.9927866230499051, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 43265 + }, + { + "epoch": 0.43266, + "grad_norm": 0.961098004788376, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 43266 + }, + { + "epoch": 0.43267, + "grad_norm": 0.9656779327024331, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 43267 + }, + { + "epoch": 0.43268, + "grad_norm": 0.9105421063209898, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 43268 + }, + { + "epoch": 0.43269, + "grad_norm": 0.8905263580201442, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 43269 + }, + { + "epoch": 0.4327, + "grad_norm": 1.0768875804051616, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 43270 + }, + { + "epoch": 0.43271, + "grad_norm": 1.181269004611643, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 43271 + }, + { + "epoch": 0.43272, + "grad_norm": 0.8693577403662598, + "learning_rate": 0.003, + "loss": 4.037, + "step": 43272 + }, + { + "epoch": 0.43273, + "grad_norm": 0.7392370350241811, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 43273 + }, + { + "epoch": 0.43274, + "grad_norm": 0.6146710571569014, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 43274 + }, + { + "epoch": 0.43275, + "grad_norm": 0.490989726887227, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 43275 + }, + { + "epoch": 0.43276, + "grad_norm": 0.562002917915055, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 43276 + }, + { + "epoch": 0.43277, + "grad_norm": 0.5707378117607521, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 43277 + }, + { + "epoch": 0.43278, + "grad_norm": 0.5378544010180769, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 43278 + }, + { + "epoch": 0.43279, + "grad_norm": 0.49635558330372065, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 43279 + }, + { + "epoch": 0.4328, + "grad_norm": 0.5165412951059859, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 43280 + }, + { + "epoch": 0.43281, + "grad_norm": 0.5099672020490411, + "learning_rate": 0.003, + "loss": 4.002, + "step": 43281 + }, + { + "epoch": 0.43282, + "grad_norm": 0.6628724546362531, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 43282 + }, + { + "epoch": 0.43283, + "grad_norm": 0.9189320466090536, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 43283 + }, + { + "epoch": 0.43284, + "grad_norm": 1.190667888453178, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 43284 + }, + { + "epoch": 0.43285, + "grad_norm": 0.6728489562854619, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 43285 + }, + { + "epoch": 0.43286, + "grad_norm": 0.66315269819834, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 43286 + }, + { + "epoch": 0.43287, + "grad_norm": 0.7510606578767993, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 43287 + }, + { + "epoch": 0.43288, + "grad_norm": 0.8383100281036583, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 43288 + }, + { + "epoch": 0.43289, + "grad_norm": 0.8267452872754145, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 43289 + }, + { + "epoch": 0.4329, + "grad_norm": 0.8494921336470659, + "learning_rate": 0.003, + "loss": 3.9987, + "step": 43290 + }, + { + "epoch": 0.43291, + "grad_norm": 1.0105921840192242, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 43291 + }, + { + "epoch": 0.43292, + "grad_norm": 1.1067045816904244, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 43292 + }, + { + "epoch": 0.43293, + "grad_norm": 1.0094027440303768, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 43293 + }, + { + "epoch": 0.43294, + "grad_norm": 1.0394233901060506, + "learning_rate": 0.003, + "loss": 4.0751, + "step": 43294 + }, + { + "epoch": 0.43295, + "grad_norm": 0.9987693764702628, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 43295 + }, + { + "epoch": 0.43296, + "grad_norm": 0.9921720990465069, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 43296 + }, + { + "epoch": 0.43297, + "grad_norm": 0.8827393699722851, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 43297 + }, + { + "epoch": 0.43298, + "grad_norm": 0.8051849441252233, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 43298 + }, + { + "epoch": 0.43299, + "grad_norm": 0.7864610350158853, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 43299 + }, + { + "epoch": 0.433, + "grad_norm": 0.6553705085429777, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 43300 + }, + { + "epoch": 0.43301, + "grad_norm": 0.5680520760897454, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 43301 + }, + { + "epoch": 0.43302, + "grad_norm": 0.5646242363748221, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 43302 + }, + { + "epoch": 0.43303, + "grad_norm": 0.5878744839302119, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 43303 + }, + { + "epoch": 0.43304, + "grad_norm": 0.6237863099743577, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 43304 + }, + { + "epoch": 0.43305, + "grad_norm": 0.6398737677115074, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 43305 + }, + { + "epoch": 0.43306, + "grad_norm": 0.6471495868501359, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 43306 + }, + { + "epoch": 0.43307, + "grad_norm": 0.6508322472107231, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 43307 + }, + { + "epoch": 0.43308, + "grad_norm": 0.6266471938888, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 43308 + }, + { + "epoch": 0.43309, + "grad_norm": 0.7882519431993698, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 43309 + }, + { + "epoch": 0.4331, + "grad_norm": 1.1094321731418733, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 43310 + }, + { + "epoch": 0.43311, + "grad_norm": 1.1471851432092668, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 43311 + }, + { + "epoch": 0.43312, + "grad_norm": 1.102093901715195, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 43312 + }, + { + "epoch": 0.43313, + "grad_norm": 0.9801173233730814, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 43313 + }, + { + "epoch": 0.43314, + "grad_norm": 0.9965055837403528, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 43314 + }, + { + "epoch": 0.43315, + "grad_norm": 0.9747329851441193, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 43315 + }, + { + "epoch": 0.43316, + "grad_norm": 0.9323594597459975, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 43316 + }, + { + "epoch": 0.43317, + "grad_norm": 0.9425303143320097, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 43317 + }, + { + "epoch": 0.43318, + "grad_norm": 0.9111103434346208, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 43318 + }, + { + "epoch": 0.43319, + "grad_norm": 1.0249867841813107, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 43319 + }, + { + "epoch": 0.4332, + "grad_norm": 1.0228993127473625, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 43320 + }, + { + "epoch": 0.43321, + "grad_norm": 0.8711846727670055, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 43321 + }, + { + "epoch": 0.43322, + "grad_norm": 0.6574968719249109, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 43322 + }, + { + "epoch": 0.43323, + "grad_norm": 0.6094436546977966, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 43323 + }, + { + "epoch": 0.43324, + "grad_norm": 0.5631439923080862, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 43324 + }, + { + "epoch": 0.43325, + "grad_norm": 0.5930305135193925, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 43325 + }, + { + "epoch": 0.43326, + "grad_norm": 0.6635321899019363, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 43326 + }, + { + "epoch": 0.43327, + "grad_norm": 0.7420267931671596, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 43327 + }, + { + "epoch": 0.43328, + "grad_norm": 0.8329883141930766, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 43328 + }, + { + "epoch": 0.43329, + "grad_norm": 0.8262544214371063, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 43329 + }, + { + "epoch": 0.4333, + "grad_norm": 0.8792831506817201, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 43330 + }, + { + "epoch": 0.43331, + "grad_norm": 1.0173312211171763, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 43331 + }, + { + "epoch": 0.43332, + "grad_norm": 0.932498786858504, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 43332 + }, + { + "epoch": 0.43333, + "grad_norm": 0.8937736782297574, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 43333 + }, + { + "epoch": 0.43334, + "grad_norm": 0.8024084327053211, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 43334 + }, + { + "epoch": 0.43335, + "grad_norm": 0.7071870310673251, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 43335 + }, + { + "epoch": 0.43336, + "grad_norm": 0.7513102506375687, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 43336 + }, + { + "epoch": 0.43337, + "grad_norm": 0.8562980306392624, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 43337 + }, + { + "epoch": 0.43338, + "grad_norm": 0.8823153714435413, + "learning_rate": 0.003, + "loss": 4.051, + "step": 43338 + }, + { + "epoch": 0.43339, + "grad_norm": 0.8639271293616045, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 43339 + }, + { + "epoch": 0.4334, + "grad_norm": 0.9289875598990318, + "learning_rate": 0.003, + "loss": 4.049, + "step": 43340 + }, + { + "epoch": 0.43341, + "grad_norm": 1.0470296370661398, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 43341 + }, + { + "epoch": 0.43342, + "grad_norm": 1.1247742016402236, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 43342 + }, + { + "epoch": 0.43343, + "grad_norm": 0.8542465885593993, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 43343 + }, + { + "epoch": 0.43344, + "grad_norm": 0.8042357978534538, + "learning_rate": 0.003, + "loss": 4.0503, + "step": 43344 + }, + { + "epoch": 0.43345, + "grad_norm": 0.8228505026214724, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 43345 + }, + { + "epoch": 0.43346, + "grad_norm": 0.8735785206010978, + "learning_rate": 0.003, + "loss": 4.057, + "step": 43346 + }, + { + "epoch": 0.43347, + "grad_norm": 0.8989918929708376, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 43347 + }, + { + "epoch": 0.43348, + "grad_norm": 0.8665484999191859, + "learning_rate": 0.003, + "loss": 3.9948, + "step": 43348 + }, + { + "epoch": 0.43349, + "grad_norm": 0.9632971057068427, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 43349 + }, + { + "epoch": 0.4335, + "grad_norm": 1.1099638136734933, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 43350 + }, + { + "epoch": 0.43351, + "grad_norm": 0.9187327329919898, + "learning_rate": 0.003, + "loss": 4.0742, + "step": 43351 + }, + { + "epoch": 0.43352, + "grad_norm": 0.7678869290564166, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 43352 + }, + { + "epoch": 0.43353, + "grad_norm": 0.7449134263731182, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 43353 + }, + { + "epoch": 0.43354, + "grad_norm": 0.6994636390798792, + "learning_rate": 0.003, + "loss": 3.9875, + "step": 43354 + }, + { + "epoch": 0.43355, + "grad_norm": 0.6837468813984254, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 43355 + }, + { + "epoch": 0.43356, + "grad_norm": 0.7243081765941572, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 43356 + }, + { + "epoch": 0.43357, + "grad_norm": 0.6786445055917298, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 43357 + }, + { + "epoch": 0.43358, + "grad_norm": 0.6929724746034451, + "learning_rate": 0.003, + "loss": 3.9965, + "step": 43358 + }, + { + "epoch": 0.43359, + "grad_norm": 0.9076965657277396, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 43359 + }, + { + "epoch": 0.4336, + "grad_norm": 1.1878661916907032, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 43360 + }, + { + "epoch": 0.43361, + "grad_norm": 1.0196065114232504, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 43361 + }, + { + "epoch": 0.43362, + "grad_norm": 0.8390609419160356, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 43362 + }, + { + "epoch": 0.43363, + "grad_norm": 0.6970503597605257, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 43363 + }, + { + "epoch": 0.43364, + "grad_norm": 0.6656577471206989, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 43364 + }, + { + "epoch": 0.43365, + "grad_norm": 0.6022167898223505, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 43365 + }, + { + "epoch": 0.43366, + "grad_norm": 0.5543353953482442, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 43366 + }, + { + "epoch": 0.43367, + "grad_norm": 0.5551101479245891, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 43367 + }, + { + "epoch": 0.43368, + "grad_norm": 0.5787648352698543, + "learning_rate": 0.003, + "loss": 4.029, + "step": 43368 + }, + { + "epoch": 0.43369, + "grad_norm": 0.6474038073403747, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 43369 + }, + { + "epoch": 0.4337, + "grad_norm": 0.7043590179731447, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 43370 + }, + { + "epoch": 0.43371, + "grad_norm": 0.7321686836058805, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 43371 + }, + { + "epoch": 0.43372, + "grad_norm": 0.7129208834842626, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 43372 + }, + { + "epoch": 0.43373, + "grad_norm": 0.8027651553085968, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 43373 + }, + { + "epoch": 0.43374, + "grad_norm": 0.8543991164337578, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 43374 + }, + { + "epoch": 0.43375, + "grad_norm": 0.9819643773001091, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 43375 + }, + { + "epoch": 0.43376, + "grad_norm": 1.2621022092813534, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 43376 + }, + { + "epoch": 0.43377, + "grad_norm": 0.7990952740589937, + "learning_rate": 0.003, + "loss": 3.9852, + "step": 43377 + }, + { + "epoch": 0.43378, + "grad_norm": 0.7085638597835905, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 43378 + }, + { + "epoch": 0.43379, + "grad_norm": 0.7366804434210531, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 43379 + }, + { + "epoch": 0.4338, + "grad_norm": 0.7180849032737004, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 43380 + }, + { + "epoch": 0.43381, + "grad_norm": 0.7177172891389968, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 43381 + }, + { + "epoch": 0.43382, + "grad_norm": 0.7191043834691178, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 43382 + }, + { + "epoch": 0.43383, + "grad_norm": 0.7542235778779041, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 43383 + }, + { + "epoch": 0.43384, + "grad_norm": 0.6751149258944255, + "learning_rate": 0.003, + "loss": 3.9844, + "step": 43384 + }, + { + "epoch": 0.43385, + "grad_norm": 0.6057086077620565, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 43385 + }, + { + "epoch": 0.43386, + "grad_norm": 0.6339956301354789, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 43386 + }, + { + "epoch": 0.43387, + "grad_norm": 0.6429248395160192, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 43387 + }, + { + "epoch": 0.43388, + "grad_norm": 0.6978240117220628, + "learning_rate": 0.003, + "loss": 3.9837, + "step": 43388 + }, + { + "epoch": 0.43389, + "grad_norm": 0.8394160137990332, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 43389 + }, + { + "epoch": 0.4339, + "grad_norm": 1.124673572279606, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 43390 + }, + { + "epoch": 0.43391, + "grad_norm": 1.1914552016824291, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 43391 + }, + { + "epoch": 0.43392, + "grad_norm": 0.8982288327033094, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 43392 + }, + { + "epoch": 0.43393, + "grad_norm": 0.9091466770161503, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 43393 + }, + { + "epoch": 0.43394, + "grad_norm": 0.9696939976492817, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 43394 + }, + { + "epoch": 0.43395, + "grad_norm": 0.919994600584643, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 43395 + }, + { + "epoch": 0.43396, + "grad_norm": 0.9292380892356144, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 43396 + }, + { + "epoch": 0.43397, + "grad_norm": 0.8790241504344755, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 43397 + }, + { + "epoch": 0.43398, + "grad_norm": 0.7954885384691646, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 43398 + }, + { + "epoch": 0.43399, + "grad_norm": 0.8993691571785314, + "learning_rate": 0.003, + "loss": 4.044, + "step": 43399 + }, + { + "epoch": 0.434, + "grad_norm": 1.085508312445184, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 43400 + }, + { + "epoch": 0.43401, + "grad_norm": 0.9261948228693855, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 43401 + }, + { + "epoch": 0.43402, + "grad_norm": 0.8741628892823884, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 43402 + }, + { + "epoch": 0.43403, + "grad_norm": 0.8309145638653863, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 43403 + }, + { + "epoch": 0.43404, + "grad_norm": 0.8179501457709999, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 43404 + }, + { + "epoch": 0.43405, + "grad_norm": 0.9460364856978375, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 43405 + }, + { + "epoch": 0.43406, + "grad_norm": 1.0133491299024713, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 43406 + }, + { + "epoch": 0.43407, + "grad_norm": 0.9085045781716513, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 43407 + }, + { + "epoch": 0.43408, + "grad_norm": 1.0139783191768443, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 43408 + }, + { + "epoch": 0.43409, + "grad_norm": 1.0813026525664926, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 43409 + }, + { + "epoch": 0.4341, + "grad_norm": 0.9888358356879187, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 43410 + }, + { + "epoch": 0.43411, + "grad_norm": 1.10615630382423, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 43411 + }, + { + "epoch": 0.43412, + "grad_norm": 0.7854388368611772, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 43412 + }, + { + "epoch": 0.43413, + "grad_norm": 0.8250180392126357, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 43413 + }, + { + "epoch": 0.43414, + "grad_norm": 0.842634605037293, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 43414 + }, + { + "epoch": 0.43415, + "grad_norm": 0.9353788463464724, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 43415 + }, + { + "epoch": 0.43416, + "grad_norm": 0.9566091099352961, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 43416 + }, + { + "epoch": 0.43417, + "grad_norm": 1.0595731466278038, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 43417 + }, + { + "epoch": 0.43418, + "grad_norm": 0.8835536237614647, + "learning_rate": 0.003, + "loss": 4.019, + "step": 43418 + }, + { + "epoch": 0.43419, + "grad_norm": 0.8257682122008752, + "learning_rate": 0.003, + "loss": 4.037, + "step": 43419 + }, + { + "epoch": 0.4342, + "grad_norm": 0.8926701009260202, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 43420 + }, + { + "epoch": 0.43421, + "grad_norm": 0.7963963230987153, + "learning_rate": 0.003, + "loss": 3.992, + "step": 43421 + }, + { + "epoch": 0.43422, + "grad_norm": 0.7208585220487992, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 43422 + }, + { + "epoch": 0.43423, + "grad_norm": 0.7717285701860351, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 43423 + }, + { + "epoch": 0.43424, + "grad_norm": 0.7931579438544595, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 43424 + }, + { + "epoch": 0.43425, + "grad_norm": 0.7801339389504667, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 43425 + }, + { + "epoch": 0.43426, + "grad_norm": 0.8003406779882298, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 43426 + }, + { + "epoch": 0.43427, + "grad_norm": 0.92658023056427, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 43427 + }, + { + "epoch": 0.43428, + "grad_norm": 1.0714896906711247, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 43428 + }, + { + "epoch": 0.43429, + "grad_norm": 1.099391854647327, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 43429 + }, + { + "epoch": 0.4343, + "grad_norm": 0.7614775610465581, + "learning_rate": 0.003, + "loss": 3.9865, + "step": 43430 + }, + { + "epoch": 0.43431, + "grad_norm": 0.5632315979789522, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 43431 + }, + { + "epoch": 0.43432, + "grad_norm": 0.7612485317711506, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 43432 + }, + { + "epoch": 0.43433, + "grad_norm": 0.8038044537319459, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 43433 + }, + { + "epoch": 0.43434, + "grad_norm": 0.7376772327741025, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 43434 + }, + { + "epoch": 0.43435, + "grad_norm": 0.617345176766912, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 43435 + }, + { + "epoch": 0.43436, + "grad_norm": 0.5905114809914719, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 43436 + }, + { + "epoch": 0.43437, + "grad_norm": 0.6076208445758671, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 43437 + }, + { + "epoch": 0.43438, + "grad_norm": 0.6477996302473453, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 43438 + }, + { + "epoch": 0.43439, + "grad_norm": 0.6495903350247421, + "learning_rate": 0.003, + "loss": 3.9958, + "step": 43439 + }, + { + "epoch": 0.4344, + "grad_norm": 0.6566756658976386, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 43440 + }, + { + "epoch": 0.43441, + "grad_norm": 0.6602349380914017, + "learning_rate": 0.003, + "loss": 3.9833, + "step": 43441 + }, + { + "epoch": 0.43442, + "grad_norm": 0.6926316797876483, + "learning_rate": 0.003, + "loss": 3.9973, + "step": 43442 + }, + { + "epoch": 0.43443, + "grad_norm": 0.7430285124664695, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 43443 + }, + { + "epoch": 0.43444, + "grad_norm": 0.854445066048825, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 43444 + }, + { + "epoch": 0.43445, + "grad_norm": 1.0519606677101052, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 43445 + }, + { + "epoch": 0.43446, + "grad_norm": 0.8788391968500021, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 43446 + }, + { + "epoch": 0.43447, + "grad_norm": 0.8362227256225355, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 43447 + }, + { + "epoch": 0.43448, + "grad_norm": 0.9018230362914557, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 43448 + }, + { + "epoch": 0.43449, + "grad_norm": 0.9928674674557062, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 43449 + }, + { + "epoch": 0.4345, + "grad_norm": 0.8214787409002833, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 43450 + }, + { + "epoch": 0.43451, + "grad_norm": 0.7259489565596409, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 43451 + }, + { + "epoch": 0.43452, + "grad_norm": 0.6851510131623305, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 43452 + }, + { + "epoch": 0.43453, + "grad_norm": 0.8091647337862398, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 43453 + }, + { + "epoch": 0.43454, + "grad_norm": 0.9923044438980795, + "learning_rate": 0.003, + "loss": 4.018, + "step": 43454 + }, + { + "epoch": 0.43455, + "grad_norm": 1.1532955606700068, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 43455 + }, + { + "epoch": 0.43456, + "grad_norm": 1.0596395465278443, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 43456 + }, + { + "epoch": 0.43457, + "grad_norm": 1.07288716971704, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 43457 + }, + { + "epoch": 0.43458, + "grad_norm": 0.9232250863993654, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 43458 + }, + { + "epoch": 0.43459, + "grad_norm": 0.877627258416937, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 43459 + }, + { + "epoch": 0.4346, + "grad_norm": 0.8244290499963579, + "learning_rate": 0.003, + "loss": 4.042, + "step": 43460 + }, + { + "epoch": 0.43461, + "grad_norm": 0.8203943596070007, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 43461 + }, + { + "epoch": 0.43462, + "grad_norm": 0.7815636207826623, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 43462 + }, + { + "epoch": 0.43463, + "grad_norm": 0.7814035765664797, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 43463 + }, + { + "epoch": 0.43464, + "grad_norm": 1.026807595286563, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 43464 + }, + { + "epoch": 0.43465, + "grad_norm": 1.1999520850653038, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 43465 + }, + { + "epoch": 0.43466, + "grad_norm": 0.8630928085502546, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 43466 + }, + { + "epoch": 0.43467, + "grad_norm": 0.887070206334236, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 43467 + }, + { + "epoch": 0.43468, + "grad_norm": 0.9279129721060284, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 43468 + }, + { + "epoch": 0.43469, + "grad_norm": 0.9622328268338406, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 43469 + }, + { + "epoch": 0.4347, + "grad_norm": 0.9026422849878657, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 43470 + }, + { + "epoch": 0.43471, + "grad_norm": 0.8198279504243022, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 43471 + }, + { + "epoch": 0.43472, + "grad_norm": 0.8027536325274386, + "learning_rate": 0.003, + "loss": 4.022, + "step": 43472 + }, + { + "epoch": 0.43473, + "grad_norm": 0.9293668566163602, + "learning_rate": 0.003, + "loss": 4.04, + "step": 43473 + }, + { + "epoch": 0.43474, + "grad_norm": 1.104854655534921, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 43474 + }, + { + "epoch": 0.43475, + "grad_norm": 0.9989497440470331, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 43475 + }, + { + "epoch": 0.43476, + "grad_norm": 0.9645608124501852, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 43476 + }, + { + "epoch": 0.43477, + "grad_norm": 0.7772241726999962, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 43477 + }, + { + "epoch": 0.43478, + "grad_norm": 0.6404369896446814, + "learning_rate": 0.003, + "loss": 3.9914, + "step": 43478 + }, + { + "epoch": 0.43479, + "grad_norm": 0.7104044349641992, + "learning_rate": 0.003, + "loss": 4.045, + "step": 43479 + }, + { + "epoch": 0.4348, + "grad_norm": 0.8557486121018045, + "learning_rate": 0.003, + "loss": 4.0046, + "step": 43480 + }, + { + "epoch": 0.43481, + "grad_norm": 0.9342682447747509, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 43481 + }, + { + "epoch": 0.43482, + "grad_norm": 0.8887338366151137, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 43482 + }, + { + "epoch": 0.43483, + "grad_norm": 0.9496069599369185, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 43483 + }, + { + "epoch": 0.43484, + "grad_norm": 0.8754884160804818, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 43484 + }, + { + "epoch": 0.43485, + "grad_norm": 0.6882203500732651, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 43485 + }, + { + "epoch": 0.43486, + "grad_norm": 0.685670730886181, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 43486 + }, + { + "epoch": 0.43487, + "grad_norm": 0.6508450285999834, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 43487 + }, + { + "epoch": 0.43488, + "grad_norm": 0.6872587046468891, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 43488 + }, + { + "epoch": 0.43489, + "grad_norm": 0.6746121972474208, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 43489 + }, + { + "epoch": 0.4349, + "grad_norm": 0.6088006912089797, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 43490 + }, + { + "epoch": 0.43491, + "grad_norm": 0.6584223499654089, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 43491 + }, + { + "epoch": 0.43492, + "grad_norm": 0.6718437273239485, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 43492 + }, + { + "epoch": 0.43493, + "grad_norm": 0.6934768086618064, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 43493 + }, + { + "epoch": 0.43494, + "grad_norm": 0.6602150843110729, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 43494 + }, + { + "epoch": 0.43495, + "grad_norm": 0.7665872777409461, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 43495 + }, + { + "epoch": 0.43496, + "grad_norm": 0.9509932283293655, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 43496 + }, + { + "epoch": 0.43497, + "grad_norm": 1.24391949097813, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 43497 + }, + { + "epoch": 0.43498, + "grad_norm": 0.816750006498112, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 43498 + }, + { + "epoch": 0.43499, + "grad_norm": 0.8094548274122948, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 43499 + }, + { + "epoch": 0.435, + "grad_norm": 0.9009139555243777, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 43500 + }, + { + "epoch": 0.43501, + "grad_norm": 1.0206848035290654, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 43501 + }, + { + "epoch": 0.43502, + "grad_norm": 1.02375988446338, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 43502 + }, + { + "epoch": 0.43503, + "grad_norm": 0.8035123345551554, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 43503 + }, + { + "epoch": 0.43504, + "grad_norm": 0.6751182853446934, + "learning_rate": 0.003, + "loss": 4.026, + "step": 43504 + }, + { + "epoch": 0.43505, + "grad_norm": 0.6560518085159561, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 43505 + }, + { + "epoch": 0.43506, + "grad_norm": 0.6752033877389246, + "learning_rate": 0.003, + "loss": 4.038, + "step": 43506 + }, + { + "epoch": 0.43507, + "grad_norm": 0.6517301625423487, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 43507 + }, + { + "epoch": 0.43508, + "grad_norm": 0.8084534549451541, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 43508 + }, + { + "epoch": 0.43509, + "grad_norm": 1.026485877135495, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 43509 + }, + { + "epoch": 0.4351, + "grad_norm": 1.0627352364539546, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 43510 + }, + { + "epoch": 0.43511, + "grad_norm": 0.878641779995158, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 43511 + }, + { + "epoch": 0.43512, + "grad_norm": 0.7213960119419407, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 43512 + }, + { + "epoch": 0.43513, + "grad_norm": 0.8816696003590019, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 43513 + }, + { + "epoch": 0.43514, + "grad_norm": 1.0695322791032298, + "learning_rate": 0.003, + "loss": 3.9996, + "step": 43514 + }, + { + "epoch": 0.43515, + "grad_norm": 0.7943843749451117, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 43515 + }, + { + "epoch": 0.43516, + "grad_norm": 0.7397927924869858, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 43516 + }, + { + "epoch": 0.43517, + "grad_norm": 0.6049451446156847, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 43517 + }, + { + "epoch": 0.43518, + "grad_norm": 0.58536598619683, + "learning_rate": 0.003, + "loss": 3.9926, + "step": 43518 + }, + { + "epoch": 0.43519, + "grad_norm": 0.6378238763745907, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 43519 + }, + { + "epoch": 0.4352, + "grad_norm": 0.7052721492521339, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 43520 + }, + { + "epoch": 0.43521, + "grad_norm": 0.8368871950197585, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 43521 + }, + { + "epoch": 0.43522, + "grad_norm": 0.9360917413153489, + "learning_rate": 0.003, + "loss": 4.038, + "step": 43522 + }, + { + "epoch": 0.43523, + "grad_norm": 1.0300367438289344, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 43523 + }, + { + "epoch": 0.43524, + "grad_norm": 1.0023375103414787, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 43524 + }, + { + "epoch": 0.43525, + "grad_norm": 0.9389388640036425, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 43525 + }, + { + "epoch": 0.43526, + "grad_norm": 0.9473391528912686, + "learning_rate": 0.003, + "loss": 4.044, + "step": 43526 + }, + { + "epoch": 0.43527, + "grad_norm": 0.9332961770014078, + "learning_rate": 0.003, + "loss": 4.04, + "step": 43527 + }, + { + "epoch": 0.43528, + "grad_norm": 0.9335034162410406, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 43528 + }, + { + "epoch": 0.43529, + "grad_norm": 0.9176466103527112, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 43529 + }, + { + "epoch": 0.4353, + "grad_norm": 0.7437359451155718, + "learning_rate": 0.003, + "loss": 4.043, + "step": 43530 + }, + { + "epoch": 0.43531, + "grad_norm": 0.7366128859122606, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 43531 + }, + { + "epoch": 0.43532, + "grad_norm": 0.6488079504525669, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 43532 + }, + { + "epoch": 0.43533, + "grad_norm": 0.6001973841708872, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 43533 + }, + { + "epoch": 0.43534, + "grad_norm": 0.6432314930222096, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 43534 + }, + { + "epoch": 0.43535, + "grad_norm": 0.7349811077715978, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 43535 + }, + { + "epoch": 0.43536, + "grad_norm": 0.7506738795293174, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 43536 + }, + { + "epoch": 0.43537, + "grad_norm": 0.8254048151509598, + "learning_rate": 0.003, + "loss": 4.022, + "step": 43537 + }, + { + "epoch": 0.43538, + "grad_norm": 0.9739834388306633, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 43538 + }, + { + "epoch": 0.43539, + "grad_norm": 1.08271741496983, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 43539 + }, + { + "epoch": 0.4354, + "grad_norm": 1.0839411659692744, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 43540 + }, + { + "epoch": 0.43541, + "grad_norm": 1.0491610031345586, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 43541 + }, + { + "epoch": 0.43542, + "grad_norm": 1.1292641851627319, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 43542 + }, + { + "epoch": 0.43543, + "grad_norm": 0.8501315752950257, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 43543 + }, + { + "epoch": 0.43544, + "grad_norm": 0.6545177962734269, + "learning_rate": 0.003, + "loss": 3.9948, + "step": 43544 + }, + { + "epoch": 0.43545, + "grad_norm": 0.6969528319148366, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 43545 + }, + { + "epoch": 0.43546, + "grad_norm": 0.6955691817024829, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 43546 + }, + { + "epoch": 0.43547, + "grad_norm": 0.765287223232484, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 43547 + }, + { + "epoch": 0.43548, + "grad_norm": 0.9320596192133657, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 43548 + }, + { + "epoch": 0.43549, + "grad_norm": 0.9874817139115848, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 43549 + }, + { + "epoch": 0.4355, + "grad_norm": 1.007991967875797, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 43550 + }, + { + "epoch": 0.43551, + "grad_norm": 1.1004242217729912, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 43551 + }, + { + "epoch": 0.43552, + "grad_norm": 0.865501322115587, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 43552 + }, + { + "epoch": 0.43553, + "grad_norm": 0.7674484204925369, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 43553 + }, + { + "epoch": 0.43554, + "grad_norm": 0.7233398812422117, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 43554 + }, + { + "epoch": 0.43555, + "grad_norm": 0.7067381071064458, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 43555 + }, + { + "epoch": 0.43556, + "grad_norm": 0.7519842830039656, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 43556 + }, + { + "epoch": 0.43557, + "grad_norm": 0.8990044465610623, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 43557 + }, + { + "epoch": 0.43558, + "grad_norm": 1.1307644764861018, + "learning_rate": 0.003, + "loss": 4.04, + "step": 43558 + }, + { + "epoch": 0.43559, + "grad_norm": 0.9307862194074417, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 43559 + }, + { + "epoch": 0.4356, + "grad_norm": 0.9531566597753026, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 43560 + }, + { + "epoch": 0.43561, + "grad_norm": 0.8389757729547679, + "learning_rate": 0.003, + "loss": 4.0046, + "step": 43561 + }, + { + "epoch": 0.43562, + "grad_norm": 0.5886220654776227, + "learning_rate": 0.003, + "loss": 3.9776, + "step": 43562 + }, + { + "epoch": 0.43563, + "grad_norm": 0.6219131131891884, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 43563 + }, + { + "epoch": 0.43564, + "grad_norm": 0.6327065273177263, + "learning_rate": 0.003, + "loss": 4.052, + "step": 43564 + }, + { + "epoch": 0.43565, + "grad_norm": 0.7238415799236745, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 43565 + }, + { + "epoch": 0.43566, + "grad_norm": 0.7914962521164898, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 43566 + }, + { + "epoch": 0.43567, + "grad_norm": 0.9157980564439149, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 43567 + }, + { + "epoch": 0.43568, + "grad_norm": 1.0459574655960349, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 43568 + }, + { + "epoch": 0.43569, + "grad_norm": 1.004248128269837, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 43569 + }, + { + "epoch": 0.4357, + "grad_norm": 0.877538038734419, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 43570 + }, + { + "epoch": 0.43571, + "grad_norm": 0.7986621618658707, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 43571 + }, + { + "epoch": 0.43572, + "grad_norm": 0.8421118665337293, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 43572 + }, + { + "epoch": 0.43573, + "grad_norm": 0.7618089628502466, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 43573 + }, + { + "epoch": 0.43574, + "grad_norm": 0.6360508208883476, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 43574 + }, + { + "epoch": 0.43575, + "grad_norm": 0.596601650446271, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 43575 + }, + { + "epoch": 0.43576, + "grad_norm": 0.6604166645115785, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 43576 + }, + { + "epoch": 0.43577, + "grad_norm": 0.8578688777057806, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 43577 + }, + { + "epoch": 0.43578, + "grad_norm": 0.9352488125769708, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 43578 + }, + { + "epoch": 0.43579, + "grad_norm": 0.7994858810314104, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 43579 + }, + { + "epoch": 0.4358, + "grad_norm": 0.8313695914638451, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 43580 + }, + { + "epoch": 0.43581, + "grad_norm": 0.9349987149621266, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 43581 + }, + { + "epoch": 0.43582, + "grad_norm": 0.9032163562560028, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 43582 + }, + { + "epoch": 0.43583, + "grad_norm": 0.859020873372819, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 43583 + }, + { + "epoch": 0.43584, + "grad_norm": 0.8961598951831397, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 43584 + }, + { + "epoch": 0.43585, + "grad_norm": 1.1340986731148943, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 43585 + }, + { + "epoch": 0.43586, + "grad_norm": 0.9823971764716412, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 43586 + }, + { + "epoch": 0.43587, + "grad_norm": 0.8960833779189977, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 43587 + }, + { + "epoch": 0.43588, + "grad_norm": 0.9274935877151271, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 43588 + }, + { + "epoch": 0.43589, + "grad_norm": 1.00628392210459, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 43589 + }, + { + "epoch": 0.4359, + "grad_norm": 0.9964162730221586, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 43590 + }, + { + "epoch": 0.43591, + "grad_norm": 0.891398387203535, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 43591 + }, + { + "epoch": 0.43592, + "grad_norm": 0.8520253977064486, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 43592 + }, + { + "epoch": 0.43593, + "grad_norm": 0.7648214881516356, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 43593 + }, + { + "epoch": 0.43594, + "grad_norm": 0.723470103462091, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 43594 + }, + { + "epoch": 0.43595, + "grad_norm": 0.7744371257999508, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 43595 + }, + { + "epoch": 0.43596, + "grad_norm": 0.7641761447604569, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 43596 + }, + { + "epoch": 0.43597, + "grad_norm": 0.7615966461661432, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 43597 + }, + { + "epoch": 0.43598, + "grad_norm": 0.7511617307630233, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 43598 + }, + { + "epoch": 0.43599, + "grad_norm": 0.9083382729655608, + "learning_rate": 0.003, + "loss": 4.023, + "step": 43599 + }, + { + "epoch": 0.436, + "grad_norm": 0.9167975286891183, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 43600 + }, + { + "epoch": 0.43601, + "grad_norm": 0.9864217216447463, + "learning_rate": 0.003, + "loss": 4.025, + "step": 43601 + }, + { + "epoch": 0.43602, + "grad_norm": 1.0051007973875676, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 43602 + }, + { + "epoch": 0.43603, + "grad_norm": 1.0667646257815433, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 43603 + }, + { + "epoch": 0.43604, + "grad_norm": 0.9379973430833632, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 43604 + }, + { + "epoch": 0.43605, + "grad_norm": 0.8624512429584691, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 43605 + }, + { + "epoch": 0.43606, + "grad_norm": 1.027515269466285, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 43606 + }, + { + "epoch": 0.43607, + "grad_norm": 0.9753313742236311, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 43607 + }, + { + "epoch": 0.43608, + "grad_norm": 0.8405065452730699, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 43608 + }, + { + "epoch": 0.43609, + "grad_norm": 0.7544616005627323, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 43609 + }, + { + "epoch": 0.4361, + "grad_norm": 0.7514482726836268, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 43610 + }, + { + "epoch": 0.43611, + "grad_norm": 0.7582182012773405, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 43611 + }, + { + "epoch": 0.43612, + "grad_norm": 0.6837640394350336, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 43612 + }, + { + "epoch": 0.43613, + "grad_norm": 0.7287788460633016, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 43613 + }, + { + "epoch": 0.43614, + "grad_norm": 0.6827439215767648, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 43614 + }, + { + "epoch": 0.43615, + "grad_norm": 0.6742166561659645, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 43615 + }, + { + "epoch": 0.43616, + "grad_norm": 0.6895274764082011, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 43616 + }, + { + "epoch": 0.43617, + "grad_norm": 0.7568823562494753, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 43617 + }, + { + "epoch": 0.43618, + "grad_norm": 0.808346107031345, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 43618 + }, + { + "epoch": 0.43619, + "grad_norm": 0.9792959983702156, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 43619 + }, + { + "epoch": 0.4362, + "grad_norm": 1.228009532654363, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 43620 + }, + { + "epoch": 0.43621, + "grad_norm": 0.8995114705991926, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 43621 + }, + { + "epoch": 0.43622, + "grad_norm": 0.7112660252129157, + "learning_rate": 0.003, + "loss": 4.03, + "step": 43622 + }, + { + "epoch": 0.43623, + "grad_norm": 0.7065076902680454, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 43623 + }, + { + "epoch": 0.43624, + "grad_norm": 0.7223845627578227, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 43624 + }, + { + "epoch": 0.43625, + "grad_norm": 0.7273274026177785, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 43625 + }, + { + "epoch": 0.43626, + "grad_norm": 0.7196374721383494, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 43626 + }, + { + "epoch": 0.43627, + "grad_norm": 0.8304806107056881, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 43627 + }, + { + "epoch": 0.43628, + "grad_norm": 0.9087854519381247, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 43628 + }, + { + "epoch": 0.43629, + "grad_norm": 0.8800081698001629, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 43629 + }, + { + "epoch": 0.4363, + "grad_norm": 0.8723084206729643, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 43630 + }, + { + "epoch": 0.43631, + "grad_norm": 0.8768051713965507, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 43631 + }, + { + "epoch": 0.43632, + "grad_norm": 1.0161579405269452, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 43632 + }, + { + "epoch": 0.43633, + "grad_norm": 1.0098711546175574, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 43633 + }, + { + "epoch": 0.43634, + "grad_norm": 0.8756522062573975, + "learning_rate": 0.003, + "loss": 3.9974, + "step": 43634 + }, + { + "epoch": 0.43635, + "grad_norm": 0.7306073516897457, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 43635 + }, + { + "epoch": 0.43636, + "grad_norm": 0.780317935357182, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 43636 + }, + { + "epoch": 0.43637, + "grad_norm": 0.7940069944739298, + "learning_rate": 0.003, + "loss": 4.038, + "step": 43637 + }, + { + "epoch": 0.43638, + "grad_norm": 0.7127112046616086, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 43638 + }, + { + "epoch": 0.43639, + "grad_norm": 0.6103243574178097, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 43639 + }, + { + "epoch": 0.4364, + "grad_norm": 0.5585712202021296, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 43640 + }, + { + "epoch": 0.43641, + "grad_norm": 0.6309053682657723, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 43641 + }, + { + "epoch": 0.43642, + "grad_norm": 0.6704453944949209, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 43642 + }, + { + "epoch": 0.43643, + "grad_norm": 0.7412444769288309, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 43643 + }, + { + "epoch": 0.43644, + "grad_norm": 0.7694408619360733, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 43644 + }, + { + "epoch": 0.43645, + "grad_norm": 0.8783981799973537, + "learning_rate": 0.003, + "loss": 3.9915, + "step": 43645 + }, + { + "epoch": 0.43646, + "grad_norm": 1.0385534044891849, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 43646 + }, + { + "epoch": 0.43647, + "grad_norm": 1.1743410320559766, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 43647 + }, + { + "epoch": 0.43648, + "grad_norm": 0.8539124592757874, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 43648 + }, + { + "epoch": 0.43649, + "grad_norm": 0.7974235345469242, + "learning_rate": 0.003, + "loss": 4.065, + "step": 43649 + }, + { + "epoch": 0.4365, + "grad_norm": 0.7816768821249592, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 43650 + }, + { + "epoch": 0.43651, + "grad_norm": 0.8346087201675073, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 43651 + }, + { + "epoch": 0.43652, + "grad_norm": 0.8836046231114684, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 43652 + }, + { + "epoch": 0.43653, + "grad_norm": 0.9712025014892725, + "learning_rate": 0.003, + "loss": 4.035, + "step": 43653 + }, + { + "epoch": 0.43654, + "grad_norm": 1.1656380056193156, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 43654 + }, + { + "epoch": 0.43655, + "grad_norm": 0.7730118257108004, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 43655 + }, + { + "epoch": 0.43656, + "grad_norm": 0.7583227821821023, + "learning_rate": 0.003, + "loss": 4.026, + "step": 43656 + }, + { + "epoch": 0.43657, + "grad_norm": 0.7835912806848665, + "learning_rate": 0.003, + "loss": 3.9888, + "step": 43657 + }, + { + "epoch": 0.43658, + "grad_norm": 0.8006774668710417, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 43658 + }, + { + "epoch": 0.43659, + "grad_norm": 0.7845703103859939, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 43659 + }, + { + "epoch": 0.4366, + "grad_norm": 0.7598224669349526, + "learning_rate": 0.003, + "loss": 4.038, + "step": 43660 + }, + { + "epoch": 0.43661, + "grad_norm": 0.7883884705700969, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 43661 + }, + { + "epoch": 0.43662, + "grad_norm": 0.9170756353871653, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 43662 + }, + { + "epoch": 0.43663, + "grad_norm": 0.9625676516083045, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 43663 + }, + { + "epoch": 0.43664, + "grad_norm": 0.7773587103060355, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 43664 + }, + { + "epoch": 0.43665, + "grad_norm": 0.6296435649597066, + "learning_rate": 0.003, + "loss": 4.0648, + "step": 43665 + }, + { + "epoch": 0.43666, + "grad_norm": 0.6264752139878511, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 43666 + }, + { + "epoch": 0.43667, + "grad_norm": 0.629054667172083, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 43667 + }, + { + "epoch": 0.43668, + "grad_norm": 0.6641426079853481, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 43668 + }, + { + "epoch": 0.43669, + "grad_norm": 0.6907534478171128, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 43669 + }, + { + "epoch": 0.4367, + "grad_norm": 0.7192714777419619, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 43670 + }, + { + "epoch": 0.43671, + "grad_norm": 0.827200474036715, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 43671 + }, + { + "epoch": 0.43672, + "grad_norm": 1.07404340669986, + "learning_rate": 0.003, + "loss": 4.026, + "step": 43672 + }, + { + "epoch": 0.43673, + "grad_norm": 1.097912863725856, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 43673 + }, + { + "epoch": 0.43674, + "grad_norm": 0.854390097395205, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 43674 + }, + { + "epoch": 0.43675, + "grad_norm": 0.9371278316303467, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 43675 + }, + { + "epoch": 0.43676, + "grad_norm": 0.8967610993701578, + "learning_rate": 0.003, + "loss": 4.037, + "step": 43676 + }, + { + "epoch": 0.43677, + "grad_norm": 0.9163839037679764, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 43677 + }, + { + "epoch": 0.43678, + "grad_norm": 0.9433142427442684, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 43678 + }, + { + "epoch": 0.43679, + "grad_norm": 0.9515442635246973, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 43679 + }, + { + "epoch": 0.4368, + "grad_norm": 0.9598390380086556, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 43680 + }, + { + "epoch": 0.43681, + "grad_norm": 1.0428331184418138, + "learning_rate": 0.003, + "loss": 4.04, + "step": 43681 + }, + { + "epoch": 0.43682, + "grad_norm": 0.9177052163571146, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 43682 + }, + { + "epoch": 0.43683, + "grad_norm": 0.8307271089432371, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 43683 + }, + { + "epoch": 0.43684, + "grad_norm": 0.8017410245206816, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 43684 + }, + { + "epoch": 0.43685, + "grad_norm": 0.8205200568584381, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 43685 + }, + { + "epoch": 0.43686, + "grad_norm": 0.7928139747768793, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 43686 + }, + { + "epoch": 0.43687, + "grad_norm": 0.7030164905335193, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 43687 + }, + { + "epoch": 0.43688, + "grad_norm": 0.738863121364952, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 43688 + }, + { + "epoch": 0.43689, + "grad_norm": 0.6480447785785315, + "learning_rate": 0.003, + "loss": 4.03, + "step": 43689 + }, + { + "epoch": 0.4369, + "grad_norm": 0.6573725715866526, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 43690 + }, + { + "epoch": 0.43691, + "grad_norm": 0.641498024726287, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 43691 + }, + { + "epoch": 0.43692, + "grad_norm": 0.6387317609832923, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 43692 + }, + { + "epoch": 0.43693, + "grad_norm": 0.6057204645919011, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 43693 + }, + { + "epoch": 0.43694, + "grad_norm": 0.747366757688891, + "learning_rate": 0.003, + "loss": 3.9997, + "step": 43694 + }, + { + "epoch": 0.43695, + "grad_norm": 0.9967511814343437, + "learning_rate": 0.003, + "loss": 4.022, + "step": 43695 + }, + { + "epoch": 0.43696, + "grad_norm": 1.3219869439035608, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 43696 + }, + { + "epoch": 0.43697, + "grad_norm": 0.7184131255843988, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 43697 + }, + { + "epoch": 0.43698, + "grad_norm": 0.7290172283286975, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 43698 + }, + { + "epoch": 0.43699, + "grad_norm": 0.684203353197687, + "learning_rate": 0.003, + "loss": 4.008, + "step": 43699 + }, + { + "epoch": 0.437, + "grad_norm": 0.7642178118692318, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 43700 + }, + { + "epoch": 0.43701, + "grad_norm": 0.7466949586346036, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 43701 + }, + { + "epoch": 0.43702, + "grad_norm": 0.737330362899626, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 43702 + }, + { + "epoch": 0.43703, + "grad_norm": 0.8015721842674208, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 43703 + }, + { + "epoch": 0.43704, + "grad_norm": 1.0121686293996222, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 43704 + }, + { + "epoch": 0.43705, + "grad_norm": 1.0179200212472466, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 43705 + }, + { + "epoch": 0.43706, + "grad_norm": 0.9246014052363343, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 43706 + }, + { + "epoch": 0.43707, + "grad_norm": 0.9805019871516508, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 43707 + }, + { + "epoch": 0.43708, + "grad_norm": 1.16029051301002, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 43708 + }, + { + "epoch": 0.43709, + "grad_norm": 1.009529798936841, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 43709 + }, + { + "epoch": 0.4371, + "grad_norm": 0.9476433593610263, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 43710 + }, + { + "epoch": 0.43711, + "grad_norm": 0.8445287027105665, + "learning_rate": 0.003, + "loss": 4.045, + "step": 43711 + }, + { + "epoch": 0.43712, + "grad_norm": 0.8340288704588807, + "learning_rate": 0.003, + "loss": 4.035, + "step": 43712 + }, + { + "epoch": 0.43713, + "grad_norm": 0.8594510032752282, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 43713 + }, + { + "epoch": 0.43714, + "grad_norm": 0.8530498366294856, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 43714 + }, + { + "epoch": 0.43715, + "grad_norm": 0.920313997339346, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 43715 + }, + { + "epoch": 0.43716, + "grad_norm": 0.9172215150230826, + "learning_rate": 0.003, + "loss": 4.047, + "step": 43716 + }, + { + "epoch": 0.43717, + "grad_norm": 0.7923100055919986, + "learning_rate": 0.003, + "loss": 4.049, + "step": 43717 + }, + { + "epoch": 0.43718, + "grad_norm": 0.8475330333698339, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 43718 + }, + { + "epoch": 0.43719, + "grad_norm": 0.7883631810169808, + "learning_rate": 0.003, + "loss": 4.036, + "step": 43719 + }, + { + "epoch": 0.4372, + "grad_norm": 0.8733635656498006, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 43720 + }, + { + "epoch": 0.43721, + "grad_norm": 0.9207785979140042, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 43721 + }, + { + "epoch": 0.43722, + "grad_norm": 0.8345695031155208, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 43722 + }, + { + "epoch": 0.43723, + "grad_norm": 0.8582404089353433, + "learning_rate": 0.003, + "loss": 4.0772, + "step": 43723 + }, + { + "epoch": 0.43724, + "grad_norm": 0.8179320730500325, + "learning_rate": 0.003, + "loss": 4.042, + "step": 43724 + }, + { + "epoch": 0.43725, + "grad_norm": 0.8598147489221746, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 43725 + }, + { + "epoch": 0.43726, + "grad_norm": 0.8492760351946453, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 43726 + }, + { + "epoch": 0.43727, + "grad_norm": 0.7741057243491544, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 43727 + }, + { + "epoch": 0.43728, + "grad_norm": 0.7561791008585321, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 43728 + }, + { + "epoch": 0.43729, + "grad_norm": 0.7930540120201163, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 43729 + }, + { + "epoch": 0.4373, + "grad_norm": 0.7071993807081285, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 43730 + }, + { + "epoch": 0.43731, + "grad_norm": 0.6420118219412768, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 43731 + }, + { + "epoch": 0.43732, + "grad_norm": 0.7161134200202998, + "learning_rate": 0.003, + "loss": 3.9973, + "step": 43732 + }, + { + "epoch": 0.43733, + "grad_norm": 0.8626375862327169, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 43733 + }, + { + "epoch": 0.43734, + "grad_norm": 0.8829461030480137, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 43734 + }, + { + "epoch": 0.43735, + "grad_norm": 1.043550879780699, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 43735 + }, + { + "epoch": 0.43736, + "grad_norm": 1.0303678885128091, + "learning_rate": 0.003, + "loss": 4.05, + "step": 43736 + }, + { + "epoch": 0.43737, + "grad_norm": 1.0587139686636327, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 43737 + }, + { + "epoch": 0.43738, + "grad_norm": 0.83360677792497, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 43738 + }, + { + "epoch": 0.43739, + "grad_norm": 0.6561171247304962, + "learning_rate": 0.003, + "loss": 4.033, + "step": 43739 + }, + { + "epoch": 0.4374, + "grad_norm": 0.6412748384064002, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 43740 + }, + { + "epoch": 0.43741, + "grad_norm": 0.6307328184377886, + "learning_rate": 0.003, + "loss": 3.9843, + "step": 43741 + }, + { + "epoch": 0.43742, + "grad_norm": 0.5939569882258765, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 43742 + }, + { + "epoch": 0.43743, + "grad_norm": 0.6089549577740188, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 43743 + }, + { + "epoch": 0.43744, + "grad_norm": 0.7358364442097876, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 43744 + }, + { + "epoch": 0.43745, + "grad_norm": 0.9569256255576649, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 43745 + }, + { + "epoch": 0.43746, + "grad_norm": 1.209715228118045, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 43746 + }, + { + "epoch": 0.43747, + "grad_norm": 1.0470991483023662, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 43747 + }, + { + "epoch": 0.43748, + "grad_norm": 0.8199436519049769, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 43748 + }, + { + "epoch": 0.43749, + "grad_norm": 0.7202730655150564, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 43749 + }, + { + "epoch": 0.4375, + "grad_norm": 0.8028270815995392, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 43750 + }, + { + "epoch": 0.43751, + "grad_norm": 0.9621354245903381, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 43751 + }, + { + "epoch": 0.43752, + "grad_norm": 1.1969031258720417, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 43752 + }, + { + "epoch": 0.43753, + "grad_norm": 0.6767668219879285, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 43753 + }, + { + "epoch": 0.43754, + "grad_norm": 0.6326595521224221, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 43754 + }, + { + "epoch": 0.43755, + "grad_norm": 0.7129583806241276, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 43755 + }, + { + "epoch": 0.43756, + "grad_norm": 0.821476514929802, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 43756 + }, + { + "epoch": 0.43757, + "grad_norm": 0.9029622548054527, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 43757 + }, + { + "epoch": 0.43758, + "grad_norm": 0.8364912397994476, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 43758 + }, + { + "epoch": 0.43759, + "grad_norm": 0.7748467042879074, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 43759 + }, + { + "epoch": 0.4376, + "grad_norm": 0.7867118852963325, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 43760 + }, + { + "epoch": 0.43761, + "grad_norm": 0.7447531108256785, + "learning_rate": 0.003, + "loss": 4.0665, + "step": 43761 + }, + { + "epoch": 0.43762, + "grad_norm": 0.7321327377780108, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 43762 + }, + { + "epoch": 0.43763, + "grad_norm": 0.7188223439837306, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 43763 + }, + { + "epoch": 0.43764, + "grad_norm": 0.8312641726156154, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 43764 + }, + { + "epoch": 0.43765, + "grad_norm": 0.8797942893222533, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 43765 + }, + { + "epoch": 0.43766, + "grad_norm": 0.8568764680007582, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 43766 + }, + { + "epoch": 0.43767, + "grad_norm": 1.0203875424050057, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 43767 + }, + { + "epoch": 0.43768, + "grad_norm": 1.0611170371454766, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 43768 + }, + { + "epoch": 0.43769, + "grad_norm": 0.858979695959546, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 43769 + }, + { + "epoch": 0.4377, + "grad_norm": 0.8357254875872636, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 43770 + }, + { + "epoch": 0.43771, + "grad_norm": 0.8009930962969032, + "learning_rate": 0.003, + "loss": 3.996, + "step": 43771 + }, + { + "epoch": 0.43772, + "grad_norm": 0.7824417508283428, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 43772 + }, + { + "epoch": 0.43773, + "grad_norm": 0.9347933842807086, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 43773 + }, + { + "epoch": 0.43774, + "grad_norm": 1.0860654529789677, + "learning_rate": 0.003, + "loss": 4.0749, + "step": 43774 + }, + { + "epoch": 0.43775, + "grad_norm": 0.878714316371001, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 43775 + }, + { + "epoch": 0.43776, + "grad_norm": 0.813412025115277, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 43776 + }, + { + "epoch": 0.43777, + "grad_norm": 0.9242183223886045, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 43777 + }, + { + "epoch": 0.43778, + "grad_norm": 1.107490084757637, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 43778 + }, + { + "epoch": 0.43779, + "grad_norm": 0.8915120397206154, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 43779 + }, + { + "epoch": 0.4378, + "grad_norm": 0.8288993005755066, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 43780 + }, + { + "epoch": 0.43781, + "grad_norm": 0.7921610688875661, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 43781 + }, + { + "epoch": 0.43782, + "grad_norm": 0.7362489525709955, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 43782 + }, + { + "epoch": 0.43783, + "grad_norm": 0.8035754712227787, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 43783 + }, + { + "epoch": 0.43784, + "grad_norm": 0.8541721483253047, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 43784 + }, + { + "epoch": 0.43785, + "grad_norm": 0.8828650992292971, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 43785 + }, + { + "epoch": 0.43786, + "grad_norm": 0.7822668576079818, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 43786 + }, + { + "epoch": 0.43787, + "grad_norm": 0.6439720209403188, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 43787 + }, + { + "epoch": 0.43788, + "grad_norm": 0.6784785820617693, + "learning_rate": 0.003, + "loss": 4.021, + "step": 43788 + }, + { + "epoch": 0.43789, + "grad_norm": 0.8217375912564252, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 43789 + }, + { + "epoch": 0.4379, + "grad_norm": 0.9738784679025428, + "learning_rate": 0.003, + "loss": 4.0729, + "step": 43790 + }, + { + "epoch": 0.43791, + "grad_norm": 0.9576074910903846, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 43791 + }, + { + "epoch": 0.43792, + "grad_norm": 0.8410355550245964, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 43792 + }, + { + "epoch": 0.43793, + "grad_norm": 0.9633941919866177, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 43793 + }, + { + "epoch": 0.43794, + "grad_norm": 1.1731056872142056, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 43794 + }, + { + "epoch": 0.43795, + "grad_norm": 1.1803017173688821, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 43795 + }, + { + "epoch": 0.43796, + "grad_norm": 0.9280369530878195, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 43796 + }, + { + "epoch": 0.43797, + "grad_norm": 0.8952018331645242, + "learning_rate": 0.003, + "loss": 4.0683, + "step": 43797 + }, + { + "epoch": 0.43798, + "grad_norm": 0.7644403380782996, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 43798 + }, + { + "epoch": 0.43799, + "grad_norm": 0.7535440212624968, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 43799 + }, + { + "epoch": 0.438, + "grad_norm": 0.7270443229539699, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 43800 + }, + { + "epoch": 0.43801, + "grad_norm": 0.7379804882218052, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 43801 + }, + { + "epoch": 0.43802, + "grad_norm": 0.8300025786825702, + "learning_rate": 0.003, + "loss": 4.007, + "step": 43802 + }, + { + "epoch": 0.43803, + "grad_norm": 0.8989737818289447, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 43803 + }, + { + "epoch": 0.43804, + "grad_norm": 0.8708589810269115, + "learning_rate": 0.003, + "loss": 4.0736, + "step": 43804 + }, + { + "epoch": 0.43805, + "grad_norm": 0.8307398861515211, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 43805 + }, + { + "epoch": 0.43806, + "grad_norm": 0.7956905978509782, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 43806 + }, + { + "epoch": 0.43807, + "grad_norm": 0.7489570849917697, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 43807 + }, + { + "epoch": 0.43808, + "grad_norm": 0.7273234267536471, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 43808 + }, + { + "epoch": 0.43809, + "grad_norm": 0.6969708602146337, + "learning_rate": 0.003, + "loss": 4.049, + "step": 43809 + }, + { + "epoch": 0.4381, + "grad_norm": 0.6351066024294719, + "learning_rate": 0.003, + "loss": 3.9853, + "step": 43810 + }, + { + "epoch": 0.43811, + "grad_norm": 0.6705218634960048, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 43811 + }, + { + "epoch": 0.43812, + "grad_norm": 0.8045234647254564, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 43812 + }, + { + "epoch": 0.43813, + "grad_norm": 0.86236789948204, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 43813 + }, + { + "epoch": 0.43814, + "grad_norm": 1.00024620729506, + "learning_rate": 0.003, + "loss": 3.981, + "step": 43814 + }, + { + "epoch": 0.43815, + "grad_norm": 1.121865093989346, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 43815 + }, + { + "epoch": 0.43816, + "grad_norm": 0.807191348593951, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 43816 + }, + { + "epoch": 0.43817, + "grad_norm": 0.7528280719545773, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 43817 + }, + { + "epoch": 0.43818, + "grad_norm": 0.733449046273484, + "learning_rate": 0.003, + "loss": 4.026, + "step": 43818 + }, + { + "epoch": 0.43819, + "grad_norm": 0.7707760427378492, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 43819 + }, + { + "epoch": 0.4382, + "grad_norm": 0.7839501957572693, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 43820 + }, + { + "epoch": 0.43821, + "grad_norm": 0.7650730420775004, + "learning_rate": 0.003, + "loss": 3.9908, + "step": 43821 + }, + { + "epoch": 0.43822, + "grad_norm": 0.6533051722672142, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 43822 + }, + { + "epoch": 0.43823, + "grad_norm": 0.5712208431544881, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 43823 + }, + { + "epoch": 0.43824, + "grad_norm": 0.5775463202951254, + "learning_rate": 0.003, + "loss": 3.988, + "step": 43824 + }, + { + "epoch": 0.43825, + "grad_norm": 0.5568530046130461, + "learning_rate": 0.003, + "loss": 4.0003, + "step": 43825 + }, + { + "epoch": 0.43826, + "grad_norm": 0.5634651704713486, + "learning_rate": 0.003, + "loss": 3.9965, + "step": 43826 + }, + { + "epoch": 0.43827, + "grad_norm": 0.6364854069604248, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 43827 + }, + { + "epoch": 0.43828, + "grad_norm": 0.9719339895664049, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 43828 + }, + { + "epoch": 0.43829, + "grad_norm": 1.3601222123408618, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 43829 + }, + { + "epoch": 0.4383, + "grad_norm": 0.7291459748540245, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 43830 + }, + { + "epoch": 0.43831, + "grad_norm": 0.6823960588663128, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 43831 + }, + { + "epoch": 0.43832, + "grad_norm": 0.7924317330031935, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 43832 + }, + { + "epoch": 0.43833, + "grad_norm": 0.6974823715348989, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 43833 + }, + { + "epoch": 0.43834, + "grad_norm": 0.7034444829181046, + "learning_rate": 0.003, + "loss": 3.9982, + "step": 43834 + }, + { + "epoch": 0.43835, + "grad_norm": 0.7356509816184226, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 43835 + }, + { + "epoch": 0.43836, + "grad_norm": 0.8203201215966446, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 43836 + }, + { + "epoch": 0.43837, + "grad_norm": 0.8996478170445237, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 43837 + }, + { + "epoch": 0.43838, + "grad_norm": 0.9479370638809743, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 43838 + }, + { + "epoch": 0.43839, + "grad_norm": 1.0718313452110664, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 43839 + }, + { + "epoch": 0.4384, + "grad_norm": 1.223332120681942, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 43840 + }, + { + "epoch": 0.43841, + "grad_norm": 0.821565746001853, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 43841 + }, + { + "epoch": 0.43842, + "grad_norm": 0.8513499661834827, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 43842 + }, + { + "epoch": 0.43843, + "grad_norm": 0.808658338953666, + "learning_rate": 0.003, + "loss": 4.031, + "step": 43843 + }, + { + "epoch": 0.43844, + "grad_norm": 0.8028202690027022, + "learning_rate": 0.003, + "loss": 3.9991, + "step": 43844 + }, + { + "epoch": 0.43845, + "grad_norm": 0.8816274128729761, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 43845 + }, + { + "epoch": 0.43846, + "grad_norm": 1.0482624561011267, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 43846 + }, + { + "epoch": 0.43847, + "grad_norm": 1.1367759300185345, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 43847 + }, + { + "epoch": 0.43848, + "grad_norm": 0.9563879676405411, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 43848 + }, + { + "epoch": 0.43849, + "grad_norm": 0.9710290856310586, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 43849 + }, + { + "epoch": 0.4385, + "grad_norm": 0.9743133067859898, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 43850 + }, + { + "epoch": 0.43851, + "grad_norm": 0.9364905995390924, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 43851 + }, + { + "epoch": 0.43852, + "grad_norm": 0.9202762350984625, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 43852 + }, + { + "epoch": 0.43853, + "grad_norm": 0.9071291260889695, + "learning_rate": 0.003, + "loss": 4.051, + "step": 43853 + }, + { + "epoch": 0.43854, + "grad_norm": 0.873345670795417, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 43854 + }, + { + "epoch": 0.43855, + "grad_norm": 0.7808152126813547, + "learning_rate": 0.003, + "loss": 4.035, + "step": 43855 + }, + { + "epoch": 0.43856, + "grad_norm": 0.7493853306339195, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 43856 + }, + { + "epoch": 0.43857, + "grad_norm": 0.8170205575559368, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 43857 + }, + { + "epoch": 0.43858, + "grad_norm": 0.8006783836278468, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 43858 + }, + { + "epoch": 0.43859, + "grad_norm": 0.7914706593280365, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 43859 + }, + { + "epoch": 0.4386, + "grad_norm": 0.9643839820900632, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 43860 + }, + { + "epoch": 0.43861, + "grad_norm": 1.1004408908327306, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 43861 + }, + { + "epoch": 0.43862, + "grad_norm": 0.9949138403463627, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 43862 + }, + { + "epoch": 0.43863, + "grad_norm": 1.0095052990850149, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 43863 + }, + { + "epoch": 0.43864, + "grad_norm": 0.7754670761334327, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 43864 + }, + { + "epoch": 0.43865, + "grad_norm": 0.603901261941917, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 43865 + }, + { + "epoch": 0.43866, + "grad_norm": 0.6154354744720771, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 43866 + }, + { + "epoch": 0.43867, + "grad_norm": 0.619633243296498, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 43867 + }, + { + "epoch": 0.43868, + "grad_norm": 0.6086789191306422, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 43868 + }, + { + "epoch": 0.43869, + "grad_norm": 0.6121512707795487, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 43869 + }, + { + "epoch": 0.4387, + "grad_norm": 0.5750406771484038, + "learning_rate": 0.003, + "loss": 4.034, + "step": 43870 + }, + { + "epoch": 0.43871, + "grad_norm": 0.5329924028759155, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 43871 + }, + { + "epoch": 0.43872, + "grad_norm": 0.49787322246183435, + "learning_rate": 0.003, + "loss": 3.9751, + "step": 43872 + }, + { + "epoch": 0.43873, + "grad_norm": 0.4335896425888664, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 43873 + }, + { + "epoch": 0.43874, + "grad_norm": 0.5169931942638575, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 43874 + }, + { + "epoch": 0.43875, + "grad_norm": 0.5770432773877978, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 43875 + }, + { + "epoch": 0.43876, + "grad_norm": 0.7001277555322546, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 43876 + }, + { + "epoch": 0.43877, + "grad_norm": 0.8320659239932607, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 43877 + }, + { + "epoch": 0.43878, + "grad_norm": 1.0486987781436383, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 43878 + }, + { + "epoch": 0.43879, + "grad_norm": 1.0974152392759091, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 43879 + }, + { + "epoch": 0.4388, + "grad_norm": 0.8650030519622853, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 43880 + }, + { + "epoch": 0.43881, + "grad_norm": 0.8253594213833104, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 43881 + }, + { + "epoch": 0.43882, + "grad_norm": 0.875580235329341, + "learning_rate": 0.003, + "loss": 3.9786, + "step": 43882 + }, + { + "epoch": 0.43883, + "grad_norm": 0.8348693665879652, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 43883 + }, + { + "epoch": 0.43884, + "grad_norm": 0.7165153698370842, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 43884 + }, + { + "epoch": 0.43885, + "grad_norm": 0.7515201527320918, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 43885 + }, + { + "epoch": 0.43886, + "grad_norm": 0.8678020892303993, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 43886 + }, + { + "epoch": 0.43887, + "grad_norm": 0.9579809812187753, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 43887 + }, + { + "epoch": 0.43888, + "grad_norm": 1.092261316596719, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 43888 + }, + { + "epoch": 0.43889, + "grad_norm": 0.9598624165653467, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 43889 + }, + { + "epoch": 0.4389, + "grad_norm": 1.0662852186158416, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 43890 + }, + { + "epoch": 0.43891, + "grad_norm": 0.9684405084066344, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 43891 + }, + { + "epoch": 0.43892, + "grad_norm": 0.9384186392905285, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 43892 + }, + { + "epoch": 0.43893, + "grad_norm": 0.8487767129475401, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 43893 + }, + { + "epoch": 0.43894, + "grad_norm": 0.8492561795276321, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 43894 + }, + { + "epoch": 0.43895, + "grad_norm": 0.7451163749602853, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 43895 + }, + { + "epoch": 0.43896, + "grad_norm": 0.8658860735328182, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 43896 + }, + { + "epoch": 0.43897, + "grad_norm": 1.0641436828077546, + "learning_rate": 0.003, + "loss": 3.9826, + "step": 43897 + }, + { + "epoch": 0.43898, + "grad_norm": 1.1881246415155493, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 43898 + }, + { + "epoch": 0.43899, + "grad_norm": 0.9485211618335933, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 43899 + }, + { + "epoch": 0.439, + "grad_norm": 0.8030723255143156, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 43900 + }, + { + "epoch": 0.43901, + "grad_norm": 0.684090396745211, + "learning_rate": 0.003, + "loss": 4.0632, + "step": 43901 + }, + { + "epoch": 0.43902, + "grad_norm": 0.6872498644676204, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 43902 + }, + { + "epoch": 0.43903, + "grad_norm": 0.6017226107320595, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 43903 + }, + { + "epoch": 0.43904, + "grad_norm": 0.5851062372866224, + "learning_rate": 0.003, + "loss": 4.037, + "step": 43904 + }, + { + "epoch": 0.43905, + "grad_norm": 0.6012400600627876, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 43905 + }, + { + "epoch": 0.43906, + "grad_norm": 0.49811506056851407, + "learning_rate": 0.003, + "loss": 3.9961, + "step": 43906 + }, + { + "epoch": 0.43907, + "grad_norm": 0.4506728249359675, + "learning_rate": 0.003, + "loss": 4.013, + "step": 43907 + }, + { + "epoch": 0.43908, + "grad_norm": 0.5154755680295866, + "learning_rate": 0.003, + "loss": 4.0535, + "step": 43908 + }, + { + "epoch": 0.43909, + "grad_norm": 0.7412310765451736, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 43909 + }, + { + "epoch": 0.4391, + "grad_norm": 1.043253996854465, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 43910 + }, + { + "epoch": 0.43911, + "grad_norm": 1.265262429429305, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 43911 + }, + { + "epoch": 0.43912, + "grad_norm": 0.528324539609655, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 43912 + }, + { + "epoch": 0.43913, + "grad_norm": 0.7949114393996195, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 43913 + }, + { + "epoch": 0.43914, + "grad_norm": 1.2788997449279933, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 43914 + }, + { + "epoch": 0.43915, + "grad_norm": 0.766619098408572, + "learning_rate": 0.003, + "loss": 4.0, + "step": 43915 + }, + { + "epoch": 0.43916, + "grad_norm": 0.7570877631559613, + "learning_rate": 0.003, + "loss": 4.0061, + "step": 43916 + }, + { + "epoch": 0.43917, + "grad_norm": 0.7109960105467594, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 43917 + }, + { + "epoch": 0.43918, + "grad_norm": 0.6353929727820665, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 43918 + }, + { + "epoch": 0.43919, + "grad_norm": 0.7373990389481787, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 43919 + }, + { + "epoch": 0.4392, + "grad_norm": 0.7941368779924017, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 43920 + }, + { + "epoch": 0.43921, + "grad_norm": 0.8780794625722569, + "learning_rate": 0.003, + "loss": 4.047, + "step": 43921 + }, + { + "epoch": 0.43922, + "grad_norm": 0.9818218485811455, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 43922 + }, + { + "epoch": 0.43923, + "grad_norm": 0.9687069848601503, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 43923 + }, + { + "epoch": 0.43924, + "grad_norm": 0.9075729151090561, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 43924 + }, + { + "epoch": 0.43925, + "grad_norm": 0.919371598706093, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 43925 + }, + { + "epoch": 0.43926, + "grad_norm": 0.9303165319160025, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 43926 + }, + { + "epoch": 0.43927, + "grad_norm": 1.0971303938698322, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 43927 + }, + { + "epoch": 0.43928, + "grad_norm": 0.972889497830554, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 43928 + }, + { + "epoch": 0.43929, + "grad_norm": 0.9399537596444889, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 43929 + }, + { + "epoch": 0.4393, + "grad_norm": 0.8412689887017626, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 43930 + }, + { + "epoch": 0.43931, + "grad_norm": 1.0185609665639899, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 43931 + }, + { + "epoch": 0.43932, + "grad_norm": 1.170758121386502, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 43932 + }, + { + "epoch": 0.43933, + "grad_norm": 0.8494377951851976, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 43933 + }, + { + "epoch": 0.43934, + "grad_norm": 0.7715074946587895, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 43934 + }, + { + "epoch": 0.43935, + "grad_norm": 0.7593193789077878, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 43935 + }, + { + "epoch": 0.43936, + "grad_norm": 0.8200957873178553, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 43936 + }, + { + "epoch": 0.43937, + "grad_norm": 0.9902165914772627, + "learning_rate": 0.003, + "loss": 4.045, + "step": 43937 + }, + { + "epoch": 0.43938, + "grad_norm": 1.2307921031795592, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 43938 + }, + { + "epoch": 0.43939, + "grad_norm": 0.8014348339660479, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 43939 + }, + { + "epoch": 0.4394, + "grad_norm": 0.8612126631913487, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 43940 + }, + { + "epoch": 0.43941, + "grad_norm": 0.8779262673076993, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 43941 + }, + { + "epoch": 0.43942, + "grad_norm": 0.8573517081024021, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 43942 + }, + { + "epoch": 0.43943, + "grad_norm": 0.9006084721665383, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 43943 + }, + { + "epoch": 0.43944, + "grad_norm": 0.7706328355338327, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 43944 + }, + { + "epoch": 0.43945, + "grad_norm": 0.793290881360629, + "learning_rate": 0.003, + "loss": 4.0118, + "step": 43945 + }, + { + "epoch": 0.43946, + "grad_norm": 0.7200502481532963, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 43946 + }, + { + "epoch": 0.43947, + "grad_norm": 0.7652796843016471, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 43947 + }, + { + "epoch": 0.43948, + "grad_norm": 0.779999822854883, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 43948 + }, + { + "epoch": 0.43949, + "grad_norm": 0.8138169767509585, + "learning_rate": 0.003, + "loss": 4.045, + "step": 43949 + }, + { + "epoch": 0.4395, + "grad_norm": 0.9029825857722676, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 43950 + }, + { + "epoch": 0.43951, + "grad_norm": 0.7326951513476563, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 43951 + }, + { + "epoch": 0.43952, + "grad_norm": 0.7516559947541338, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 43952 + }, + { + "epoch": 0.43953, + "grad_norm": 0.9312809638010731, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 43953 + }, + { + "epoch": 0.43954, + "grad_norm": 1.1166482006552358, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 43954 + }, + { + "epoch": 0.43955, + "grad_norm": 0.8801647172816691, + "learning_rate": 0.003, + "loss": 4.05, + "step": 43955 + }, + { + "epoch": 0.43956, + "grad_norm": 0.7475729935033139, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 43956 + }, + { + "epoch": 0.43957, + "grad_norm": 0.9301243638926918, + "learning_rate": 0.003, + "loss": 4.055, + "step": 43957 + }, + { + "epoch": 0.43958, + "grad_norm": 1.1143260018977108, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 43958 + }, + { + "epoch": 0.43959, + "grad_norm": 1.0151235261826315, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 43959 + }, + { + "epoch": 0.4396, + "grad_norm": 0.9173391321560801, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 43960 + }, + { + "epoch": 0.43961, + "grad_norm": 0.8647625330068376, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 43961 + }, + { + "epoch": 0.43962, + "grad_norm": 0.7575325678583706, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 43962 + }, + { + "epoch": 0.43963, + "grad_norm": 0.732847128154973, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 43963 + }, + { + "epoch": 0.43964, + "grad_norm": 0.6713119739408269, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 43964 + }, + { + "epoch": 0.43965, + "grad_norm": 0.683653771536651, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 43965 + }, + { + "epoch": 0.43966, + "grad_norm": 0.7015132017621836, + "learning_rate": 0.003, + "loss": 3.989, + "step": 43966 + }, + { + "epoch": 0.43967, + "grad_norm": 0.669551564559191, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 43967 + }, + { + "epoch": 0.43968, + "grad_norm": 0.6221960425760589, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 43968 + }, + { + "epoch": 0.43969, + "grad_norm": 0.5731241297364353, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 43969 + }, + { + "epoch": 0.4397, + "grad_norm": 0.46920735169030986, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 43970 + }, + { + "epoch": 0.43971, + "grad_norm": 0.47017586848806836, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 43971 + }, + { + "epoch": 0.43972, + "grad_norm": 0.5343515391875364, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 43972 + }, + { + "epoch": 0.43973, + "grad_norm": 0.5875428058287067, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 43973 + }, + { + "epoch": 0.43974, + "grad_norm": 0.590165600815303, + "learning_rate": 0.003, + "loss": 4.0036, + "step": 43974 + }, + { + "epoch": 0.43975, + "grad_norm": 0.7320708798339571, + "learning_rate": 0.003, + "loss": 3.9992, + "step": 43975 + }, + { + "epoch": 0.43976, + "grad_norm": 0.9466582726175717, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 43976 + }, + { + "epoch": 0.43977, + "grad_norm": 1.227397216311488, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 43977 + }, + { + "epoch": 0.43978, + "grad_norm": 0.7466291374536467, + "learning_rate": 0.003, + "loss": 4.004, + "step": 43978 + }, + { + "epoch": 0.43979, + "grad_norm": 0.863953216188542, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 43979 + }, + { + "epoch": 0.4398, + "grad_norm": 1.0620882900434307, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 43980 + }, + { + "epoch": 0.43981, + "grad_norm": 0.9507729724640989, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 43981 + }, + { + "epoch": 0.43982, + "grad_norm": 1.0645390499303318, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 43982 + }, + { + "epoch": 0.43983, + "grad_norm": 0.8280174978085718, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 43983 + }, + { + "epoch": 0.43984, + "grad_norm": 0.7940908314098636, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 43984 + }, + { + "epoch": 0.43985, + "grad_norm": 0.6815684836193457, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 43985 + }, + { + "epoch": 0.43986, + "grad_norm": 0.7771838990990955, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 43986 + }, + { + "epoch": 0.43987, + "grad_norm": 0.9049933971200625, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 43987 + }, + { + "epoch": 0.43988, + "grad_norm": 0.9314439473118004, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 43988 + }, + { + "epoch": 0.43989, + "grad_norm": 0.8987106273083606, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 43989 + }, + { + "epoch": 0.4399, + "grad_norm": 0.9254792616111572, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 43990 + }, + { + "epoch": 0.43991, + "grad_norm": 0.9649485059729633, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 43991 + }, + { + "epoch": 0.43992, + "grad_norm": 0.8888356223575948, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 43992 + }, + { + "epoch": 0.43993, + "grad_norm": 0.911556663721669, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 43993 + }, + { + "epoch": 0.43994, + "grad_norm": 1.1137152511162836, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 43994 + }, + { + "epoch": 0.43995, + "grad_norm": 1.02774324013839, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 43995 + }, + { + "epoch": 0.43996, + "grad_norm": 0.9025151488241134, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 43996 + }, + { + "epoch": 0.43997, + "grad_norm": 0.8571418710171863, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 43997 + }, + { + "epoch": 0.43998, + "grad_norm": 0.8797593382703494, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 43998 + }, + { + "epoch": 0.43999, + "grad_norm": 0.8644490350219338, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 43999 + }, + { + "epoch": 0.44, + "grad_norm": 0.8588846800920615, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 44000 + }, + { + "epoch": 0.44001, + "grad_norm": 0.9777752120234477, + "learning_rate": 0.003, + "loss": 4.0837, + "step": 44001 + }, + { + "epoch": 0.44002, + "grad_norm": 0.977595780312396, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 44002 + }, + { + "epoch": 0.44003, + "grad_norm": 0.885439747878391, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 44003 + }, + { + "epoch": 0.44004, + "grad_norm": 0.879617028186005, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 44004 + }, + { + "epoch": 0.44005, + "grad_norm": 1.0067338863537476, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 44005 + }, + { + "epoch": 0.44006, + "grad_norm": 1.0618759022647293, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 44006 + }, + { + "epoch": 0.44007, + "grad_norm": 1.0014473409303766, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 44007 + }, + { + "epoch": 0.44008, + "grad_norm": 1.1882551208632126, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 44008 + }, + { + "epoch": 0.44009, + "grad_norm": 1.0700420889104256, + "learning_rate": 0.003, + "loss": 4.076, + "step": 44009 + }, + { + "epoch": 0.4401, + "grad_norm": 1.0368338156205834, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 44010 + }, + { + "epoch": 0.44011, + "grad_norm": 0.9423012438482538, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 44011 + }, + { + "epoch": 0.44012, + "grad_norm": 0.9911320371904695, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 44012 + }, + { + "epoch": 0.44013, + "grad_norm": 1.0591318878649278, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 44013 + }, + { + "epoch": 0.44014, + "grad_norm": 0.9230287843700801, + "learning_rate": 0.003, + "loss": 4.042, + "step": 44014 + }, + { + "epoch": 0.44015, + "grad_norm": 0.8762694761693124, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 44015 + }, + { + "epoch": 0.44016, + "grad_norm": 0.8908073296429491, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 44016 + }, + { + "epoch": 0.44017, + "grad_norm": 0.8342492351734107, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 44017 + }, + { + "epoch": 0.44018, + "grad_norm": 0.899625885932149, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 44018 + }, + { + "epoch": 0.44019, + "grad_norm": 0.9158291906504674, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 44019 + }, + { + "epoch": 0.4402, + "grad_norm": 0.7403081112718825, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 44020 + }, + { + "epoch": 0.44021, + "grad_norm": 0.5906215248144588, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 44021 + }, + { + "epoch": 0.44022, + "grad_norm": 0.6167954293849912, + "learning_rate": 0.003, + "loss": 4.07, + "step": 44022 + }, + { + "epoch": 0.44023, + "grad_norm": 0.6497921118741021, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 44023 + }, + { + "epoch": 0.44024, + "grad_norm": 0.7219202252853177, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 44024 + }, + { + "epoch": 0.44025, + "grad_norm": 0.7624696474336745, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 44025 + }, + { + "epoch": 0.44026, + "grad_norm": 0.701802337066599, + "learning_rate": 0.003, + "loss": 4.018, + "step": 44026 + }, + { + "epoch": 0.44027, + "grad_norm": 0.6962077183773769, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 44027 + }, + { + "epoch": 0.44028, + "grad_norm": 0.7070294518934406, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 44028 + }, + { + "epoch": 0.44029, + "grad_norm": 0.7756729524112027, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 44029 + }, + { + "epoch": 0.4403, + "grad_norm": 0.840575859540335, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 44030 + }, + { + "epoch": 0.44031, + "grad_norm": 0.9157522321181899, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 44031 + }, + { + "epoch": 0.44032, + "grad_norm": 0.9629665632826938, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 44032 + }, + { + "epoch": 0.44033, + "grad_norm": 0.8295150922252873, + "learning_rate": 0.003, + "loss": 4.037, + "step": 44033 + }, + { + "epoch": 0.44034, + "grad_norm": 0.6873440284989509, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 44034 + }, + { + "epoch": 0.44035, + "grad_norm": 0.7594352420548086, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 44035 + }, + { + "epoch": 0.44036, + "grad_norm": 0.8434063380578891, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 44036 + }, + { + "epoch": 0.44037, + "grad_norm": 0.8199606448368207, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 44037 + }, + { + "epoch": 0.44038, + "grad_norm": 0.7636111953153023, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 44038 + }, + { + "epoch": 0.44039, + "grad_norm": 0.7210667044730376, + "learning_rate": 0.003, + "loss": 3.9769, + "step": 44039 + }, + { + "epoch": 0.4404, + "grad_norm": 0.6334176121354617, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 44040 + }, + { + "epoch": 0.44041, + "grad_norm": 0.6348829932843202, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 44041 + }, + { + "epoch": 0.44042, + "grad_norm": 0.6482711434882571, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 44042 + }, + { + "epoch": 0.44043, + "grad_norm": 0.6113057099530907, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 44043 + }, + { + "epoch": 0.44044, + "grad_norm": 0.6341282530890122, + "learning_rate": 0.003, + "loss": 3.9925, + "step": 44044 + }, + { + "epoch": 0.44045, + "grad_norm": 0.6585254683055857, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 44045 + }, + { + "epoch": 0.44046, + "grad_norm": 0.6216708577423955, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 44046 + }, + { + "epoch": 0.44047, + "grad_norm": 0.5971408056838667, + "learning_rate": 0.003, + "loss": 3.9921, + "step": 44047 + }, + { + "epoch": 0.44048, + "grad_norm": 0.6590403295658205, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 44048 + }, + { + "epoch": 0.44049, + "grad_norm": 0.7270435466298605, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 44049 + }, + { + "epoch": 0.4405, + "grad_norm": 0.8797756765733789, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 44050 + }, + { + "epoch": 0.44051, + "grad_norm": 1.1267053006058672, + "learning_rate": 0.003, + "loss": 4.033, + "step": 44051 + }, + { + "epoch": 0.44052, + "grad_norm": 0.972856157818856, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 44052 + }, + { + "epoch": 0.44053, + "grad_norm": 1.0178459073847326, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 44053 + }, + { + "epoch": 0.44054, + "grad_norm": 0.9748634177046103, + "learning_rate": 0.003, + "loss": 4.0028, + "step": 44054 + }, + { + "epoch": 0.44055, + "grad_norm": 0.9208474615784734, + "learning_rate": 0.003, + "loss": 4.013, + "step": 44055 + }, + { + "epoch": 0.44056, + "grad_norm": 0.8319301184748619, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 44056 + }, + { + "epoch": 0.44057, + "grad_norm": 0.9101830751753988, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 44057 + }, + { + "epoch": 0.44058, + "grad_norm": 0.9169840301461112, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 44058 + }, + { + "epoch": 0.44059, + "grad_norm": 0.8580214226198026, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 44059 + }, + { + "epoch": 0.4406, + "grad_norm": 0.9111933662865612, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 44060 + }, + { + "epoch": 0.44061, + "grad_norm": 1.0158250668464028, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 44061 + }, + { + "epoch": 0.44062, + "grad_norm": 1.0870995523692588, + "learning_rate": 0.003, + "loss": 4.0019, + "step": 44062 + }, + { + "epoch": 0.44063, + "grad_norm": 0.9009250760247202, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 44063 + }, + { + "epoch": 0.44064, + "grad_norm": 0.9564226094249402, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 44064 + }, + { + "epoch": 0.44065, + "grad_norm": 0.8232768700568186, + "learning_rate": 0.003, + "loss": 4.026, + "step": 44065 + }, + { + "epoch": 0.44066, + "grad_norm": 0.7640245292089429, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 44066 + }, + { + "epoch": 0.44067, + "grad_norm": 0.7441609835689376, + "learning_rate": 0.003, + "loss": 4.0653, + "step": 44067 + }, + { + "epoch": 0.44068, + "grad_norm": 0.6767589318931856, + "learning_rate": 0.003, + "loss": 4.026, + "step": 44068 + }, + { + "epoch": 0.44069, + "grad_norm": 0.7490072338930779, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 44069 + }, + { + "epoch": 0.4407, + "grad_norm": 0.8813050132563559, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 44070 + }, + { + "epoch": 0.44071, + "grad_norm": 1.032517125826434, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 44071 + }, + { + "epoch": 0.44072, + "grad_norm": 1.0306460090780518, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 44072 + }, + { + "epoch": 0.44073, + "grad_norm": 1.0247931221422466, + "learning_rate": 0.003, + "loss": 4.071, + "step": 44073 + }, + { + "epoch": 0.44074, + "grad_norm": 0.9697971178027586, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 44074 + }, + { + "epoch": 0.44075, + "grad_norm": 0.7729719602767091, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 44075 + }, + { + "epoch": 0.44076, + "grad_norm": 0.6809309274810565, + "learning_rate": 0.003, + "loss": 4.046, + "step": 44076 + }, + { + "epoch": 0.44077, + "grad_norm": 0.651799836818177, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 44077 + }, + { + "epoch": 0.44078, + "grad_norm": 0.5736259387091623, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 44078 + }, + { + "epoch": 0.44079, + "grad_norm": 0.5954762411192951, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 44079 + }, + { + "epoch": 0.4408, + "grad_norm": 0.5515138452964236, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 44080 + }, + { + "epoch": 0.44081, + "grad_norm": 0.664401329610333, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 44081 + }, + { + "epoch": 0.44082, + "grad_norm": 0.645954233630572, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 44082 + }, + { + "epoch": 0.44083, + "grad_norm": 0.580223327558829, + "learning_rate": 0.003, + "loss": 4.0061, + "step": 44083 + }, + { + "epoch": 0.44084, + "grad_norm": 0.6537686039791832, + "learning_rate": 0.003, + "loss": 4.018, + "step": 44084 + }, + { + "epoch": 0.44085, + "grad_norm": 0.7409869560928374, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 44085 + }, + { + "epoch": 0.44086, + "grad_norm": 0.8693128232135663, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 44086 + }, + { + "epoch": 0.44087, + "grad_norm": 0.9534333004786673, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 44087 + }, + { + "epoch": 0.44088, + "grad_norm": 0.9781705396803522, + "learning_rate": 0.003, + "loss": 4.033, + "step": 44088 + }, + { + "epoch": 0.44089, + "grad_norm": 0.9554425155462907, + "learning_rate": 0.003, + "loss": 4.018, + "step": 44089 + }, + { + "epoch": 0.4409, + "grad_norm": 1.077275177393194, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 44090 + }, + { + "epoch": 0.44091, + "grad_norm": 0.9774075605874751, + "learning_rate": 0.003, + "loss": 4.078, + "step": 44091 + }, + { + "epoch": 0.44092, + "grad_norm": 1.0022872498214537, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 44092 + }, + { + "epoch": 0.44093, + "grad_norm": 0.8813736750491308, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 44093 + }, + { + "epoch": 0.44094, + "grad_norm": 0.7410336864778546, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 44094 + }, + { + "epoch": 0.44095, + "grad_norm": 0.720465020412799, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 44095 + }, + { + "epoch": 0.44096, + "grad_norm": 0.7206843004098169, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 44096 + }, + { + "epoch": 0.44097, + "grad_norm": 0.7722497603871079, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 44097 + }, + { + "epoch": 0.44098, + "grad_norm": 0.8350371910114411, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 44098 + }, + { + "epoch": 0.44099, + "grad_norm": 1.1287183661577505, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 44099 + }, + { + "epoch": 0.441, + "grad_norm": 1.1273392154640787, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 44100 + }, + { + "epoch": 0.44101, + "grad_norm": 0.8964338374840172, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 44101 + }, + { + "epoch": 0.44102, + "grad_norm": 0.7552642351540583, + "learning_rate": 0.003, + "loss": 3.9967, + "step": 44102 + }, + { + "epoch": 0.44103, + "grad_norm": 0.7360483748821005, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 44103 + }, + { + "epoch": 0.44104, + "grad_norm": 0.730940643586094, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 44104 + }, + { + "epoch": 0.44105, + "grad_norm": 0.6919726938386156, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 44105 + }, + { + "epoch": 0.44106, + "grad_norm": 0.5843444993020321, + "learning_rate": 0.003, + "loss": 4.048, + "step": 44106 + }, + { + "epoch": 0.44107, + "grad_norm": 0.5802085579345326, + "learning_rate": 0.003, + "loss": 3.9994, + "step": 44107 + }, + { + "epoch": 0.44108, + "grad_norm": 0.6255829573172111, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 44108 + }, + { + "epoch": 0.44109, + "grad_norm": 0.688232343986272, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 44109 + }, + { + "epoch": 0.4411, + "grad_norm": 0.6647703920669736, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 44110 + }, + { + "epoch": 0.44111, + "grad_norm": 0.7129443133310871, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 44111 + }, + { + "epoch": 0.44112, + "grad_norm": 0.7294200697548063, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 44112 + }, + { + "epoch": 0.44113, + "grad_norm": 0.7505677945151662, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 44113 + }, + { + "epoch": 0.44114, + "grad_norm": 0.8220066530742626, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 44114 + }, + { + "epoch": 0.44115, + "grad_norm": 1.0991577075354462, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 44115 + }, + { + "epoch": 0.44116, + "grad_norm": 1.0030719087071356, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 44116 + }, + { + "epoch": 0.44117, + "grad_norm": 0.9065319649401671, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 44117 + }, + { + "epoch": 0.44118, + "grad_norm": 0.9921320413449829, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 44118 + }, + { + "epoch": 0.44119, + "grad_norm": 1.0265076443869934, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 44119 + }, + { + "epoch": 0.4412, + "grad_norm": 0.9319391776795571, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 44120 + }, + { + "epoch": 0.44121, + "grad_norm": 0.8574497606256603, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 44121 + }, + { + "epoch": 0.44122, + "grad_norm": 0.8329915150453361, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 44122 + }, + { + "epoch": 0.44123, + "grad_norm": 0.9769086365502375, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 44123 + }, + { + "epoch": 0.44124, + "grad_norm": 1.0433044380780974, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 44124 + }, + { + "epoch": 0.44125, + "grad_norm": 1.0049898440787517, + "learning_rate": 0.003, + "loss": 4.037, + "step": 44125 + }, + { + "epoch": 0.44126, + "grad_norm": 0.9430989791556379, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 44126 + }, + { + "epoch": 0.44127, + "grad_norm": 0.8671644031359429, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 44127 + }, + { + "epoch": 0.44128, + "grad_norm": 0.9459041926021475, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 44128 + }, + { + "epoch": 0.44129, + "grad_norm": 1.0532107461136133, + "learning_rate": 0.003, + "loss": 4.0741, + "step": 44129 + }, + { + "epoch": 0.4413, + "grad_norm": 0.980283668603673, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 44130 + }, + { + "epoch": 0.44131, + "grad_norm": 1.012558394881445, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 44131 + }, + { + "epoch": 0.44132, + "grad_norm": 1.1183273924500985, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 44132 + }, + { + "epoch": 0.44133, + "grad_norm": 0.8523837400329118, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 44133 + }, + { + "epoch": 0.44134, + "grad_norm": 0.866866321812697, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 44134 + }, + { + "epoch": 0.44135, + "grad_norm": 0.8799490375786783, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 44135 + }, + { + "epoch": 0.44136, + "grad_norm": 1.028042712585663, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 44136 + }, + { + "epoch": 0.44137, + "grad_norm": 1.0345122042122517, + "learning_rate": 0.003, + "loss": 4.0515, + "step": 44137 + }, + { + "epoch": 0.44138, + "grad_norm": 0.9335483825511706, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 44138 + }, + { + "epoch": 0.44139, + "grad_norm": 0.9972812144163915, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 44139 + }, + { + "epoch": 0.4414, + "grad_norm": 1.1767102338525122, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 44140 + }, + { + "epoch": 0.44141, + "grad_norm": 0.97456858186008, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 44141 + }, + { + "epoch": 0.44142, + "grad_norm": 0.9976821378903761, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 44142 + }, + { + "epoch": 0.44143, + "grad_norm": 0.935185081198074, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 44143 + }, + { + "epoch": 0.44144, + "grad_norm": 0.7826314651121216, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 44144 + }, + { + "epoch": 0.44145, + "grad_norm": 0.6676157810072233, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 44145 + }, + { + "epoch": 0.44146, + "grad_norm": 0.630405524911028, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 44146 + }, + { + "epoch": 0.44147, + "grad_norm": 0.6299893639590376, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 44147 + }, + { + "epoch": 0.44148, + "grad_norm": 0.6156878523021655, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 44148 + }, + { + "epoch": 0.44149, + "grad_norm": 0.7456061173201373, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 44149 + }, + { + "epoch": 0.4415, + "grad_norm": 0.75597328865923, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 44150 + }, + { + "epoch": 0.44151, + "grad_norm": 0.652285077830266, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 44151 + }, + { + "epoch": 0.44152, + "grad_norm": 0.6360767382222805, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 44152 + }, + { + "epoch": 0.44153, + "grad_norm": 0.5998174756435739, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 44153 + }, + { + "epoch": 0.44154, + "grad_norm": 0.6013300144491283, + "learning_rate": 0.003, + "loss": 4.0599, + "step": 44154 + }, + { + "epoch": 0.44155, + "grad_norm": 0.5558128640689006, + "learning_rate": 0.003, + "loss": 4.045, + "step": 44155 + }, + { + "epoch": 0.44156, + "grad_norm": 0.5837262017782773, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 44156 + }, + { + "epoch": 0.44157, + "grad_norm": 0.6547134291853359, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 44157 + }, + { + "epoch": 0.44158, + "grad_norm": 0.8210190748059952, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 44158 + }, + { + "epoch": 0.44159, + "grad_norm": 1.038102123922282, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 44159 + }, + { + "epoch": 0.4416, + "grad_norm": 1.2079296584173542, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 44160 + }, + { + "epoch": 0.44161, + "grad_norm": 0.7895895773408379, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 44161 + }, + { + "epoch": 0.44162, + "grad_norm": 0.6642443586734962, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 44162 + }, + { + "epoch": 0.44163, + "grad_norm": 0.6601375737164927, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 44163 + }, + { + "epoch": 0.44164, + "grad_norm": 0.5806172596855882, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 44164 + }, + { + "epoch": 0.44165, + "grad_norm": 0.5793804912615058, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 44165 + }, + { + "epoch": 0.44166, + "grad_norm": 0.5558016411302428, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 44166 + }, + { + "epoch": 0.44167, + "grad_norm": 0.6043677116361216, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 44167 + }, + { + "epoch": 0.44168, + "grad_norm": 0.5964144787158315, + "learning_rate": 0.003, + "loss": 3.996, + "step": 44168 + }, + { + "epoch": 0.44169, + "grad_norm": 0.7209457416493116, + "learning_rate": 0.003, + "loss": 4.055, + "step": 44169 + }, + { + "epoch": 0.4417, + "grad_norm": 0.7961039835906147, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 44170 + }, + { + "epoch": 0.44171, + "grad_norm": 0.8371213851185951, + "learning_rate": 0.003, + "loss": 4.032, + "step": 44171 + }, + { + "epoch": 0.44172, + "grad_norm": 1.0043666359133716, + "learning_rate": 0.003, + "loss": 4.023, + "step": 44172 + }, + { + "epoch": 0.44173, + "grad_norm": 1.183732520890307, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 44173 + }, + { + "epoch": 0.44174, + "grad_norm": 0.8540598278910649, + "learning_rate": 0.003, + "loss": 4.021, + "step": 44174 + }, + { + "epoch": 0.44175, + "grad_norm": 0.7189561098362562, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 44175 + }, + { + "epoch": 0.44176, + "grad_norm": 0.620722950309627, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 44176 + }, + { + "epoch": 0.44177, + "grad_norm": 0.6587456052196509, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 44177 + }, + { + "epoch": 0.44178, + "grad_norm": 0.6507678508504277, + "learning_rate": 0.003, + "loss": 3.9911, + "step": 44178 + }, + { + "epoch": 0.44179, + "grad_norm": 0.7038094367415944, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 44179 + }, + { + "epoch": 0.4418, + "grad_norm": 0.7707674673545267, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 44180 + }, + { + "epoch": 0.44181, + "grad_norm": 0.8315225421753778, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 44181 + }, + { + "epoch": 0.44182, + "grad_norm": 0.788665041207857, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 44182 + }, + { + "epoch": 0.44183, + "grad_norm": 0.816191410333943, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 44183 + }, + { + "epoch": 0.44184, + "grad_norm": 0.7898831088380259, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 44184 + }, + { + "epoch": 0.44185, + "grad_norm": 0.7577775584820949, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 44185 + }, + { + "epoch": 0.44186, + "grad_norm": 0.8696630972735656, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 44186 + }, + { + "epoch": 0.44187, + "grad_norm": 1.2084935245813577, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 44187 + }, + { + "epoch": 0.44188, + "grad_norm": 1.1408264728017299, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 44188 + }, + { + "epoch": 0.44189, + "grad_norm": 1.1114770323832417, + "learning_rate": 0.003, + "loss": 4.02, + "step": 44189 + }, + { + "epoch": 0.4419, + "grad_norm": 0.9538636654589197, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 44190 + }, + { + "epoch": 0.44191, + "grad_norm": 0.9001862608146143, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 44191 + }, + { + "epoch": 0.44192, + "grad_norm": 0.7767378963609212, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 44192 + }, + { + "epoch": 0.44193, + "grad_norm": 0.8352367346793931, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 44193 + }, + { + "epoch": 0.44194, + "grad_norm": 0.798554552628115, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 44194 + }, + { + "epoch": 0.44195, + "grad_norm": 0.7854515265642097, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 44195 + }, + { + "epoch": 0.44196, + "grad_norm": 0.8953177444795964, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 44196 + }, + { + "epoch": 0.44197, + "grad_norm": 1.000751598158447, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 44197 + }, + { + "epoch": 0.44198, + "grad_norm": 0.9843080329234434, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 44198 + }, + { + "epoch": 0.44199, + "grad_norm": 0.887115608674083, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 44199 + }, + { + "epoch": 0.442, + "grad_norm": 0.736356112473564, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 44200 + }, + { + "epoch": 0.44201, + "grad_norm": 0.8775694042352521, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 44201 + }, + { + "epoch": 0.44202, + "grad_norm": 0.9883733369280384, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 44202 + }, + { + "epoch": 0.44203, + "grad_norm": 1.0236890729323362, + "learning_rate": 0.003, + "loss": 4.0663, + "step": 44203 + }, + { + "epoch": 0.44204, + "grad_norm": 1.100001311088388, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 44204 + }, + { + "epoch": 0.44205, + "grad_norm": 0.832519533205425, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 44205 + }, + { + "epoch": 0.44206, + "grad_norm": 0.7539684033949031, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 44206 + }, + { + "epoch": 0.44207, + "grad_norm": 0.8459062998035037, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 44207 + }, + { + "epoch": 0.44208, + "grad_norm": 0.8283462374770796, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 44208 + }, + { + "epoch": 0.44209, + "grad_norm": 0.8151532493087859, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 44209 + }, + { + "epoch": 0.4421, + "grad_norm": 0.8442249711162754, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 44210 + }, + { + "epoch": 0.44211, + "grad_norm": 0.8349012942160521, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 44211 + }, + { + "epoch": 0.44212, + "grad_norm": 0.7926161085318991, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 44212 + }, + { + "epoch": 0.44213, + "grad_norm": 0.8372960301799078, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 44213 + }, + { + "epoch": 0.44214, + "grad_norm": 0.8522409372391001, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 44214 + }, + { + "epoch": 0.44215, + "grad_norm": 0.8583305214465287, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 44215 + }, + { + "epoch": 0.44216, + "grad_norm": 0.8462802881007329, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 44216 + }, + { + "epoch": 0.44217, + "grad_norm": 0.7929547303830656, + "learning_rate": 0.003, + "loss": 4.0044, + "step": 44217 + }, + { + "epoch": 0.44218, + "grad_norm": 0.6420753099036802, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 44218 + }, + { + "epoch": 0.44219, + "grad_norm": 0.824011822790601, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 44219 + }, + { + "epoch": 0.4422, + "grad_norm": 0.7285827220637117, + "learning_rate": 0.003, + "loss": 3.9962, + "step": 44220 + }, + { + "epoch": 0.44221, + "grad_norm": 0.7471471989414594, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 44221 + }, + { + "epoch": 0.44222, + "grad_norm": 0.8065261284188623, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 44222 + }, + { + "epoch": 0.44223, + "grad_norm": 1.0855014637033018, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 44223 + }, + { + "epoch": 0.44224, + "grad_norm": 2.214031226873047, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 44224 + }, + { + "epoch": 0.44225, + "grad_norm": 2.0940559964528696, + "learning_rate": 0.003, + "loss": 4.1485, + "step": 44225 + }, + { + "epoch": 0.44226, + "grad_norm": 1.12353463020469, + "learning_rate": 0.003, + "loss": 4.0968, + "step": 44226 + }, + { + "epoch": 0.44227, + "grad_norm": 1.7021714450010559, + "learning_rate": 0.003, + "loss": 4.1456, + "step": 44227 + }, + { + "epoch": 0.44228, + "grad_norm": 1.3197826047045849, + "learning_rate": 0.003, + "loss": 4.1179, + "step": 44228 + }, + { + "epoch": 0.44229, + "grad_norm": 1.5027738727915694, + "learning_rate": 0.003, + "loss": 4.114, + "step": 44229 + }, + { + "epoch": 0.4423, + "grad_norm": 1.9343452179974217, + "learning_rate": 0.003, + "loss": 4.179, + "step": 44230 + }, + { + "epoch": 0.44231, + "grad_norm": 1.6850081507585688, + "learning_rate": 0.003, + "loss": 4.1665, + "step": 44231 + }, + { + "epoch": 0.44232, + "grad_norm": 1.325292362664382, + "learning_rate": 0.003, + "loss": 4.1898, + "step": 44232 + }, + { + "epoch": 0.44233, + "grad_norm": 1.44331922714318, + "learning_rate": 0.003, + "loss": 4.1398, + "step": 44233 + }, + { + "epoch": 0.44234, + "grad_norm": 1.1526088448707332, + "learning_rate": 0.003, + "loss": 4.1112, + "step": 44234 + }, + { + "epoch": 0.44235, + "grad_norm": 1.6806661818275523, + "learning_rate": 0.003, + "loss": 4.1677, + "step": 44235 + }, + { + "epoch": 0.44236, + "grad_norm": 1.4237821446607632, + "learning_rate": 0.003, + "loss": 4.1416, + "step": 44236 + }, + { + "epoch": 0.44237, + "grad_norm": 1.4851803520277562, + "learning_rate": 0.003, + "loss": 4.1702, + "step": 44237 + }, + { + "epoch": 0.44238, + "grad_norm": 1.1355845139300933, + "learning_rate": 0.003, + "loss": 4.1548, + "step": 44238 + }, + { + "epoch": 0.44239, + "grad_norm": 1.399255545245979, + "learning_rate": 0.003, + "loss": 4.1219, + "step": 44239 + }, + { + "epoch": 0.4424, + "grad_norm": 1.2381999425329326, + "learning_rate": 0.003, + "loss": 4.1453, + "step": 44240 + }, + { + "epoch": 0.44241, + "grad_norm": 1.2181910524620758, + "learning_rate": 0.003, + "loss": 4.1234, + "step": 44241 + }, + { + "epoch": 0.44242, + "grad_norm": 1.331606063989834, + "learning_rate": 0.003, + "loss": 4.1336, + "step": 44242 + }, + { + "epoch": 0.44243, + "grad_norm": 1.5100011462858143, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 44243 + }, + { + "epoch": 0.44244, + "grad_norm": 1.3183445548024564, + "learning_rate": 0.003, + "loss": 4.1169, + "step": 44244 + }, + { + "epoch": 0.44245, + "grad_norm": 1.2249130425265204, + "learning_rate": 0.003, + "loss": 4.1301, + "step": 44245 + }, + { + "epoch": 0.44246, + "grad_norm": 0.9574264891070167, + "learning_rate": 0.003, + "loss": 4.11, + "step": 44246 + }, + { + "epoch": 0.44247, + "grad_norm": 1.182764133860245, + "learning_rate": 0.003, + "loss": 4.1165, + "step": 44247 + }, + { + "epoch": 0.44248, + "grad_norm": 1.1999872758062193, + "learning_rate": 0.003, + "loss": 4.1203, + "step": 44248 + }, + { + "epoch": 0.44249, + "grad_norm": 1.6339478557518732, + "learning_rate": 0.003, + "loss": 4.1246, + "step": 44249 + }, + { + "epoch": 0.4425, + "grad_norm": 1.0520458270669637, + "learning_rate": 0.003, + "loss": 4.1141, + "step": 44250 + }, + { + "epoch": 0.44251, + "grad_norm": 1.2119168947620906, + "learning_rate": 0.003, + "loss": 4.0844, + "step": 44251 + }, + { + "epoch": 0.44252, + "grad_norm": 1.0532344042950244, + "learning_rate": 0.003, + "loss": 4.1191, + "step": 44252 + }, + { + "epoch": 0.44253, + "grad_norm": 0.958243538614914, + "learning_rate": 0.003, + "loss": 4.0986, + "step": 44253 + }, + { + "epoch": 0.44254, + "grad_norm": 0.9891573646028046, + "learning_rate": 0.003, + "loss": 4.1002, + "step": 44254 + }, + { + "epoch": 0.44255, + "grad_norm": 1.201440561635195, + "learning_rate": 0.003, + "loss": 4.0939, + "step": 44255 + }, + { + "epoch": 0.44256, + "grad_norm": 1.0768236606881199, + "learning_rate": 0.003, + "loss": 4.1003, + "step": 44256 + }, + { + "epoch": 0.44257, + "grad_norm": 1.0646243142241687, + "learning_rate": 0.003, + "loss": 4.1028, + "step": 44257 + }, + { + "epoch": 0.44258, + "grad_norm": 1.0223104856584797, + "learning_rate": 0.003, + "loss": 4.0909, + "step": 44258 + }, + { + "epoch": 0.44259, + "grad_norm": 0.9494313363666466, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 44259 + }, + { + "epoch": 0.4426, + "grad_norm": 0.9902708595218856, + "learning_rate": 0.003, + "loss": 4.1037, + "step": 44260 + }, + { + "epoch": 0.44261, + "grad_norm": 1.2101166796329124, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 44261 + }, + { + "epoch": 0.44262, + "grad_norm": 1.0297173799316957, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 44262 + }, + { + "epoch": 0.44263, + "grad_norm": 1.0377195558675518, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 44263 + }, + { + "epoch": 0.44264, + "grad_norm": 0.7829464179894648, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 44264 + }, + { + "epoch": 0.44265, + "grad_norm": 0.6649001309290009, + "learning_rate": 0.003, + "loss": 4.0624, + "step": 44265 + }, + { + "epoch": 0.44266, + "grad_norm": 0.584998828584141, + "learning_rate": 0.003, + "loss": 4.054, + "step": 44266 + }, + { + "epoch": 0.44267, + "grad_norm": 0.5001595134204864, + "learning_rate": 0.003, + "loss": 4.018, + "step": 44267 + }, + { + "epoch": 0.44268, + "grad_norm": 0.510174360103208, + "learning_rate": 0.003, + "loss": 4.0598, + "step": 44268 + }, + { + "epoch": 0.44269, + "grad_norm": 0.4795306510976547, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 44269 + }, + { + "epoch": 0.4427, + "grad_norm": 0.4497096890730822, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 44270 + }, + { + "epoch": 0.44271, + "grad_norm": 0.4128894250869866, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 44271 + }, + { + "epoch": 0.44272, + "grad_norm": 0.3866716699808721, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 44272 + }, + { + "epoch": 0.44273, + "grad_norm": 0.3938849057710849, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 44273 + }, + { + "epoch": 0.44274, + "grad_norm": 0.36648478382764405, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 44274 + }, + { + "epoch": 0.44275, + "grad_norm": 0.35519892789786467, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 44275 + }, + { + "epoch": 0.44276, + "grad_norm": 0.3570101830735636, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 44276 + }, + { + "epoch": 0.44277, + "grad_norm": 0.36661650748748115, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 44277 + }, + { + "epoch": 0.44278, + "grad_norm": 0.3724831107269321, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 44278 + }, + { + "epoch": 0.44279, + "grad_norm": 0.42818177957021125, + "learning_rate": 0.003, + "loss": 3.9884, + "step": 44279 + }, + { + "epoch": 0.4428, + "grad_norm": 0.5712892836649063, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 44280 + }, + { + "epoch": 0.44281, + "grad_norm": 0.8455407504843352, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 44281 + }, + { + "epoch": 0.44282, + "grad_norm": 1.0503181914866535, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 44282 + }, + { + "epoch": 0.44283, + "grad_norm": 1.0268093995823633, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 44283 + }, + { + "epoch": 0.44284, + "grad_norm": 0.6952404089302443, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 44284 + }, + { + "epoch": 0.44285, + "grad_norm": 0.4903090834061082, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 44285 + }, + { + "epoch": 0.44286, + "grad_norm": 0.8347874789819708, + "learning_rate": 0.003, + "loss": 4.029, + "step": 44286 + }, + { + "epoch": 0.44287, + "grad_norm": 0.8343177126044634, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 44287 + }, + { + "epoch": 0.44288, + "grad_norm": 0.7039820861773362, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 44288 + }, + { + "epoch": 0.44289, + "grad_norm": 0.6578167606601668, + "learning_rate": 0.003, + "loss": 4.021, + "step": 44289 + }, + { + "epoch": 0.4429, + "grad_norm": 0.6657412264549696, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 44290 + }, + { + "epoch": 0.44291, + "grad_norm": 0.6267773601121628, + "learning_rate": 0.003, + "loss": 3.9963, + "step": 44291 + }, + { + "epoch": 0.44292, + "grad_norm": 0.6122287996282593, + "learning_rate": 0.003, + "loss": 4.027, + "step": 44292 + }, + { + "epoch": 0.44293, + "grad_norm": 0.6077558952767155, + "learning_rate": 0.003, + "loss": 3.9994, + "step": 44293 + }, + { + "epoch": 0.44294, + "grad_norm": 0.6614015575104383, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 44294 + }, + { + "epoch": 0.44295, + "grad_norm": 0.7788706364643359, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 44295 + }, + { + "epoch": 0.44296, + "grad_norm": 0.9386057882862827, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 44296 + }, + { + "epoch": 0.44297, + "grad_norm": 1.107436242522669, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 44297 + }, + { + "epoch": 0.44298, + "grad_norm": 0.7393772981962613, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 44298 + }, + { + "epoch": 0.44299, + "grad_norm": 0.6325798729154534, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 44299 + }, + { + "epoch": 0.443, + "grad_norm": 0.710370787325055, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 44300 + }, + { + "epoch": 0.44301, + "grad_norm": 0.6823597115991785, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 44301 + }, + { + "epoch": 0.44302, + "grad_norm": 0.71348902542274, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 44302 + }, + { + "epoch": 0.44303, + "grad_norm": 0.8581043126493803, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 44303 + }, + { + "epoch": 0.44304, + "grad_norm": 0.841126082960817, + "learning_rate": 0.003, + "loss": 4.03, + "step": 44304 + }, + { + "epoch": 0.44305, + "grad_norm": 0.7417105110950916, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 44305 + }, + { + "epoch": 0.44306, + "grad_norm": 0.8992576838434472, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 44306 + }, + { + "epoch": 0.44307, + "grad_norm": 1.122048682218649, + "learning_rate": 0.003, + "loss": 4.047, + "step": 44307 + }, + { + "epoch": 0.44308, + "grad_norm": 0.8589522295665122, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 44308 + }, + { + "epoch": 0.44309, + "grad_norm": 0.7055435410177899, + "learning_rate": 0.003, + "loss": 4.005, + "step": 44309 + }, + { + "epoch": 0.4431, + "grad_norm": 0.7217252557939536, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 44310 + }, + { + "epoch": 0.44311, + "grad_norm": 0.5870960611916612, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 44311 + }, + { + "epoch": 0.44312, + "grad_norm": 0.5515210503949782, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 44312 + }, + { + "epoch": 0.44313, + "grad_norm": 0.44155959759125385, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 44313 + }, + { + "epoch": 0.44314, + "grad_norm": 0.4644352508792665, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 44314 + }, + { + "epoch": 0.44315, + "grad_norm": 0.5048065643141838, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 44315 + }, + { + "epoch": 0.44316, + "grad_norm": 0.5319968697665245, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 44316 + }, + { + "epoch": 0.44317, + "grad_norm": 0.58846147773927, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 44317 + }, + { + "epoch": 0.44318, + "grad_norm": 0.6573900926238004, + "learning_rate": 0.003, + "loss": 4.0078, + "step": 44318 + }, + { + "epoch": 0.44319, + "grad_norm": 0.7546684979998195, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 44319 + }, + { + "epoch": 0.4432, + "grad_norm": 0.9618276399184054, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 44320 + }, + { + "epoch": 0.44321, + "grad_norm": 1.0649115214621785, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 44321 + }, + { + "epoch": 0.44322, + "grad_norm": 0.8054061352570472, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 44322 + }, + { + "epoch": 0.44323, + "grad_norm": 0.6504103707704175, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 44323 + }, + { + "epoch": 0.44324, + "grad_norm": 0.6631017946314723, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 44324 + }, + { + "epoch": 0.44325, + "grad_norm": 0.7106522060779018, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 44325 + }, + { + "epoch": 0.44326, + "grad_norm": 0.6108486468150937, + "learning_rate": 0.003, + "loss": 4.0066, + "step": 44326 + }, + { + "epoch": 0.44327, + "grad_norm": 0.6576285162346959, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 44327 + }, + { + "epoch": 0.44328, + "grad_norm": 0.7714845848039646, + "learning_rate": 0.003, + "loss": 3.964, + "step": 44328 + }, + { + "epoch": 0.44329, + "grad_norm": 0.7435813572359677, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 44329 + }, + { + "epoch": 0.4433, + "grad_norm": 0.8127523786668818, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 44330 + }, + { + "epoch": 0.44331, + "grad_norm": 0.8819561524312759, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 44331 + }, + { + "epoch": 0.44332, + "grad_norm": 0.9165297716123811, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 44332 + }, + { + "epoch": 0.44333, + "grad_norm": 1.016619781145086, + "learning_rate": 0.003, + "loss": 4.036, + "step": 44333 + }, + { + "epoch": 0.44334, + "grad_norm": 0.9443676971044683, + "learning_rate": 0.003, + "loss": 3.998, + "step": 44334 + }, + { + "epoch": 0.44335, + "grad_norm": 0.8757703765709494, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 44335 + }, + { + "epoch": 0.44336, + "grad_norm": 0.7911191152996383, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 44336 + }, + { + "epoch": 0.44337, + "grad_norm": 0.7826622511911204, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 44337 + }, + { + "epoch": 0.44338, + "grad_norm": 0.8663829680924489, + "learning_rate": 0.003, + "loss": 3.9907, + "step": 44338 + }, + { + "epoch": 0.44339, + "grad_norm": 0.8174342025483102, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 44339 + }, + { + "epoch": 0.4434, + "grad_norm": 0.7039553050130382, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 44340 + }, + { + "epoch": 0.44341, + "grad_norm": 0.6694321583994047, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 44341 + }, + { + "epoch": 0.44342, + "grad_norm": 0.6393189443394405, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 44342 + }, + { + "epoch": 0.44343, + "grad_norm": 0.5662649137012838, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 44343 + }, + { + "epoch": 0.44344, + "grad_norm": 0.5744471821950582, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 44344 + }, + { + "epoch": 0.44345, + "grad_norm": 0.6197950423924945, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 44345 + }, + { + "epoch": 0.44346, + "grad_norm": 0.6695170953339198, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 44346 + }, + { + "epoch": 0.44347, + "grad_norm": 0.6584849779815161, + "learning_rate": 0.003, + "loss": 3.983, + "step": 44347 + }, + { + "epoch": 0.44348, + "grad_norm": 0.7349440007759218, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 44348 + }, + { + "epoch": 0.44349, + "grad_norm": 0.809724018236151, + "learning_rate": 0.003, + "loss": 4.029, + "step": 44349 + }, + { + "epoch": 0.4435, + "grad_norm": 1.0722385278663036, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 44350 + }, + { + "epoch": 0.44351, + "grad_norm": 1.3912435133882342, + "learning_rate": 0.003, + "loss": 4.03, + "step": 44351 + }, + { + "epoch": 0.44352, + "grad_norm": 0.5688999696612269, + "learning_rate": 0.003, + "loss": 4.029, + "step": 44352 + }, + { + "epoch": 0.44353, + "grad_norm": 0.9158927301552607, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 44353 + }, + { + "epoch": 0.44354, + "grad_norm": 1.1382894934714296, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 44354 + }, + { + "epoch": 0.44355, + "grad_norm": 0.8331964562750014, + "learning_rate": 0.003, + "loss": 4.031, + "step": 44355 + }, + { + "epoch": 0.44356, + "grad_norm": 0.7160348268467149, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 44356 + }, + { + "epoch": 0.44357, + "grad_norm": 0.7085056006829482, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 44357 + }, + { + "epoch": 0.44358, + "grad_norm": 0.7446821615103703, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 44358 + }, + { + "epoch": 0.44359, + "grad_norm": 0.6452752627001599, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 44359 + }, + { + "epoch": 0.4436, + "grad_norm": 0.6433139095387843, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 44360 + }, + { + "epoch": 0.44361, + "grad_norm": 0.6844887200684246, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 44361 + }, + { + "epoch": 0.44362, + "grad_norm": 0.7255008740767797, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 44362 + }, + { + "epoch": 0.44363, + "grad_norm": 0.7804167465216808, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 44363 + }, + { + "epoch": 0.44364, + "grad_norm": 0.7352994447541847, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 44364 + }, + { + "epoch": 0.44365, + "grad_norm": 0.7941262773812158, + "learning_rate": 0.003, + "loss": 3.9741, + "step": 44365 + }, + { + "epoch": 0.44366, + "grad_norm": 0.9400570954464328, + "learning_rate": 0.003, + "loss": 4.0855, + "step": 44366 + }, + { + "epoch": 0.44367, + "grad_norm": 1.1086803796583748, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 44367 + }, + { + "epoch": 0.44368, + "grad_norm": 0.9108824194430957, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 44368 + }, + { + "epoch": 0.44369, + "grad_norm": 0.8214868103772454, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 44369 + }, + { + "epoch": 0.4437, + "grad_norm": 0.8167811344965532, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 44370 + }, + { + "epoch": 0.44371, + "grad_norm": 0.7154007807298587, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 44371 + }, + { + "epoch": 0.44372, + "grad_norm": 0.6505984574077041, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 44372 + }, + { + "epoch": 0.44373, + "grad_norm": 0.6183379096685941, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 44373 + }, + { + "epoch": 0.44374, + "grad_norm": 0.6703460510075526, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 44374 + }, + { + "epoch": 0.44375, + "grad_norm": 0.71851565029464, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 44375 + }, + { + "epoch": 0.44376, + "grad_norm": 0.7255509659540043, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 44376 + }, + { + "epoch": 0.44377, + "grad_norm": 0.8139815247802745, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 44377 + }, + { + "epoch": 0.44378, + "grad_norm": 0.983678146750607, + "learning_rate": 0.003, + "loss": 3.9946, + "step": 44378 + }, + { + "epoch": 0.44379, + "grad_norm": 1.3422572404435809, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 44379 + }, + { + "epoch": 0.4438, + "grad_norm": 0.7781310142596081, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 44380 + }, + { + "epoch": 0.44381, + "grad_norm": 0.7972851412941543, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 44381 + }, + { + "epoch": 0.44382, + "grad_norm": 0.8484387969028026, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 44382 + }, + { + "epoch": 0.44383, + "grad_norm": 0.8614801340797478, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 44383 + }, + { + "epoch": 0.44384, + "grad_norm": 0.8745579429517959, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 44384 + }, + { + "epoch": 0.44385, + "grad_norm": 0.8052119839683518, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 44385 + }, + { + "epoch": 0.44386, + "grad_norm": 0.9538833691641448, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 44386 + }, + { + "epoch": 0.44387, + "grad_norm": 0.8675266017965522, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 44387 + }, + { + "epoch": 0.44388, + "grad_norm": 0.8350713571168558, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 44388 + }, + { + "epoch": 0.44389, + "grad_norm": 0.7467196403455434, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 44389 + }, + { + "epoch": 0.4439, + "grad_norm": 0.7474175329055305, + "learning_rate": 0.003, + "loss": 4.0737, + "step": 44390 + }, + { + "epoch": 0.44391, + "grad_norm": 0.870883113394127, + "learning_rate": 0.003, + "loss": 4.029, + "step": 44391 + }, + { + "epoch": 0.44392, + "grad_norm": 0.9910352988238785, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 44392 + }, + { + "epoch": 0.44393, + "grad_norm": 1.1878030055005608, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 44393 + }, + { + "epoch": 0.44394, + "grad_norm": 0.6884578268805556, + "learning_rate": 0.003, + "loss": 4.004, + "step": 44394 + }, + { + "epoch": 0.44395, + "grad_norm": 0.5912080375102118, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 44395 + }, + { + "epoch": 0.44396, + "grad_norm": 0.7451519236244057, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 44396 + }, + { + "epoch": 0.44397, + "grad_norm": 0.7299306603881872, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 44397 + }, + { + "epoch": 0.44398, + "grad_norm": 0.7391674865522504, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 44398 + }, + { + "epoch": 0.44399, + "grad_norm": 0.7711165575187882, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 44399 + }, + { + "epoch": 0.444, + "grad_norm": 0.7269897935273801, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 44400 + }, + { + "epoch": 0.44401, + "grad_norm": 0.7214977331764962, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 44401 + }, + { + "epoch": 0.44402, + "grad_norm": 0.7025505072774482, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 44402 + }, + { + "epoch": 0.44403, + "grad_norm": 0.8248729015258707, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 44403 + }, + { + "epoch": 0.44404, + "grad_norm": 1.0140354752657605, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 44404 + }, + { + "epoch": 0.44405, + "grad_norm": 1.041494202156674, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 44405 + }, + { + "epoch": 0.44406, + "grad_norm": 0.9440309048649134, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 44406 + }, + { + "epoch": 0.44407, + "grad_norm": 0.910784316420292, + "learning_rate": 0.003, + "loss": 4.024, + "step": 44407 + }, + { + "epoch": 0.44408, + "grad_norm": 0.8976591040218382, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 44408 + }, + { + "epoch": 0.44409, + "grad_norm": 0.7732818681301713, + "learning_rate": 0.003, + "loss": 4.027, + "step": 44409 + }, + { + "epoch": 0.4441, + "grad_norm": 0.782337843401537, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 44410 + }, + { + "epoch": 0.44411, + "grad_norm": 0.7404712744030226, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 44411 + }, + { + "epoch": 0.44412, + "grad_norm": 0.7953649057619688, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 44412 + }, + { + "epoch": 0.44413, + "grad_norm": 0.9402942407342416, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 44413 + }, + { + "epoch": 0.44414, + "grad_norm": 1.092053736711565, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 44414 + }, + { + "epoch": 0.44415, + "grad_norm": 0.9802529062766315, + "learning_rate": 0.003, + "loss": 4.044, + "step": 44415 + }, + { + "epoch": 0.44416, + "grad_norm": 0.8396756873758192, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 44416 + }, + { + "epoch": 0.44417, + "grad_norm": 0.7571263741245011, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 44417 + }, + { + "epoch": 0.44418, + "grad_norm": 0.7993129451630957, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 44418 + }, + { + "epoch": 0.44419, + "grad_norm": 0.7182595555262425, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 44419 + }, + { + "epoch": 0.4442, + "grad_norm": 0.6173658637106689, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 44420 + }, + { + "epoch": 0.44421, + "grad_norm": 0.7076391613189558, + "learning_rate": 0.003, + "loss": 4.009, + "step": 44421 + }, + { + "epoch": 0.44422, + "grad_norm": 0.7578150620222969, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 44422 + }, + { + "epoch": 0.44423, + "grad_norm": 0.9401549472107357, + "learning_rate": 0.003, + "loss": 3.9953, + "step": 44423 + }, + { + "epoch": 0.44424, + "grad_norm": 1.269223099552251, + "learning_rate": 0.003, + "loss": 4.018, + "step": 44424 + }, + { + "epoch": 0.44425, + "grad_norm": 0.7445308261537333, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 44425 + }, + { + "epoch": 0.44426, + "grad_norm": 0.6979135333609826, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 44426 + }, + { + "epoch": 0.44427, + "grad_norm": 0.7469294162198064, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 44427 + }, + { + "epoch": 0.44428, + "grad_norm": 0.6720931430452383, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 44428 + }, + { + "epoch": 0.44429, + "grad_norm": 0.6768494831533021, + "learning_rate": 0.003, + "loss": 4.052, + "step": 44429 + }, + { + "epoch": 0.4443, + "grad_norm": 0.7516347879822951, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 44430 + }, + { + "epoch": 0.44431, + "grad_norm": 0.7677962269702681, + "learning_rate": 0.003, + "loss": 4.028, + "step": 44431 + }, + { + "epoch": 0.44432, + "grad_norm": 0.7286441050137331, + "learning_rate": 0.003, + "loss": 4.036, + "step": 44432 + }, + { + "epoch": 0.44433, + "grad_norm": 0.7124050050206989, + "learning_rate": 0.003, + "loss": 3.9883, + "step": 44433 + }, + { + "epoch": 0.44434, + "grad_norm": 0.7033908481619763, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 44434 + }, + { + "epoch": 0.44435, + "grad_norm": 0.6519760460206762, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 44435 + }, + { + "epoch": 0.44436, + "grad_norm": 0.6565859266658454, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 44436 + }, + { + "epoch": 0.44437, + "grad_norm": 0.5569348653267735, + "learning_rate": 0.003, + "loss": 3.9955, + "step": 44437 + }, + { + "epoch": 0.44438, + "grad_norm": 0.5591253104001882, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 44438 + }, + { + "epoch": 0.44439, + "grad_norm": 0.6379153279447578, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 44439 + }, + { + "epoch": 0.4444, + "grad_norm": 0.8556795971360028, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 44440 + }, + { + "epoch": 0.44441, + "grad_norm": 1.229474602886337, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 44441 + }, + { + "epoch": 0.44442, + "grad_norm": 0.9796047737550782, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 44442 + }, + { + "epoch": 0.44443, + "grad_norm": 0.9356099603186995, + "learning_rate": 0.003, + "loss": 4.031, + "step": 44443 + }, + { + "epoch": 0.44444, + "grad_norm": 0.7664028497283915, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 44444 + }, + { + "epoch": 0.44445, + "grad_norm": 0.7518138851233349, + "learning_rate": 0.003, + "loss": 3.9978, + "step": 44445 + }, + { + "epoch": 0.44446, + "grad_norm": 0.8999657331366984, + "learning_rate": 0.003, + "loss": 4.017, + "step": 44446 + }, + { + "epoch": 0.44447, + "grad_norm": 1.0393605282862133, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 44447 + }, + { + "epoch": 0.44448, + "grad_norm": 1.0426778442726359, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 44448 + }, + { + "epoch": 0.44449, + "grad_norm": 0.9253228860280135, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 44449 + }, + { + "epoch": 0.4445, + "grad_norm": 0.8669680464117784, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 44450 + }, + { + "epoch": 0.44451, + "grad_norm": 1.0045303886996633, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 44451 + }, + { + "epoch": 0.44452, + "grad_norm": 1.0429172277639516, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 44452 + }, + { + "epoch": 0.44453, + "grad_norm": 1.0366107718966817, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 44453 + }, + { + "epoch": 0.44454, + "grad_norm": 1.1021452305294586, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 44454 + }, + { + "epoch": 0.44455, + "grad_norm": 0.9297985806277069, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 44455 + }, + { + "epoch": 0.44456, + "grad_norm": 0.9666871724164356, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 44456 + }, + { + "epoch": 0.44457, + "grad_norm": 0.952230291062906, + "learning_rate": 0.003, + "loss": 4.051, + "step": 44457 + }, + { + "epoch": 0.44458, + "grad_norm": 1.0788646451196586, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 44458 + }, + { + "epoch": 0.44459, + "grad_norm": 1.0044452988948835, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 44459 + }, + { + "epoch": 0.4446, + "grad_norm": 0.9061837363582649, + "learning_rate": 0.003, + "loss": 4.009, + "step": 44460 + }, + { + "epoch": 0.44461, + "grad_norm": 0.8031612962263965, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 44461 + }, + { + "epoch": 0.44462, + "grad_norm": 0.7311169061339858, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 44462 + }, + { + "epoch": 0.44463, + "grad_norm": 0.7063581631917082, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 44463 + }, + { + "epoch": 0.44464, + "grad_norm": 0.8255479727996751, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 44464 + }, + { + "epoch": 0.44465, + "grad_norm": 0.8793641291330082, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 44465 + }, + { + "epoch": 0.44466, + "grad_norm": 0.7530910644387114, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 44466 + }, + { + "epoch": 0.44467, + "grad_norm": 0.7096245404087289, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 44467 + }, + { + "epoch": 0.44468, + "grad_norm": 0.6860678195232595, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 44468 + }, + { + "epoch": 0.44469, + "grad_norm": 0.6932010687674456, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 44469 + }, + { + "epoch": 0.4447, + "grad_norm": 0.7278935831868518, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 44470 + }, + { + "epoch": 0.44471, + "grad_norm": 0.8324878922613286, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 44471 + }, + { + "epoch": 0.44472, + "grad_norm": 0.8759293757241389, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 44472 + }, + { + "epoch": 0.44473, + "grad_norm": 0.7243805486629423, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 44473 + }, + { + "epoch": 0.44474, + "grad_norm": 0.6587750417638926, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 44474 + }, + { + "epoch": 0.44475, + "grad_norm": 0.6693272139863984, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 44475 + }, + { + "epoch": 0.44476, + "grad_norm": 0.7947395558170128, + "learning_rate": 0.003, + "loss": 4.023, + "step": 44476 + }, + { + "epoch": 0.44477, + "grad_norm": 0.827399470140345, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 44477 + }, + { + "epoch": 0.44478, + "grad_norm": 0.9036299808304072, + "learning_rate": 0.003, + "loss": 4.0534, + "step": 44478 + }, + { + "epoch": 0.44479, + "grad_norm": 0.8984965972536298, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 44479 + }, + { + "epoch": 0.4448, + "grad_norm": 0.9917947873975483, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 44480 + }, + { + "epoch": 0.44481, + "grad_norm": 1.0218171329411536, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 44481 + }, + { + "epoch": 0.44482, + "grad_norm": 0.9962873168150203, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 44482 + }, + { + "epoch": 0.44483, + "grad_norm": 0.9609262512466874, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 44483 + }, + { + "epoch": 0.44484, + "grad_norm": 0.8492974318707974, + "learning_rate": 0.003, + "loss": 4.064, + "step": 44484 + }, + { + "epoch": 0.44485, + "grad_norm": 0.7432625933932507, + "learning_rate": 0.003, + "loss": 4.007, + "step": 44485 + }, + { + "epoch": 0.44486, + "grad_norm": 0.7182670278982555, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 44486 + }, + { + "epoch": 0.44487, + "grad_norm": 0.8005212095870103, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 44487 + }, + { + "epoch": 0.44488, + "grad_norm": 0.7707869437636441, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 44488 + }, + { + "epoch": 0.44489, + "grad_norm": 0.7043574825767898, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 44489 + }, + { + "epoch": 0.4449, + "grad_norm": 0.7708012055345953, + "learning_rate": 0.003, + "loss": 4.03, + "step": 44490 + }, + { + "epoch": 0.44491, + "grad_norm": 0.8958427329760406, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 44491 + }, + { + "epoch": 0.44492, + "grad_norm": 0.8247050737969682, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 44492 + }, + { + "epoch": 0.44493, + "grad_norm": 0.7836265755980938, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 44493 + }, + { + "epoch": 0.44494, + "grad_norm": 0.6714266202241649, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 44494 + }, + { + "epoch": 0.44495, + "grad_norm": 0.6569126893801379, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 44495 + }, + { + "epoch": 0.44496, + "grad_norm": 0.561720728930449, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 44496 + }, + { + "epoch": 0.44497, + "grad_norm": 0.6562528360947205, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 44497 + }, + { + "epoch": 0.44498, + "grad_norm": 0.9070782918216275, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 44498 + }, + { + "epoch": 0.44499, + "grad_norm": 1.272370491867959, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 44499 + }, + { + "epoch": 0.445, + "grad_norm": 0.8722417405090689, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 44500 + }, + { + "epoch": 0.44501, + "grad_norm": 0.8338138244438342, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 44501 + }, + { + "epoch": 0.44502, + "grad_norm": 0.8390072755943975, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 44502 + }, + { + "epoch": 0.44503, + "grad_norm": 0.819753208825678, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 44503 + }, + { + "epoch": 0.44504, + "grad_norm": 0.8063471251922104, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 44504 + }, + { + "epoch": 0.44505, + "grad_norm": 0.8224928657858251, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 44505 + }, + { + "epoch": 0.44506, + "grad_norm": 0.8102742981242997, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 44506 + }, + { + "epoch": 0.44507, + "grad_norm": 0.8440659809370862, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 44507 + }, + { + "epoch": 0.44508, + "grad_norm": 0.9287647256391747, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 44508 + }, + { + "epoch": 0.44509, + "grad_norm": 1.0935604662164935, + "learning_rate": 0.003, + "loss": 4.023, + "step": 44509 + }, + { + "epoch": 0.4451, + "grad_norm": 1.1564202828137382, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 44510 + }, + { + "epoch": 0.44511, + "grad_norm": 0.8573207940990221, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 44511 + }, + { + "epoch": 0.44512, + "grad_norm": 0.8271854930265953, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 44512 + }, + { + "epoch": 0.44513, + "grad_norm": 0.8391733503574771, + "learning_rate": 0.003, + "loss": 4.017, + "step": 44513 + }, + { + "epoch": 0.44514, + "grad_norm": 0.8773740737997849, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 44514 + }, + { + "epoch": 0.44515, + "grad_norm": 0.98695418325065, + "learning_rate": 0.003, + "loss": 4.0798, + "step": 44515 + }, + { + "epoch": 0.44516, + "grad_norm": 1.0891732165944688, + "learning_rate": 0.003, + "loss": 4.0777, + "step": 44516 + }, + { + "epoch": 0.44517, + "grad_norm": 0.7649427885725664, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 44517 + }, + { + "epoch": 0.44518, + "grad_norm": 0.6638325206491937, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 44518 + }, + { + "epoch": 0.44519, + "grad_norm": 0.7282671030781203, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 44519 + }, + { + "epoch": 0.4452, + "grad_norm": 0.7557980166779684, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 44520 + }, + { + "epoch": 0.44521, + "grad_norm": 0.9090810659886286, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 44521 + }, + { + "epoch": 0.44522, + "grad_norm": 1.110848651574813, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 44522 + }, + { + "epoch": 0.44523, + "grad_norm": 0.8489955632756778, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 44523 + }, + { + "epoch": 0.44524, + "grad_norm": 0.7484718255837195, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 44524 + }, + { + "epoch": 0.44525, + "grad_norm": 0.7678188133803073, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 44525 + }, + { + "epoch": 0.44526, + "grad_norm": 0.9388187744369507, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 44526 + }, + { + "epoch": 0.44527, + "grad_norm": 1.117109200619637, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 44527 + }, + { + "epoch": 0.44528, + "grad_norm": 0.8046585357339253, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 44528 + }, + { + "epoch": 0.44529, + "grad_norm": 0.7449825163236233, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 44529 + }, + { + "epoch": 0.4453, + "grad_norm": 0.7714788436877261, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 44530 + }, + { + "epoch": 0.44531, + "grad_norm": 0.8669928678922368, + "learning_rate": 0.003, + "loss": 4.028, + "step": 44531 + }, + { + "epoch": 0.44532, + "grad_norm": 0.7845440843003068, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 44532 + }, + { + "epoch": 0.44533, + "grad_norm": 0.7986553914509367, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 44533 + }, + { + "epoch": 0.44534, + "grad_norm": 0.8008092446463162, + "learning_rate": 0.003, + "loss": 4.026, + "step": 44534 + }, + { + "epoch": 0.44535, + "grad_norm": 0.6530018264863205, + "learning_rate": 0.003, + "loss": 4.005, + "step": 44535 + }, + { + "epoch": 0.44536, + "grad_norm": 0.6369436477324366, + "learning_rate": 0.003, + "loss": 4.0676, + "step": 44536 + }, + { + "epoch": 0.44537, + "grad_norm": 0.5566120184508933, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 44537 + }, + { + "epoch": 0.44538, + "grad_norm": 0.5398619792048174, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 44538 + }, + { + "epoch": 0.44539, + "grad_norm": 0.508689760337075, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 44539 + }, + { + "epoch": 0.4454, + "grad_norm": 0.5898194324330902, + "learning_rate": 0.003, + "loss": 4.0023, + "step": 44540 + }, + { + "epoch": 0.44541, + "grad_norm": 0.6870110351579679, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 44541 + }, + { + "epoch": 0.44542, + "grad_norm": 0.7948293724959559, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 44542 + }, + { + "epoch": 0.44543, + "grad_norm": 0.9575158219154396, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 44543 + }, + { + "epoch": 0.44544, + "grad_norm": 1.2532786264165083, + "learning_rate": 0.003, + "loss": 4.043, + "step": 44544 + }, + { + "epoch": 0.44545, + "grad_norm": 0.8433747904736377, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 44545 + }, + { + "epoch": 0.44546, + "grad_norm": 0.7889136286883797, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 44546 + }, + { + "epoch": 0.44547, + "grad_norm": 0.718402192980922, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 44547 + }, + { + "epoch": 0.44548, + "grad_norm": 0.6671291688270261, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 44548 + }, + { + "epoch": 0.44549, + "grad_norm": 0.6525051792084945, + "learning_rate": 0.003, + "loss": 4.011, + "step": 44549 + }, + { + "epoch": 0.4455, + "grad_norm": 0.6935749373023029, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 44550 + }, + { + "epoch": 0.44551, + "grad_norm": 0.715320341150297, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 44551 + }, + { + "epoch": 0.44552, + "grad_norm": 0.774672901408266, + "learning_rate": 0.003, + "loss": 4.028, + "step": 44552 + }, + { + "epoch": 0.44553, + "grad_norm": 0.6713763885259659, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 44553 + }, + { + "epoch": 0.44554, + "grad_norm": 0.664722060477263, + "learning_rate": 0.003, + "loss": 4.025, + "step": 44554 + }, + { + "epoch": 0.44555, + "grad_norm": 0.6346300680402658, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 44555 + }, + { + "epoch": 0.44556, + "grad_norm": 0.6591869798585824, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 44556 + }, + { + "epoch": 0.44557, + "grad_norm": 0.7031059438330689, + "learning_rate": 0.003, + "loss": 4.0006, + "step": 44557 + }, + { + "epoch": 0.44558, + "grad_norm": 0.8621800318576796, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 44558 + }, + { + "epoch": 0.44559, + "grad_norm": 1.1543459614212126, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 44559 + }, + { + "epoch": 0.4456, + "grad_norm": 1.0521446792170843, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 44560 + }, + { + "epoch": 0.44561, + "grad_norm": 1.01090453930092, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 44561 + }, + { + "epoch": 0.44562, + "grad_norm": 0.9540533090544355, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 44562 + }, + { + "epoch": 0.44563, + "grad_norm": 1.0008371654825818, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 44563 + }, + { + "epoch": 0.44564, + "grad_norm": 0.9197228394081514, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 44564 + }, + { + "epoch": 0.44565, + "grad_norm": 0.9051809381619125, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 44565 + }, + { + "epoch": 0.44566, + "grad_norm": 0.822864639139161, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 44566 + }, + { + "epoch": 0.44567, + "grad_norm": 0.9383117795664673, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 44567 + }, + { + "epoch": 0.44568, + "grad_norm": 1.1458176737517036, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 44568 + }, + { + "epoch": 0.44569, + "grad_norm": 0.9404232698515718, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 44569 + }, + { + "epoch": 0.4457, + "grad_norm": 0.8801311207130535, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 44570 + }, + { + "epoch": 0.44571, + "grad_norm": 0.9357504011228243, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 44571 + }, + { + "epoch": 0.44572, + "grad_norm": 0.9539206334645222, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 44572 + }, + { + "epoch": 0.44573, + "grad_norm": 0.9354035214532082, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 44573 + }, + { + "epoch": 0.44574, + "grad_norm": 0.9124707124621058, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 44574 + }, + { + "epoch": 0.44575, + "grad_norm": 0.9657621042690653, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 44575 + }, + { + "epoch": 0.44576, + "grad_norm": 0.8506832778481718, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 44576 + }, + { + "epoch": 0.44577, + "grad_norm": 0.8260578805830986, + "learning_rate": 0.003, + "loss": 4.027, + "step": 44577 + }, + { + "epoch": 0.44578, + "grad_norm": 0.968334221520941, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 44578 + }, + { + "epoch": 0.44579, + "grad_norm": 1.1164743263490642, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 44579 + }, + { + "epoch": 0.4458, + "grad_norm": 0.9603920417614097, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 44580 + }, + { + "epoch": 0.44581, + "grad_norm": 0.8180731069578177, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 44581 + }, + { + "epoch": 0.44582, + "grad_norm": 0.8599645159920881, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 44582 + }, + { + "epoch": 0.44583, + "grad_norm": 0.956775404051775, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 44583 + }, + { + "epoch": 0.44584, + "grad_norm": 0.9674296211494148, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 44584 + }, + { + "epoch": 0.44585, + "grad_norm": 0.9460751863525836, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 44585 + }, + { + "epoch": 0.44586, + "grad_norm": 0.89227083131111, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 44586 + }, + { + "epoch": 0.44587, + "grad_norm": 0.6939311878812481, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 44587 + }, + { + "epoch": 0.44588, + "grad_norm": 0.6362563962025708, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 44588 + }, + { + "epoch": 0.44589, + "grad_norm": 0.6312938821202664, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 44589 + }, + { + "epoch": 0.4459, + "grad_norm": 0.6772303588117377, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 44590 + }, + { + "epoch": 0.44591, + "grad_norm": 0.6396760890657345, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 44591 + }, + { + "epoch": 0.44592, + "grad_norm": 0.5477656371706471, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 44592 + }, + { + "epoch": 0.44593, + "grad_norm": 0.5552617964964982, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 44593 + }, + { + "epoch": 0.44594, + "grad_norm": 0.5332275029774938, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 44594 + }, + { + "epoch": 0.44595, + "grad_norm": 0.5372514848334573, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 44595 + }, + { + "epoch": 0.44596, + "grad_norm": 0.5808012553061069, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 44596 + }, + { + "epoch": 0.44597, + "grad_norm": 0.6633325202895426, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 44597 + }, + { + "epoch": 0.44598, + "grad_norm": 0.7534645714413957, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 44598 + }, + { + "epoch": 0.44599, + "grad_norm": 0.7687638151518503, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 44599 + }, + { + "epoch": 0.446, + "grad_norm": 0.7519096275895053, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 44600 + }, + { + "epoch": 0.44601, + "grad_norm": 0.761387212767351, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 44601 + }, + { + "epoch": 0.44602, + "grad_norm": 0.7695824363485888, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 44602 + }, + { + "epoch": 0.44603, + "grad_norm": 0.6659608561088015, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 44603 + }, + { + "epoch": 0.44604, + "grad_norm": 0.7947256928734998, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 44604 + }, + { + "epoch": 0.44605, + "grad_norm": 0.9570757758358235, + "learning_rate": 0.003, + "loss": 4.028, + "step": 44605 + }, + { + "epoch": 0.44606, + "grad_norm": 1.2773483439325806, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 44606 + }, + { + "epoch": 0.44607, + "grad_norm": 1.049312606460707, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 44607 + }, + { + "epoch": 0.44608, + "grad_norm": 0.8960469912886647, + "learning_rate": 0.003, + "loss": 4.0594, + "step": 44608 + }, + { + "epoch": 0.44609, + "grad_norm": 0.7975179727492022, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 44609 + }, + { + "epoch": 0.4461, + "grad_norm": 0.7146710115666751, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 44610 + }, + { + "epoch": 0.44611, + "grad_norm": 0.7656864867399236, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 44611 + }, + { + "epoch": 0.44612, + "grad_norm": 0.8398362900044125, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 44612 + }, + { + "epoch": 0.44613, + "grad_norm": 0.9091946912775306, + "learning_rate": 0.003, + "loss": 4.012, + "step": 44613 + }, + { + "epoch": 0.44614, + "grad_norm": 0.8338849815086935, + "learning_rate": 0.003, + "loss": 4.0559, + "step": 44614 + }, + { + "epoch": 0.44615, + "grad_norm": 0.8733308091792182, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 44615 + }, + { + "epoch": 0.44616, + "grad_norm": 0.964613962949843, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 44616 + }, + { + "epoch": 0.44617, + "grad_norm": 1.1015728755366003, + "learning_rate": 0.003, + "loss": 4.018, + "step": 44617 + }, + { + "epoch": 0.44618, + "grad_norm": 1.1057421385691324, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 44618 + }, + { + "epoch": 0.44619, + "grad_norm": 0.8394498600911608, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 44619 + }, + { + "epoch": 0.4462, + "grad_norm": 0.6729369440917192, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 44620 + }, + { + "epoch": 0.44621, + "grad_norm": 0.6624350496947029, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 44621 + }, + { + "epoch": 0.44622, + "grad_norm": 0.6992548626972774, + "learning_rate": 0.003, + "loss": 4.042, + "step": 44622 + }, + { + "epoch": 0.44623, + "grad_norm": 0.6933448355273407, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 44623 + }, + { + "epoch": 0.44624, + "grad_norm": 0.6516738323297501, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 44624 + }, + { + "epoch": 0.44625, + "grad_norm": 0.7323437414342271, + "learning_rate": 0.003, + "loss": 4.036, + "step": 44625 + }, + { + "epoch": 0.44626, + "grad_norm": 0.7308724582703929, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 44626 + }, + { + "epoch": 0.44627, + "grad_norm": 0.6699138082029564, + "learning_rate": 0.003, + "loss": 4.033, + "step": 44627 + }, + { + "epoch": 0.44628, + "grad_norm": 0.6738423752043015, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 44628 + }, + { + "epoch": 0.44629, + "grad_norm": 0.6465414723574364, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 44629 + }, + { + "epoch": 0.4463, + "grad_norm": 0.6781552332300497, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 44630 + }, + { + "epoch": 0.44631, + "grad_norm": 0.8367162783912487, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 44631 + }, + { + "epoch": 0.44632, + "grad_norm": 1.1489622454809556, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 44632 + }, + { + "epoch": 0.44633, + "grad_norm": 1.037493059670564, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 44633 + }, + { + "epoch": 0.44634, + "grad_norm": 1.1309575282415985, + "learning_rate": 0.003, + "loss": 4.0005, + "step": 44634 + }, + { + "epoch": 0.44635, + "grad_norm": 0.8284361011098877, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 44635 + }, + { + "epoch": 0.44636, + "grad_norm": 0.6437008984284098, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 44636 + }, + { + "epoch": 0.44637, + "grad_norm": 0.7442671098437439, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 44637 + }, + { + "epoch": 0.44638, + "grad_norm": 0.8208922869073825, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 44638 + }, + { + "epoch": 0.44639, + "grad_norm": 0.8885230712207443, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 44639 + }, + { + "epoch": 0.4464, + "grad_norm": 0.8499967455598809, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 44640 + }, + { + "epoch": 0.44641, + "grad_norm": 0.8770104496402833, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 44641 + }, + { + "epoch": 0.44642, + "grad_norm": 0.900497471499688, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 44642 + }, + { + "epoch": 0.44643, + "grad_norm": 0.8144160140946995, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 44643 + }, + { + "epoch": 0.44644, + "grad_norm": 0.8335486324866953, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 44644 + }, + { + "epoch": 0.44645, + "grad_norm": 0.9048913745114943, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 44645 + }, + { + "epoch": 0.44646, + "grad_norm": 0.9359910566424117, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 44646 + }, + { + "epoch": 0.44647, + "grad_norm": 0.9537868117020513, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 44647 + }, + { + "epoch": 0.44648, + "grad_norm": 1.0010885606841153, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 44648 + }, + { + "epoch": 0.44649, + "grad_norm": 1.1202269104468545, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 44649 + }, + { + "epoch": 0.4465, + "grad_norm": 0.824430895133488, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 44650 + }, + { + "epoch": 0.44651, + "grad_norm": 0.7804648512875191, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 44651 + }, + { + "epoch": 0.44652, + "grad_norm": 0.7679444969538227, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 44652 + }, + { + "epoch": 0.44653, + "grad_norm": 0.8463838652239023, + "learning_rate": 0.003, + "loss": 4.025, + "step": 44653 + }, + { + "epoch": 0.44654, + "grad_norm": 1.011696410124887, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 44654 + }, + { + "epoch": 0.44655, + "grad_norm": 1.124076168663328, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 44655 + }, + { + "epoch": 0.44656, + "grad_norm": 0.8533682191671929, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 44656 + }, + { + "epoch": 0.44657, + "grad_norm": 0.8985032184906039, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 44657 + }, + { + "epoch": 0.44658, + "grad_norm": 0.9863545666123672, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 44658 + }, + { + "epoch": 0.44659, + "grad_norm": 1.1142705570317037, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 44659 + }, + { + "epoch": 0.4466, + "grad_norm": 0.90436327582856, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 44660 + }, + { + "epoch": 0.44661, + "grad_norm": 0.9684410539013822, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 44661 + }, + { + "epoch": 0.44662, + "grad_norm": 0.9407712384086099, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 44662 + }, + { + "epoch": 0.44663, + "grad_norm": 0.8209931921075292, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 44663 + }, + { + "epoch": 0.44664, + "grad_norm": 0.7720216923818023, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 44664 + }, + { + "epoch": 0.44665, + "grad_norm": 0.7301556067974184, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 44665 + }, + { + "epoch": 0.44666, + "grad_norm": 0.7081858563406195, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 44666 + }, + { + "epoch": 0.44667, + "grad_norm": 0.6880083040753293, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 44667 + }, + { + "epoch": 0.44668, + "grad_norm": 0.6573230206282666, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 44668 + }, + { + "epoch": 0.44669, + "grad_norm": 0.6643395480355485, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 44669 + }, + { + "epoch": 0.4467, + "grad_norm": 0.6506323993399027, + "learning_rate": 0.003, + "loss": 3.9947, + "step": 44670 + }, + { + "epoch": 0.44671, + "grad_norm": 0.7245444858673981, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 44671 + }, + { + "epoch": 0.44672, + "grad_norm": 0.8935767683364999, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 44672 + }, + { + "epoch": 0.44673, + "grad_norm": 1.0835804651417094, + "learning_rate": 0.003, + "loss": 4.06, + "step": 44673 + }, + { + "epoch": 0.44674, + "grad_norm": 0.8340056269574022, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 44674 + }, + { + "epoch": 0.44675, + "grad_norm": 0.6338568390196635, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 44675 + }, + { + "epoch": 0.44676, + "grad_norm": 0.5836840811703513, + "learning_rate": 0.003, + "loss": 4.0076, + "step": 44676 + }, + { + "epoch": 0.44677, + "grad_norm": 0.55984562810515, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 44677 + }, + { + "epoch": 0.44678, + "grad_norm": 0.5941446992137729, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 44678 + }, + { + "epoch": 0.44679, + "grad_norm": 0.6088983874351144, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 44679 + }, + { + "epoch": 0.4468, + "grad_norm": 0.5502123479768842, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 44680 + }, + { + "epoch": 0.44681, + "grad_norm": 0.5263823496601527, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 44681 + }, + { + "epoch": 0.44682, + "grad_norm": 0.626310878639171, + "learning_rate": 0.003, + "loss": 4.05, + "step": 44682 + }, + { + "epoch": 0.44683, + "grad_norm": 0.750153820519921, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 44683 + }, + { + "epoch": 0.44684, + "grad_norm": 0.8155526306451775, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 44684 + }, + { + "epoch": 0.44685, + "grad_norm": 0.8575382483777702, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 44685 + }, + { + "epoch": 0.44686, + "grad_norm": 0.9714260637851004, + "learning_rate": 0.003, + "loss": 4.002, + "step": 44686 + }, + { + "epoch": 0.44687, + "grad_norm": 1.219722845150763, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 44687 + }, + { + "epoch": 0.44688, + "grad_norm": 0.9099437534322463, + "learning_rate": 0.003, + "loss": 3.9952, + "step": 44688 + }, + { + "epoch": 0.44689, + "grad_norm": 0.9113106679608132, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 44689 + }, + { + "epoch": 0.4469, + "grad_norm": 0.9527072830599232, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 44690 + }, + { + "epoch": 0.44691, + "grad_norm": 0.9736422948768619, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 44691 + }, + { + "epoch": 0.44692, + "grad_norm": 1.0612644212652647, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 44692 + }, + { + "epoch": 0.44693, + "grad_norm": 0.9770380207970472, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 44693 + }, + { + "epoch": 0.44694, + "grad_norm": 1.0032021172644658, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 44694 + }, + { + "epoch": 0.44695, + "grad_norm": 0.9492557841642619, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 44695 + }, + { + "epoch": 0.44696, + "grad_norm": 0.9868442669940389, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 44696 + }, + { + "epoch": 0.44697, + "grad_norm": 0.86670848044439, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 44697 + }, + { + "epoch": 0.44698, + "grad_norm": 0.876162251717411, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 44698 + }, + { + "epoch": 0.44699, + "grad_norm": 0.8591149744412047, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 44699 + }, + { + "epoch": 0.447, + "grad_norm": 0.8101095544902989, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 44700 + }, + { + "epoch": 0.44701, + "grad_norm": 0.7692065930923424, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 44701 + }, + { + "epoch": 0.44702, + "grad_norm": 0.8771122161791809, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 44702 + }, + { + "epoch": 0.44703, + "grad_norm": 0.9552545809599782, + "learning_rate": 0.003, + "loss": 4.0625, + "step": 44703 + }, + { + "epoch": 0.44704, + "grad_norm": 1.0238799508024468, + "learning_rate": 0.003, + "loss": 4.047, + "step": 44704 + }, + { + "epoch": 0.44705, + "grad_norm": 0.8745310685513004, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 44705 + }, + { + "epoch": 0.44706, + "grad_norm": 0.8872187791730856, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 44706 + }, + { + "epoch": 0.44707, + "grad_norm": 0.9314084466929016, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 44707 + }, + { + "epoch": 0.44708, + "grad_norm": 0.9762913286624542, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 44708 + }, + { + "epoch": 0.44709, + "grad_norm": 0.8820006373737364, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 44709 + }, + { + "epoch": 0.4471, + "grad_norm": 0.8601939978720758, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 44710 + }, + { + "epoch": 0.44711, + "grad_norm": 0.9566757212703275, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 44711 + }, + { + "epoch": 0.44712, + "grad_norm": 1.3511998545247093, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 44712 + }, + { + "epoch": 0.44713, + "grad_norm": 0.968470421281923, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 44713 + }, + { + "epoch": 0.44714, + "grad_norm": 0.8776728754203985, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 44714 + }, + { + "epoch": 0.44715, + "grad_norm": 0.7898020765048238, + "learning_rate": 0.003, + "loss": 4.052, + "step": 44715 + }, + { + "epoch": 0.44716, + "grad_norm": 0.7328774753888634, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 44716 + }, + { + "epoch": 0.44717, + "grad_norm": 0.7476047989861494, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 44717 + }, + { + "epoch": 0.44718, + "grad_norm": 0.7448866888842194, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 44718 + }, + { + "epoch": 0.44719, + "grad_norm": 0.782720804613381, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 44719 + }, + { + "epoch": 0.4472, + "grad_norm": 0.9150016449831777, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 44720 + }, + { + "epoch": 0.44721, + "grad_norm": 0.9567378589740139, + "learning_rate": 0.003, + "loss": 4.04, + "step": 44721 + }, + { + "epoch": 0.44722, + "grad_norm": 0.9710984191811274, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 44722 + }, + { + "epoch": 0.44723, + "grad_norm": 0.9761116157200964, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 44723 + }, + { + "epoch": 0.44724, + "grad_norm": 0.8122607194020821, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 44724 + }, + { + "epoch": 0.44725, + "grad_norm": 0.6360609186255204, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 44725 + }, + { + "epoch": 0.44726, + "grad_norm": 0.588448398845036, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 44726 + }, + { + "epoch": 0.44727, + "grad_norm": 0.5932498622799682, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 44727 + }, + { + "epoch": 0.44728, + "grad_norm": 0.7363498287983129, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 44728 + }, + { + "epoch": 0.44729, + "grad_norm": 0.7679945536995891, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 44729 + }, + { + "epoch": 0.4473, + "grad_norm": 0.6934931009726961, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 44730 + }, + { + "epoch": 0.44731, + "grad_norm": 0.7150907480159111, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 44731 + }, + { + "epoch": 0.44732, + "grad_norm": 0.839459568772688, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 44732 + }, + { + "epoch": 0.44733, + "grad_norm": 1.0574578910314711, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 44733 + }, + { + "epoch": 0.44734, + "grad_norm": 1.0063311333867417, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 44734 + }, + { + "epoch": 0.44735, + "grad_norm": 0.8804275261775152, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 44735 + }, + { + "epoch": 0.44736, + "grad_norm": 0.7735243876100613, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 44736 + }, + { + "epoch": 0.44737, + "grad_norm": 0.7834186485020166, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 44737 + }, + { + "epoch": 0.44738, + "grad_norm": 0.862778565099324, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 44738 + }, + { + "epoch": 0.44739, + "grad_norm": 0.822178297287686, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 44739 + }, + { + "epoch": 0.4474, + "grad_norm": 0.8118151379628966, + "learning_rate": 0.003, + "loss": 4.024, + "step": 44740 + }, + { + "epoch": 0.44741, + "grad_norm": 0.7733730723530483, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 44741 + }, + { + "epoch": 0.44742, + "grad_norm": 0.6314264542533291, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 44742 + }, + { + "epoch": 0.44743, + "grad_norm": 0.6478470541587057, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 44743 + }, + { + "epoch": 0.44744, + "grad_norm": 0.7812457263456668, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 44744 + }, + { + "epoch": 0.44745, + "grad_norm": 0.9680963319226537, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 44745 + }, + { + "epoch": 0.44746, + "grad_norm": 0.9550463851103254, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 44746 + }, + { + "epoch": 0.44747, + "grad_norm": 0.751115617124285, + "learning_rate": 0.003, + "loss": 3.9985, + "step": 44747 + }, + { + "epoch": 0.44748, + "grad_norm": 0.7385902618246146, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 44748 + }, + { + "epoch": 0.44749, + "grad_norm": 0.7457225727724325, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 44749 + }, + { + "epoch": 0.4475, + "grad_norm": 0.7740836628814013, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 44750 + }, + { + "epoch": 0.44751, + "grad_norm": 0.6898947354997359, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 44751 + }, + { + "epoch": 0.44752, + "grad_norm": 0.6631877386655087, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 44752 + }, + { + "epoch": 0.44753, + "grad_norm": 0.6669253989695083, + "learning_rate": 0.003, + "loss": 3.9906, + "step": 44753 + }, + { + "epoch": 0.44754, + "grad_norm": 0.652639376274079, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 44754 + }, + { + "epoch": 0.44755, + "grad_norm": 0.7375014463061915, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 44755 + }, + { + "epoch": 0.44756, + "grad_norm": 0.9660390382864958, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 44756 + }, + { + "epoch": 0.44757, + "grad_norm": 1.2114542115397398, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 44757 + }, + { + "epoch": 0.44758, + "grad_norm": 0.8132273943802548, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 44758 + }, + { + "epoch": 0.44759, + "grad_norm": 0.735975682887762, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 44759 + }, + { + "epoch": 0.4476, + "grad_norm": 0.6959287608334418, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 44760 + }, + { + "epoch": 0.44761, + "grad_norm": 0.6462384382052208, + "learning_rate": 0.003, + "loss": 4.016, + "step": 44761 + }, + { + "epoch": 0.44762, + "grad_norm": 0.7085936750585804, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 44762 + }, + { + "epoch": 0.44763, + "grad_norm": 0.7408677345522746, + "learning_rate": 0.003, + "loss": 3.9919, + "step": 44763 + }, + { + "epoch": 0.44764, + "grad_norm": 0.7736123422639773, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 44764 + }, + { + "epoch": 0.44765, + "grad_norm": 0.8019022133323822, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 44765 + }, + { + "epoch": 0.44766, + "grad_norm": 0.9041520359913473, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 44766 + }, + { + "epoch": 0.44767, + "grad_norm": 1.0261969391432604, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 44767 + }, + { + "epoch": 0.44768, + "grad_norm": 0.9543766108834087, + "learning_rate": 0.003, + "loss": 4.0576, + "step": 44768 + }, + { + "epoch": 0.44769, + "grad_norm": 0.8775057006396733, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 44769 + }, + { + "epoch": 0.4477, + "grad_norm": 0.7584545774986585, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 44770 + }, + { + "epoch": 0.44771, + "grad_norm": 0.8243403373622902, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 44771 + }, + { + "epoch": 0.44772, + "grad_norm": 0.9068372540896776, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 44772 + }, + { + "epoch": 0.44773, + "grad_norm": 1.0815996244610304, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 44773 + }, + { + "epoch": 0.44774, + "grad_norm": 1.077103515616508, + "learning_rate": 0.003, + "loss": 4.019, + "step": 44774 + }, + { + "epoch": 0.44775, + "grad_norm": 0.9903275169734536, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 44775 + }, + { + "epoch": 0.44776, + "grad_norm": 0.9194990902024092, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 44776 + }, + { + "epoch": 0.44777, + "grad_norm": 0.9925539479214095, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 44777 + }, + { + "epoch": 0.44778, + "grad_norm": 1.0068232468200584, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 44778 + }, + { + "epoch": 0.44779, + "grad_norm": 1.0209568338401152, + "learning_rate": 0.003, + "loss": 4.0617, + "step": 44779 + }, + { + "epoch": 0.4478, + "grad_norm": 0.8998354636202068, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 44780 + }, + { + "epoch": 0.44781, + "grad_norm": 1.0114014299526517, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 44781 + }, + { + "epoch": 0.44782, + "grad_norm": 1.1333628947360463, + "learning_rate": 0.003, + "loss": 4.0654, + "step": 44782 + }, + { + "epoch": 0.44783, + "grad_norm": 0.9316411392811551, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 44783 + }, + { + "epoch": 0.44784, + "grad_norm": 0.8658386177096226, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 44784 + }, + { + "epoch": 0.44785, + "grad_norm": 0.7943512754413585, + "learning_rate": 0.003, + "loss": 4.0685, + "step": 44785 + }, + { + "epoch": 0.44786, + "grad_norm": 0.8560968885209572, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 44786 + }, + { + "epoch": 0.44787, + "grad_norm": 0.7848399981281378, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 44787 + }, + { + "epoch": 0.44788, + "grad_norm": 0.7353641073085956, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 44788 + }, + { + "epoch": 0.44789, + "grad_norm": 0.7339881550758733, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 44789 + }, + { + "epoch": 0.4479, + "grad_norm": 0.6765503706470835, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 44790 + }, + { + "epoch": 0.44791, + "grad_norm": 0.6362808164878248, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 44791 + }, + { + "epoch": 0.44792, + "grad_norm": 0.6288843268115318, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 44792 + }, + { + "epoch": 0.44793, + "grad_norm": 0.6608670473132926, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 44793 + }, + { + "epoch": 0.44794, + "grad_norm": 0.6860021546998651, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 44794 + }, + { + "epoch": 0.44795, + "grad_norm": 0.8076003530584935, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 44795 + }, + { + "epoch": 0.44796, + "grad_norm": 0.9866788686569297, + "learning_rate": 0.003, + "loss": 4.029, + "step": 44796 + }, + { + "epoch": 0.44797, + "grad_norm": 1.0484936009206354, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 44797 + }, + { + "epoch": 0.44798, + "grad_norm": 0.8747305167127858, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 44798 + }, + { + "epoch": 0.44799, + "grad_norm": 0.859797466629834, + "learning_rate": 0.003, + "loss": 4.0696, + "step": 44799 + }, + { + "epoch": 0.448, + "grad_norm": 0.9307572325375703, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 44800 + }, + { + "epoch": 0.44801, + "grad_norm": 0.9914879318745691, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 44801 + }, + { + "epoch": 0.44802, + "grad_norm": 0.9953230255373563, + "learning_rate": 0.003, + "loss": 4.026, + "step": 44802 + }, + { + "epoch": 0.44803, + "grad_norm": 0.9464814848885789, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 44803 + }, + { + "epoch": 0.44804, + "grad_norm": 0.9999581508152733, + "learning_rate": 0.003, + "loss": 3.9864, + "step": 44804 + }, + { + "epoch": 0.44805, + "grad_norm": 1.1252927411118236, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 44805 + }, + { + "epoch": 0.44806, + "grad_norm": 1.050002739459309, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 44806 + }, + { + "epoch": 0.44807, + "grad_norm": 1.0801215165084637, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 44807 + }, + { + "epoch": 0.44808, + "grad_norm": 0.84466118092812, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 44808 + }, + { + "epoch": 0.44809, + "grad_norm": 0.7218929925638032, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 44809 + }, + { + "epoch": 0.4481, + "grad_norm": 0.6467591955703084, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 44810 + }, + { + "epoch": 0.44811, + "grad_norm": 0.4754260663160637, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 44811 + }, + { + "epoch": 0.44812, + "grad_norm": 0.43394469303093297, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 44812 + }, + { + "epoch": 0.44813, + "grad_norm": 0.47343124221509303, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 44813 + }, + { + "epoch": 0.44814, + "grad_norm": 0.49878317753658147, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 44814 + }, + { + "epoch": 0.44815, + "grad_norm": 0.5400946218114385, + "learning_rate": 0.003, + "loss": 4.0061, + "step": 44815 + }, + { + "epoch": 0.44816, + "grad_norm": 0.6038504628778975, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 44816 + }, + { + "epoch": 0.44817, + "grad_norm": 0.6378751625030677, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 44817 + }, + { + "epoch": 0.44818, + "grad_norm": 0.6876355672983605, + "learning_rate": 0.003, + "loss": 4.0031, + "step": 44818 + }, + { + "epoch": 0.44819, + "grad_norm": 0.8109634242027943, + "learning_rate": 0.003, + "loss": 3.9777, + "step": 44819 + }, + { + "epoch": 0.4482, + "grad_norm": 1.0059773031734411, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 44820 + }, + { + "epoch": 0.44821, + "grad_norm": 1.008698117895592, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 44821 + }, + { + "epoch": 0.44822, + "grad_norm": 0.8554877520707824, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 44822 + }, + { + "epoch": 0.44823, + "grad_norm": 0.8394491033319782, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 44823 + }, + { + "epoch": 0.44824, + "grad_norm": 0.8943265293837417, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 44824 + }, + { + "epoch": 0.44825, + "grad_norm": 0.8709284551242237, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 44825 + }, + { + "epoch": 0.44826, + "grad_norm": 0.9193871701477544, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 44826 + }, + { + "epoch": 0.44827, + "grad_norm": 0.9854251881895169, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 44827 + }, + { + "epoch": 0.44828, + "grad_norm": 0.9626789491699206, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 44828 + }, + { + "epoch": 0.44829, + "grad_norm": 1.0368620706238496, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 44829 + }, + { + "epoch": 0.4483, + "grad_norm": 0.9385323944931043, + "learning_rate": 0.003, + "loss": 4.019, + "step": 44830 + }, + { + "epoch": 0.44831, + "grad_norm": 0.9728887765562884, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 44831 + }, + { + "epoch": 0.44832, + "grad_norm": 0.9084230534534552, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 44832 + }, + { + "epoch": 0.44833, + "grad_norm": 0.8568962060192169, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 44833 + }, + { + "epoch": 0.44834, + "grad_norm": 0.7462057170636273, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 44834 + }, + { + "epoch": 0.44835, + "grad_norm": 0.7361179194813576, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 44835 + }, + { + "epoch": 0.44836, + "grad_norm": 0.6515738000280498, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 44836 + }, + { + "epoch": 0.44837, + "grad_norm": 0.6448630963535875, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 44837 + }, + { + "epoch": 0.44838, + "grad_norm": 0.740225462155164, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 44838 + }, + { + "epoch": 0.44839, + "grad_norm": 0.834881867226871, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 44839 + }, + { + "epoch": 0.4484, + "grad_norm": 0.8865619564298144, + "learning_rate": 0.003, + "loss": 4.005, + "step": 44840 + }, + { + "epoch": 0.44841, + "grad_norm": 0.9752131293368917, + "learning_rate": 0.003, + "loss": 4.011, + "step": 44841 + }, + { + "epoch": 0.44842, + "grad_norm": 1.1180707509991865, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 44842 + }, + { + "epoch": 0.44843, + "grad_norm": 0.7968075608783759, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 44843 + }, + { + "epoch": 0.44844, + "grad_norm": 0.708048205050024, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 44844 + }, + { + "epoch": 0.44845, + "grad_norm": 0.7146765434130444, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 44845 + }, + { + "epoch": 0.44846, + "grad_norm": 0.7242451342393925, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 44846 + }, + { + "epoch": 0.44847, + "grad_norm": 0.7719777632256417, + "learning_rate": 0.003, + "loss": 4.0001, + "step": 44847 + }, + { + "epoch": 0.44848, + "grad_norm": 0.825871688328905, + "learning_rate": 0.003, + "loss": 3.9945, + "step": 44848 + }, + { + "epoch": 0.44849, + "grad_norm": 0.8766392993990325, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 44849 + }, + { + "epoch": 0.4485, + "grad_norm": 0.8786405852425374, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 44850 + }, + { + "epoch": 0.44851, + "grad_norm": 0.8333718886267516, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 44851 + }, + { + "epoch": 0.44852, + "grad_norm": 0.8575502771456025, + "learning_rate": 0.003, + "loss": 4.033, + "step": 44852 + }, + { + "epoch": 0.44853, + "grad_norm": 0.8414278955800919, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 44853 + }, + { + "epoch": 0.44854, + "grad_norm": 0.7742621514914932, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 44854 + }, + { + "epoch": 0.44855, + "grad_norm": 0.7966308577990513, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 44855 + }, + { + "epoch": 0.44856, + "grad_norm": 0.9687231514327413, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 44856 + }, + { + "epoch": 0.44857, + "grad_norm": 1.2358486781056661, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 44857 + }, + { + "epoch": 0.44858, + "grad_norm": 0.7060617499033699, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 44858 + }, + { + "epoch": 0.44859, + "grad_norm": 0.7378025762690514, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 44859 + }, + { + "epoch": 0.4486, + "grad_norm": 0.8912452657988272, + "learning_rate": 0.003, + "loss": 4.037, + "step": 44860 + }, + { + "epoch": 0.44861, + "grad_norm": 1.0219585377434635, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 44861 + }, + { + "epoch": 0.44862, + "grad_norm": 0.9673576851172021, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 44862 + }, + { + "epoch": 0.44863, + "grad_norm": 0.9621417316457712, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 44863 + }, + { + "epoch": 0.44864, + "grad_norm": 0.9592853917252309, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 44864 + }, + { + "epoch": 0.44865, + "grad_norm": 0.997543232174904, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 44865 + }, + { + "epoch": 0.44866, + "grad_norm": 0.8976360136675571, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 44866 + }, + { + "epoch": 0.44867, + "grad_norm": 0.9022946881887519, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 44867 + }, + { + "epoch": 0.44868, + "grad_norm": 0.926665950155882, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 44868 + }, + { + "epoch": 0.44869, + "grad_norm": 0.7902759438204565, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 44869 + }, + { + "epoch": 0.4487, + "grad_norm": 0.8064622823081059, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 44870 + }, + { + "epoch": 0.44871, + "grad_norm": 0.8958661061170384, + "learning_rate": 0.003, + "loss": 4.002, + "step": 44871 + }, + { + "epoch": 0.44872, + "grad_norm": 1.0668348001250714, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 44872 + }, + { + "epoch": 0.44873, + "grad_norm": 1.080524264233058, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 44873 + }, + { + "epoch": 0.44874, + "grad_norm": 0.8442599478404975, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 44874 + }, + { + "epoch": 0.44875, + "grad_norm": 0.7796246451823835, + "learning_rate": 0.003, + "loss": 3.9947, + "step": 44875 + }, + { + "epoch": 0.44876, + "grad_norm": 0.7874097184247324, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 44876 + }, + { + "epoch": 0.44877, + "grad_norm": 0.7439935983993814, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 44877 + }, + { + "epoch": 0.44878, + "grad_norm": 0.8058235777098997, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 44878 + }, + { + "epoch": 0.44879, + "grad_norm": 0.714762839462053, + "learning_rate": 0.003, + "loss": 4.034, + "step": 44879 + }, + { + "epoch": 0.4488, + "grad_norm": 0.6495966054041314, + "learning_rate": 0.003, + "loss": 4.016, + "step": 44880 + }, + { + "epoch": 0.44881, + "grad_norm": 0.7799842105287088, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 44881 + }, + { + "epoch": 0.44882, + "grad_norm": 0.7719547659161285, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 44882 + }, + { + "epoch": 0.44883, + "grad_norm": 0.66170505617438, + "learning_rate": 0.003, + "loss": 3.9801, + "step": 44883 + }, + { + "epoch": 0.44884, + "grad_norm": 0.565143084317199, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 44884 + }, + { + "epoch": 0.44885, + "grad_norm": 0.5631120999531664, + "learning_rate": 0.003, + "loss": 3.9952, + "step": 44885 + }, + { + "epoch": 0.44886, + "grad_norm": 0.6008465904841562, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 44886 + }, + { + "epoch": 0.44887, + "grad_norm": 0.6257786172889848, + "learning_rate": 0.003, + "loss": 4.0019, + "step": 44887 + }, + { + "epoch": 0.44888, + "grad_norm": 0.586505859922275, + "learning_rate": 0.003, + "loss": 3.9979, + "step": 44888 + }, + { + "epoch": 0.44889, + "grad_norm": 0.6079957782348907, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 44889 + }, + { + "epoch": 0.4489, + "grad_norm": 0.6464508267517043, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 44890 + }, + { + "epoch": 0.44891, + "grad_norm": 0.6174188154377247, + "learning_rate": 0.003, + "loss": 4.002, + "step": 44891 + }, + { + "epoch": 0.44892, + "grad_norm": 0.6498757321768115, + "learning_rate": 0.003, + "loss": 3.9909, + "step": 44892 + }, + { + "epoch": 0.44893, + "grad_norm": 0.9239913302105206, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 44893 + }, + { + "epoch": 0.44894, + "grad_norm": 1.2129112860471312, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 44894 + }, + { + "epoch": 0.44895, + "grad_norm": 0.9300699306056269, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 44895 + }, + { + "epoch": 0.44896, + "grad_norm": 0.7819451485320231, + "learning_rate": 0.003, + "loss": 3.9822, + "step": 44896 + }, + { + "epoch": 0.44897, + "grad_norm": 0.760575317143938, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 44897 + }, + { + "epoch": 0.44898, + "grad_norm": 0.8355528527945806, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 44898 + }, + { + "epoch": 0.44899, + "grad_norm": 0.8677118196774278, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 44899 + }, + { + "epoch": 0.449, + "grad_norm": 0.8311436739871404, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 44900 + }, + { + "epoch": 0.44901, + "grad_norm": 0.8151744007963929, + "learning_rate": 0.003, + "loss": 4.033, + "step": 44901 + }, + { + "epoch": 0.44902, + "grad_norm": 0.8579982505197433, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 44902 + }, + { + "epoch": 0.44903, + "grad_norm": 0.8026640621137946, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 44903 + }, + { + "epoch": 0.44904, + "grad_norm": 0.8058193977904614, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 44904 + }, + { + "epoch": 0.44905, + "grad_norm": 0.8854558614155198, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 44905 + }, + { + "epoch": 0.44906, + "grad_norm": 0.9457341322796345, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 44906 + }, + { + "epoch": 0.44907, + "grad_norm": 0.9480206231793938, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 44907 + }, + { + "epoch": 0.44908, + "grad_norm": 1.1523163389832194, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 44908 + }, + { + "epoch": 0.44909, + "grad_norm": 0.9777337474655865, + "learning_rate": 0.003, + "loss": 4.031, + "step": 44909 + }, + { + "epoch": 0.4491, + "grad_norm": 0.8179466011786513, + "learning_rate": 0.003, + "loss": 4.037, + "step": 44910 + }, + { + "epoch": 0.44911, + "grad_norm": 0.7830720463408173, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 44911 + }, + { + "epoch": 0.44912, + "grad_norm": 0.9349227590100841, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 44912 + }, + { + "epoch": 0.44913, + "grad_norm": 1.2676693701948132, + "learning_rate": 0.003, + "loss": 4.0627, + "step": 44913 + }, + { + "epoch": 0.44914, + "grad_norm": 1.0145554051196555, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 44914 + }, + { + "epoch": 0.44915, + "grad_norm": 0.9653217841679805, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 44915 + }, + { + "epoch": 0.44916, + "grad_norm": 0.8248802585705206, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 44916 + }, + { + "epoch": 0.44917, + "grad_norm": 0.9649413245828892, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 44917 + }, + { + "epoch": 0.44918, + "grad_norm": 0.8695400064675388, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 44918 + }, + { + "epoch": 0.44919, + "grad_norm": 0.711516330256117, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 44919 + }, + { + "epoch": 0.4492, + "grad_norm": 0.6724004649163021, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 44920 + }, + { + "epoch": 0.44921, + "grad_norm": 0.6780902946712053, + "learning_rate": 0.003, + "loss": 4.045, + "step": 44921 + }, + { + "epoch": 0.44922, + "grad_norm": 0.6212860525834819, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 44922 + }, + { + "epoch": 0.44923, + "grad_norm": 0.5652056150811524, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 44923 + }, + { + "epoch": 0.44924, + "grad_norm": 0.5831105050739768, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 44924 + }, + { + "epoch": 0.44925, + "grad_norm": 0.634079551907866, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 44925 + }, + { + "epoch": 0.44926, + "grad_norm": 0.7419664219671642, + "learning_rate": 0.003, + "loss": 4.04, + "step": 44926 + }, + { + "epoch": 0.44927, + "grad_norm": 0.963269877815032, + "learning_rate": 0.003, + "loss": 3.9998, + "step": 44927 + }, + { + "epoch": 0.44928, + "grad_norm": 1.1748170006729814, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 44928 + }, + { + "epoch": 0.44929, + "grad_norm": 0.8629568835013244, + "learning_rate": 0.003, + "loss": 4.0048, + "step": 44929 + }, + { + "epoch": 0.4493, + "grad_norm": 0.8032395622371823, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 44930 + }, + { + "epoch": 0.44931, + "grad_norm": 0.7688761088838165, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 44931 + }, + { + "epoch": 0.44932, + "grad_norm": 0.765866433124617, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 44932 + }, + { + "epoch": 0.44933, + "grad_norm": 0.7825255541545868, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 44933 + }, + { + "epoch": 0.44934, + "grad_norm": 0.842577904117235, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 44934 + }, + { + "epoch": 0.44935, + "grad_norm": 0.7925479348980838, + "learning_rate": 0.003, + "loss": 3.999, + "step": 44935 + }, + { + "epoch": 0.44936, + "grad_norm": 0.6735958887645308, + "learning_rate": 0.003, + "loss": 4.031, + "step": 44936 + }, + { + "epoch": 0.44937, + "grad_norm": 0.752668237224129, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 44937 + }, + { + "epoch": 0.44938, + "grad_norm": 0.9413618658538949, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 44938 + }, + { + "epoch": 0.44939, + "grad_norm": 1.1592708844608601, + "learning_rate": 0.003, + "loss": 4.05, + "step": 44939 + }, + { + "epoch": 0.4494, + "grad_norm": 0.8211383990382892, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 44940 + }, + { + "epoch": 0.44941, + "grad_norm": 0.8056648994784659, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 44941 + }, + { + "epoch": 0.44942, + "grad_norm": 0.8667313249641143, + "learning_rate": 0.003, + "loss": 4.064, + "step": 44942 + }, + { + "epoch": 0.44943, + "grad_norm": 0.887661636341329, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 44943 + }, + { + "epoch": 0.44944, + "grad_norm": 0.8281136391508597, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 44944 + }, + { + "epoch": 0.44945, + "grad_norm": 0.7792069106239038, + "learning_rate": 0.003, + "loss": 4.001, + "step": 44945 + }, + { + "epoch": 0.44946, + "grad_norm": 0.8633312031315322, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 44946 + }, + { + "epoch": 0.44947, + "grad_norm": 1.029561604041567, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 44947 + }, + { + "epoch": 0.44948, + "grad_norm": 1.1417137441969283, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 44948 + }, + { + "epoch": 0.44949, + "grad_norm": 0.8409219944439937, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 44949 + }, + { + "epoch": 0.4495, + "grad_norm": 0.7439785464008253, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 44950 + }, + { + "epoch": 0.44951, + "grad_norm": 0.7769858967799695, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 44951 + }, + { + "epoch": 0.44952, + "grad_norm": 0.8899696950776723, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 44952 + }, + { + "epoch": 0.44953, + "grad_norm": 0.8874111599145913, + "learning_rate": 0.003, + "loss": 4.0013, + "step": 44953 + }, + { + "epoch": 0.44954, + "grad_norm": 0.8511957027578296, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 44954 + }, + { + "epoch": 0.44955, + "grad_norm": 0.8268491454114244, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 44955 + }, + { + "epoch": 0.44956, + "grad_norm": 0.8244574300612201, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 44956 + }, + { + "epoch": 0.44957, + "grad_norm": 0.768789243832262, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 44957 + }, + { + "epoch": 0.44958, + "grad_norm": 0.7838744876053655, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 44958 + }, + { + "epoch": 0.44959, + "grad_norm": 0.8736779838323918, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 44959 + }, + { + "epoch": 0.4496, + "grad_norm": 0.9165437921236178, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 44960 + }, + { + "epoch": 0.44961, + "grad_norm": 1.033370292780627, + "learning_rate": 0.003, + "loss": 3.9999, + "step": 44961 + }, + { + "epoch": 0.44962, + "grad_norm": 1.1403150310728152, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 44962 + }, + { + "epoch": 0.44963, + "grad_norm": 0.8568830599706214, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 44963 + }, + { + "epoch": 0.44964, + "grad_norm": 0.8626618585875385, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 44964 + }, + { + "epoch": 0.44965, + "grad_norm": 0.9066506175697916, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 44965 + }, + { + "epoch": 0.44966, + "grad_norm": 0.9089420890161255, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 44966 + }, + { + "epoch": 0.44967, + "grad_norm": 0.8854654403232651, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 44967 + }, + { + "epoch": 0.44968, + "grad_norm": 0.8750332832621266, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 44968 + }, + { + "epoch": 0.44969, + "grad_norm": 0.9357707239729709, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 44969 + }, + { + "epoch": 0.4497, + "grad_norm": 0.9421059370479137, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 44970 + }, + { + "epoch": 0.44971, + "grad_norm": 0.8181767187235098, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 44971 + }, + { + "epoch": 0.44972, + "grad_norm": 0.7194990451024972, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 44972 + }, + { + "epoch": 0.44973, + "grad_norm": 0.7435133355330718, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 44973 + }, + { + "epoch": 0.44974, + "grad_norm": 0.8821010666309961, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 44974 + }, + { + "epoch": 0.44975, + "grad_norm": 1.0401149376408771, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 44975 + }, + { + "epoch": 0.44976, + "grad_norm": 1.0070414945122728, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 44976 + }, + { + "epoch": 0.44977, + "grad_norm": 0.8072914793674746, + "learning_rate": 0.003, + "loss": 4.0071, + "step": 44977 + }, + { + "epoch": 0.44978, + "grad_norm": 0.7099747000882289, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 44978 + }, + { + "epoch": 0.44979, + "grad_norm": 0.7077896945495147, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 44979 + }, + { + "epoch": 0.4498, + "grad_norm": 0.6929901274807588, + "learning_rate": 0.003, + "loss": 3.9829, + "step": 44980 + }, + { + "epoch": 0.44981, + "grad_norm": 0.6218875219119027, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 44981 + }, + { + "epoch": 0.44982, + "grad_norm": 0.6389380441578694, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 44982 + }, + { + "epoch": 0.44983, + "grad_norm": 0.6057772287517197, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 44983 + }, + { + "epoch": 0.44984, + "grad_norm": 0.7048911473989297, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 44984 + }, + { + "epoch": 0.44985, + "grad_norm": 0.7375545515585127, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 44985 + }, + { + "epoch": 0.44986, + "grad_norm": 0.7391812579125602, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 44986 + }, + { + "epoch": 0.44987, + "grad_norm": 0.6882189396852213, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 44987 + }, + { + "epoch": 0.44988, + "grad_norm": 0.7046657475770751, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 44988 + }, + { + "epoch": 0.44989, + "grad_norm": 0.848800244957528, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 44989 + }, + { + "epoch": 0.4499, + "grad_norm": 1.0223959208637166, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 44990 + }, + { + "epoch": 0.44991, + "grad_norm": 1.1069088696810796, + "learning_rate": 0.003, + "loss": 4.042, + "step": 44991 + }, + { + "epoch": 0.44992, + "grad_norm": 0.912221511410239, + "learning_rate": 0.003, + "loss": 4.0662, + "step": 44992 + }, + { + "epoch": 0.44993, + "grad_norm": 0.84446941903583, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 44993 + }, + { + "epoch": 0.44994, + "grad_norm": 0.8261350938804035, + "learning_rate": 0.003, + "loss": 4.017, + "step": 44994 + }, + { + "epoch": 0.44995, + "grad_norm": 0.8171072491043495, + "learning_rate": 0.003, + "loss": 3.9903, + "step": 44995 + }, + { + "epoch": 0.44996, + "grad_norm": 0.8856232321732947, + "learning_rate": 0.003, + "loss": 4.0005, + "step": 44996 + }, + { + "epoch": 0.44997, + "grad_norm": 0.85163172615629, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 44997 + }, + { + "epoch": 0.44998, + "grad_norm": 0.7252759415334583, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 44998 + }, + { + "epoch": 0.44999, + "grad_norm": 0.7600777528791997, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 44999 + }, + { + "epoch": 0.45, + "grad_norm": 0.7535329585491424, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 45000 + }, + { + "epoch": 0.45001, + "grad_norm": 0.7521716674633142, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 45001 + }, + { + "epoch": 0.45002, + "grad_norm": 0.7707257331450142, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 45002 + }, + { + "epoch": 0.45003, + "grad_norm": 0.8720497577819328, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 45003 + }, + { + "epoch": 0.45004, + "grad_norm": 1.0653287898741208, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 45004 + }, + { + "epoch": 0.45005, + "grad_norm": 1.0653964386355828, + "learning_rate": 0.003, + "loss": 4.047, + "step": 45005 + }, + { + "epoch": 0.45006, + "grad_norm": 0.9617903736122821, + "learning_rate": 0.003, + "loss": 3.9689, + "step": 45006 + }, + { + "epoch": 0.45007, + "grad_norm": 0.9325466813669033, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 45007 + }, + { + "epoch": 0.45008, + "grad_norm": 1.0041601863305167, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 45008 + }, + { + "epoch": 0.45009, + "grad_norm": 0.913781130998759, + "learning_rate": 0.003, + "loss": 4.014, + "step": 45009 + }, + { + "epoch": 0.4501, + "grad_norm": 0.7793840373515668, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 45010 + }, + { + "epoch": 0.45011, + "grad_norm": 0.7394817249045288, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 45011 + }, + { + "epoch": 0.45012, + "grad_norm": 0.713998032440413, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 45012 + }, + { + "epoch": 0.45013, + "grad_norm": 0.6962777113942066, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 45013 + }, + { + "epoch": 0.45014, + "grad_norm": 0.6588019565608313, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 45014 + }, + { + "epoch": 0.45015, + "grad_norm": 0.654179523745658, + "learning_rate": 0.003, + "loss": 3.9969, + "step": 45015 + }, + { + "epoch": 0.45016, + "grad_norm": 0.6461429527599848, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 45016 + }, + { + "epoch": 0.45017, + "grad_norm": 0.6363862334663026, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 45017 + }, + { + "epoch": 0.45018, + "grad_norm": 0.6193996244574471, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 45018 + }, + { + "epoch": 0.45019, + "grad_norm": 0.6416638439963763, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 45019 + }, + { + "epoch": 0.4502, + "grad_norm": 0.6954487537069561, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 45020 + }, + { + "epoch": 0.45021, + "grad_norm": 0.7708210802076009, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 45021 + }, + { + "epoch": 0.45022, + "grad_norm": 0.910159714547571, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 45022 + }, + { + "epoch": 0.45023, + "grad_norm": 1.0866473764014173, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 45023 + }, + { + "epoch": 0.45024, + "grad_norm": 1.006341444814511, + "learning_rate": 0.003, + "loss": 4.026, + "step": 45024 + }, + { + "epoch": 0.45025, + "grad_norm": 0.9686862331332353, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 45025 + }, + { + "epoch": 0.45026, + "grad_norm": 0.8782841859545698, + "learning_rate": 0.003, + "loss": 4.053, + "step": 45026 + }, + { + "epoch": 0.45027, + "grad_norm": 0.8150998785373549, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 45027 + }, + { + "epoch": 0.45028, + "grad_norm": 0.9884245802599946, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 45028 + }, + { + "epoch": 0.45029, + "grad_norm": 1.0691874827621064, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 45029 + }, + { + "epoch": 0.4503, + "grad_norm": 0.8915707729530452, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 45030 + }, + { + "epoch": 0.45031, + "grad_norm": 0.6963067164047406, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 45031 + }, + { + "epoch": 0.45032, + "grad_norm": 0.7214492381045771, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 45032 + }, + { + "epoch": 0.45033, + "grad_norm": 0.771603419740964, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 45033 + }, + { + "epoch": 0.45034, + "grad_norm": 0.7724891444037247, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 45034 + }, + { + "epoch": 0.45035, + "grad_norm": 0.6981566337637499, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 45035 + }, + { + "epoch": 0.45036, + "grad_norm": 0.7293426958658933, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 45036 + }, + { + "epoch": 0.45037, + "grad_norm": 0.8046869958890939, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 45037 + }, + { + "epoch": 0.45038, + "grad_norm": 0.9532018416851911, + "learning_rate": 0.003, + "loss": 3.9941, + "step": 45038 + }, + { + "epoch": 0.45039, + "grad_norm": 1.1470270779695935, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 45039 + }, + { + "epoch": 0.4504, + "grad_norm": 1.0831118203549324, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 45040 + }, + { + "epoch": 0.45041, + "grad_norm": 0.98927822345362, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 45041 + }, + { + "epoch": 0.45042, + "grad_norm": 1.0216933070373027, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 45042 + }, + { + "epoch": 0.45043, + "grad_norm": 0.9570634160448193, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 45043 + }, + { + "epoch": 0.45044, + "grad_norm": 0.9091728266845808, + "learning_rate": 0.003, + "loss": 4.059, + "step": 45044 + }, + { + "epoch": 0.45045, + "grad_norm": 0.852364886469828, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 45045 + }, + { + "epoch": 0.45046, + "grad_norm": 0.9116284711461707, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 45046 + }, + { + "epoch": 0.45047, + "grad_norm": 1.0356485009312644, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 45047 + }, + { + "epoch": 0.45048, + "grad_norm": 1.009111895364392, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 45048 + }, + { + "epoch": 0.45049, + "grad_norm": 1.0564166217592175, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 45049 + }, + { + "epoch": 0.4505, + "grad_norm": 0.9234210186479159, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 45050 + }, + { + "epoch": 0.45051, + "grad_norm": 0.778681748138369, + "learning_rate": 0.003, + "loss": 4.053, + "step": 45051 + }, + { + "epoch": 0.45052, + "grad_norm": 0.7515192664943839, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 45052 + }, + { + "epoch": 0.45053, + "grad_norm": 0.6605571333078823, + "learning_rate": 0.003, + "loss": 4.0656, + "step": 45053 + }, + { + "epoch": 0.45054, + "grad_norm": 0.6053551935938576, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 45054 + }, + { + "epoch": 0.45055, + "grad_norm": 0.652516640084735, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 45055 + }, + { + "epoch": 0.45056, + "grad_norm": 0.7636003050217194, + "learning_rate": 0.003, + "loss": 4.018, + "step": 45056 + }, + { + "epoch": 0.45057, + "grad_norm": 0.8903759774251606, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 45057 + }, + { + "epoch": 0.45058, + "grad_norm": 0.9150792960219439, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 45058 + }, + { + "epoch": 0.45059, + "grad_norm": 0.9783645025066083, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 45059 + }, + { + "epoch": 0.4506, + "grad_norm": 0.948894524289683, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 45060 + }, + { + "epoch": 0.45061, + "grad_norm": 0.9574160737069337, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 45061 + }, + { + "epoch": 0.45062, + "grad_norm": 1.091238283310838, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 45062 + }, + { + "epoch": 0.45063, + "grad_norm": 0.8690198194247771, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 45063 + }, + { + "epoch": 0.45064, + "grad_norm": 0.6961351463695056, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 45064 + }, + { + "epoch": 0.45065, + "grad_norm": 0.6352970654865987, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 45065 + }, + { + "epoch": 0.45066, + "grad_norm": 0.6444779601675132, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 45066 + }, + { + "epoch": 0.45067, + "grad_norm": 0.6753966678748776, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 45067 + }, + { + "epoch": 0.45068, + "grad_norm": 0.7599977200984747, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 45068 + }, + { + "epoch": 0.45069, + "grad_norm": 0.8555644940752456, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 45069 + }, + { + "epoch": 0.4507, + "grad_norm": 0.8888282737172103, + "learning_rate": 0.003, + "loss": 4.026, + "step": 45070 + }, + { + "epoch": 0.45071, + "grad_norm": 0.9070495289408829, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 45071 + }, + { + "epoch": 0.45072, + "grad_norm": 0.8522930313878998, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 45072 + }, + { + "epoch": 0.45073, + "grad_norm": 0.7776846110577744, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 45073 + }, + { + "epoch": 0.45074, + "grad_norm": 0.8576102051050917, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 45074 + }, + { + "epoch": 0.45075, + "grad_norm": 0.8782283817207519, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 45075 + }, + { + "epoch": 0.45076, + "grad_norm": 1.0348950731874038, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 45076 + }, + { + "epoch": 0.45077, + "grad_norm": 1.1431063542359161, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 45077 + }, + { + "epoch": 0.45078, + "grad_norm": 0.9689396102732247, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 45078 + }, + { + "epoch": 0.45079, + "grad_norm": 0.9099423687730107, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 45079 + }, + { + "epoch": 0.4508, + "grad_norm": 0.9137418837764993, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 45080 + }, + { + "epoch": 0.45081, + "grad_norm": 0.906847692909078, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 45081 + }, + { + "epoch": 0.45082, + "grad_norm": 0.8825765663991514, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 45082 + }, + { + "epoch": 0.45083, + "grad_norm": 0.7658266681167847, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 45083 + }, + { + "epoch": 0.45084, + "grad_norm": 0.7667837469646235, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 45084 + }, + { + "epoch": 0.45085, + "grad_norm": 0.7578566131399076, + "learning_rate": 0.003, + "loss": 3.9857, + "step": 45085 + }, + { + "epoch": 0.45086, + "grad_norm": 0.8728859245592717, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 45086 + }, + { + "epoch": 0.45087, + "grad_norm": 1.0742882737460409, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 45087 + }, + { + "epoch": 0.45088, + "grad_norm": 0.9447386898021247, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 45088 + }, + { + "epoch": 0.45089, + "grad_norm": 0.8359083055454793, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 45089 + }, + { + "epoch": 0.4509, + "grad_norm": 0.78255920843746, + "learning_rate": 0.003, + "loss": 4.0714, + "step": 45090 + }, + { + "epoch": 0.45091, + "grad_norm": 0.7376794988780052, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 45091 + }, + { + "epoch": 0.45092, + "grad_norm": 0.7620010734171733, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 45092 + }, + { + "epoch": 0.45093, + "grad_norm": 0.7609979031301813, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 45093 + }, + { + "epoch": 0.45094, + "grad_norm": 0.7341951581096641, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 45094 + }, + { + "epoch": 0.45095, + "grad_norm": 0.7278274016114025, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 45095 + }, + { + "epoch": 0.45096, + "grad_norm": 0.7287473498327361, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 45096 + }, + { + "epoch": 0.45097, + "grad_norm": 0.8244489491617965, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 45097 + }, + { + "epoch": 0.45098, + "grad_norm": 0.9492462740368002, + "learning_rate": 0.003, + "loss": 4.038, + "step": 45098 + }, + { + "epoch": 0.45099, + "grad_norm": 0.995177956078093, + "learning_rate": 0.003, + "loss": 4.063, + "step": 45099 + }, + { + "epoch": 0.451, + "grad_norm": 1.0570702184131582, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 45100 + }, + { + "epoch": 0.45101, + "grad_norm": 0.8336393481150889, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 45101 + }, + { + "epoch": 0.45102, + "grad_norm": 0.8852654419873945, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 45102 + }, + { + "epoch": 0.45103, + "grad_norm": 0.816096360891983, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 45103 + }, + { + "epoch": 0.45104, + "grad_norm": 0.7945110265929439, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 45104 + }, + { + "epoch": 0.45105, + "grad_norm": 0.8729449032344074, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 45105 + }, + { + "epoch": 0.45106, + "grad_norm": 0.9938038996059563, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 45106 + }, + { + "epoch": 0.45107, + "grad_norm": 1.0074851493399455, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 45107 + }, + { + "epoch": 0.45108, + "grad_norm": 0.8871441021297327, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 45108 + }, + { + "epoch": 0.45109, + "grad_norm": 0.8124237980294703, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 45109 + }, + { + "epoch": 0.4511, + "grad_norm": 0.8066413112671668, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 45110 + }, + { + "epoch": 0.45111, + "grad_norm": 0.7082888582013163, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 45111 + }, + { + "epoch": 0.45112, + "grad_norm": 0.678525599488035, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 45112 + }, + { + "epoch": 0.45113, + "grad_norm": 0.6088924771516474, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 45113 + }, + { + "epoch": 0.45114, + "grad_norm": 0.5537259188778674, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 45114 + }, + { + "epoch": 0.45115, + "grad_norm": 0.5263807313169314, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 45115 + }, + { + "epoch": 0.45116, + "grad_norm": 0.45065278919198964, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 45116 + }, + { + "epoch": 0.45117, + "grad_norm": 0.449740675748417, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 45117 + }, + { + "epoch": 0.45118, + "grad_norm": 0.557629231750443, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 45118 + }, + { + "epoch": 0.45119, + "grad_norm": 0.8252085502487921, + "learning_rate": 0.003, + "loss": 4.029, + "step": 45119 + }, + { + "epoch": 0.4512, + "grad_norm": 1.1444562367117772, + "learning_rate": 0.003, + "loss": 3.9929, + "step": 45120 + }, + { + "epoch": 0.45121, + "grad_norm": 1.0464084335065376, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 45121 + }, + { + "epoch": 0.45122, + "grad_norm": 0.9181176672553539, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 45122 + }, + { + "epoch": 0.45123, + "grad_norm": 0.7778482354204493, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 45123 + }, + { + "epoch": 0.45124, + "grad_norm": 0.7415689266034189, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 45124 + }, + { + "epoch": 0.45125, + "grad_norm": 0.7693464957813165, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 45125 + }, + { + "epoch": 0.45126, + "grad_norm": 0.7760212742290161, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 45126 + }, + { + "epoch": 0.45127, + "grad_norm": 0.8580839634867244, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 45127 + }, + { + "epoch": 0.45128, + "grad_norm": 1.001586403944104, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 45128 + }, + { + "epoch": 0.45129, + "grad_norm": 1.0411973337176783, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 45129 + }, + { + "epoch": 0.4513, + "grad_norm": 0.8210571418911699, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 45130 + }, + { + "epoch": 0.45131, + "grad_norm": 0.733136972117965, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 45131 + }, + { + "epoch": 0.45132, + "grad_norm": 0.7405171778666544, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 45132 + }, + { + "epoch": 0.45133, + "grad_norm": 0.7504666801375941, + "learning_rate": 0.003, + "loss": 3.989, + "step": 45133 + }, + { + "epoch": 0.45134, + "grad_norm": 0.8854205143627638, + "learning_rate": 0.003, + "loss": 4.017, + "step": 45134 + }, + { + "epoch": 0.45135, + "grad_norm": 0.8867066551332616, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 45135 + }, + { + "epoch": 0.45136, + "grad_norm": 0.939653508638403, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 45136 + }, + { + "epoch": 0.45137, + "grad_norm": 1.004293317659957, + "learning_rate": 0.003, + "loss": 4.012, + "step": 45137 + }, + { + "epoch": 0.45138, + "grad_norm": 0.9966341350966206, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 45138 + }, + { + "epoch": 0.45139, + "grad_norm": 0.9126093364818492, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 45139 + }, + { + "epoch": 0.4514, + "grad_norm": 1.004257458093661, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 45140 + }, + { + "epoch": 0.45141, + "grad_norm": 1.128287338096801, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 45141 + }, + { + "epoch": 0.45142, + "grad_norm": 1.181635090405556, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 45142 + }, + { + "epoch": 0.45143, + "grad_norm": 0.9257104684686509, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 45143 + }, + { + "epoch": 0.45144, + "grad_norm": 0.9935288103439811, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 45144 + }, + { + "epoch": 0.45145, + "grad_norm": 1.0932595445746194, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 45145 + }, + { + "epoch": 0.45146, + "grad_norm": 0.8722470437250854, + "learning_rate": 0.003, + "loss": 4.057, + "step": 45146 + }, + { + "epoch": 0.45147, + "grad_norm": 0.7223803408782988, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 45147 + }, + { + "epoch": 0.45148, + "grad_norm": 0.9071403411333745, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 45148 + }, + { + "epoch": 0.45149, + "grad_norm": 0.9254003328159857, + "learning_rate": 0.003, + "loss": 4.032, + "step": 45149 + }, + { + "epoch": 0.4515, + "grad_norm": 0.8122921026345568, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 45150 + }, + { + "epoch": 0.45151, + "grad_norm": 0.8490398284799158, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 45151 + }, + { + "epoch": 0.45152, + "grad_norm": 0.8738500514484057, + "learning_rate": 0.003, + "loss": 4.0463, + "step": 45152 + }, + { + "epoch": 0.45153, + "grad_norm": 0.8349802067227645, + "learning_rate": 0.003, + "loss": 4.0838, + "step": 45153 + }, + { + "epoch": 0.45154, + "grad_norm": 0.7529160290210699, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 45154 + }, + { + "epoch": 0.45155, + "grad_norm": 0.7803864439995633, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 45155 + }, + { + "epoch": 0.45156, + "grad_norm": 0.6945801659641552, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 45156 + }, + { + "epoch": 0.45157, + "grad_norm": 0.6121852595638787, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 45157 + }, + { + "epoch": 0.45158, + "grad_norm": 0.6659691546498369, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 45158 + }, + { + "epoch": 0.45159, + "grad_norm": 0.6932817137216306, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 45159 + }, + { + "epoch": 0.4516, + "grad_norm": 0.8122744323638201, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 45160 + }, + { + "epoch": 0.45161, + "grad_norm": 0.9977858059850842, + "learning_rate": 0.003, + "loss": 4.05, + "step": 45161 + }, + { + "epoch": 0.45162, + "grad_norm": 1.1671435053969326, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 45162 + }, + { + "epoch": 0.45163, + "grad_norm": 0.707360487622524, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 45163 + }, + { + "epoch": 0.45164, + "grad_norm": 0.5973636709834009, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 45164 + }, + { + "epoch": 0.45165, + "grad_norm": 0.6984184930693944, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 45165 + }, + { + "epoch": 0.45166, + "grad_norm": 0.7736326572927001, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 45166 + }, + { + "epoch": 0.45167, + "grad_norm": 0.7852328056431632, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 45167 + }, + { + "epoch": 0.45168, + "grad_norm": 0.7025780126699579, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 45168 + }, + { + "epoch": 0.45169, + "grad_norm": 0.675719486491835, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 45169 + }, + { + "epoch": 0.4517, + "grad_norm": 0.6264815899736283, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 45170 + }, + { + "epoch": 0.45171, + "grad_norm": 0.583955961155674, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 45171 + }, + { + "epoch": 0.45172, + "grad_norm": 0.6514852170695716, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 45172 + }, + { + "epoch": 0.45173, + "grad_norm": 0.7116994937393347, + "learning_rate": 0.003, + "loss": 4.02, + "step": 45173 + }, + { + "epoch": 0.45174, + "grad_norm": 0.7030739874331033, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 45174 + }, + { + "epoch": 0.45175, + "grad_norm": 0.7549561584583605, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 45175 + }, + { + "epoch": 0.45176, + "grad_norm": 0.8434553594226591, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 45176 + }, + { + "epoch": 0.45177, + "grad_norm": 0.9446704518961422, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 45177 + }, + { + "epoch": 0.45178, + "grad_norm": 1.086332978825032, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 45178 + }, + { + "epoch": 0.45179, + "grad_norm": 0.8453346465456151, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 45179 + }, + { + "epoch": 0.4518, + "grad_norm": 0.6954675581813271, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 45180 + }, + { + "epoch": 0.45181, + "grad_norm": 0.667934796638749, + "learning_rate": 0.003, + "loss": 3.9953, + "step": 45181 + }, + { + "epoch": 0.45182, + "grad_norm": 0.7940392668518095, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 45182 + }, + { + "epoch": 0.45183, + "grad_norm": 0.9586854608939543, + "learning_rate": 0.003, + "loss": 4.0024, + "step": 45183 + }, + { + "epoch": 0.45184, + "grad_norm": 1.1008009947799011, + "learning_rate": 0.003, + "loss": 4.0787, + "step": 45184 + }, + { + "epoch": 0.45185, + "grad_norm": 0.9672481858052503, + "learning_rate": 0.003, + "loss": 4.002, + "step": 45185 + }, + { + "epoch": 0.45186, + "grad_norm": 0.9153668423205413, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 45186 + }, + { + "epoch": 0.45187, + "grad_norm": 0.9518101040944213, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 45187 + }, + { + "epoch": 0.45188, + "grad_norm": 0.9794645467825551, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 45188 + }, + { + "epoch": 0.45189, + "grad_norm": 0.9431605784474341, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 45189 + }, + { + "epoch": 0.4519, + "grad_norm": 0.8667212042629147, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 45190 + }, + { + "epoch": 0.45191, + "grad_norm": 0.7819231480306358, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 45191 + }, + { + "epoch": 0.45192, + "grad_norm": 0.7544389935068896, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 45192 + }, + { + "epoch": 0.45193, + "grad_norm": 0.9415821923931877, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 45193 + }, + { + "epoch": 0.45194, + "grad_norm": 0.8802142256810697, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 45194 + }, + { + "epoch": 0.45195, + "grad_norm": 0.9481403874693725, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 45195 + }, + { + "epoch": 0.45196, + "grad_norm": 1.0753512561457033, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 45196 + }, + { + "epoch": 0.45197, + "grad_norm": 0.9203121947458837, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 45197 + }, + { + "epoch": 0.45198, + "grad_norm": 0.9523818418897632, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 45198 + }, + { + "epoch": 0.45199, + "grad_norm": 0.9176591076839524, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 45199 + }, + { + "epoch": 0.452, + "grad_norm": 0.957817383626936, + "learning_rate": 0.003, + "loss": 4.026, + "step": 45200 + }, + { + "epoch": 0.45201, + "grad_norm": 1.0470050796198074, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 45201 + }, + { + "epoch": 0.45202, + "grad_norm": 0.9133894979624608, + "learning_rate": 0.003, + "loss": 4.041, + "step": 45202 + }, + { + "epoch": 0.45203, + "grad_norm": 0.8346889240439429, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 45203 + }, + { + "epoch": 0.45204, + "grad_norm": 0.7860737449495429, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 45204 + }, + { + "epoch": 0.45205, + "grad_norm": 0.9521359322143157, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 45205 + }, + { + "epoch": 0.45206, + "grad_norm": 0.9400104734406732, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 45206 + }, + { + "epoch": 0.45207, + "grad_norm": 0.8303774216724299, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 45207 + }, + { + "epoch": 0.45208, + "grad_norm": 0.8830642050974276, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 45208 + }, + { + "epoch": 0.45209, + "grad_norm": 0.9347688978663454, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 45209 + }, + { + "epoch": 0.4521, + "grad_norm": 0.8435383355044407, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 45210 + }, + { + "epoch": 0.45211, + "grad_norm": 0.7701582331894119, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 45211 + }, + { + "epoch": 0.45212, + "grad_norm": 0.7058179552753885, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 45212 + }, + { + "epoch": 0.45213, + "grad_norm": 0.838955620132763, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 45213 + }, + { + "epoch": 0.45214, + "grad_norm": 1.064016770737717, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 45214 + }, + { + "epoch": 0.45215, + "grad_norm": 0.8630776723728015, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 45215 + }, + { + "epoch": 0.45216, + "grad_norm": 0.7597559009770753, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 45216 + }, + { + "epoch": 0.45217, + "grad_norm": 0.6704285316028507, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 45217 + }, + { + "epoch": 0.45218, + "grad_norm": 0.6649815991838545, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 45218 + }, + { + "epoch": 0.45219, + "grad_norm": 0.6602960160214353, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 45219 + }, + { + "epoch": 0.4522, + "grad_norm": 0.6880571728394965, + "learning_rate": 0.003, + "loss": 3.9926, + "step": 45220 + }, + { + "epoch": 0.45221, + "grad_norm": 0.8108178552283802, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 45221 + }, + { + "epoch": 0.45222, + "grad_norm": 0.9202288730041933, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 45222 + }, + { + "epoch": 0.45223, + "grad_norm": 0.8994732513552174, + "learning_rate": 0.003, + "loss": 4.041, + "step": 45223 + }, + { + "epoch": 0.45224, + "grad_norm": 0.7196041008906998, + "learning_rate": 0.003, + "loss": 3.998, + "step": 45224 + }, + { + "epoch": 0.45225, + "grad_norm": 0.6059044773881026, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 45225 + }, + { + "epoch": 0.45226, + "grad_norm": 0.7514896707343229, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 45226 + }, + { + "epoch": 0.45227, + "grad_norm": 0.773814694563542, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 45227 + }, + { + "epoch": 0.45228, + "grad_norm": 0.7836700203551933, + "learning_rate": 0.003, + "loss": 4.033, + "step": 45228 + }, + { + "epoch": 0.45229, + "grad_norm": 0.8515610099612697, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 45229 + }, + { + "epoch": 0.4523, + "grad_norm": 0.8759413828668655, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 45230 + }, + { + "epoch": 0.45231, + "grad_norm": 0.9392764583891828, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 45231 + }, + { + "epoch": 0.45232, + "grad_norm": 1.0778461351493582, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 45232 + }, + { + "epoch": 0.45233, + "grad_norm": 1.1151692170625744, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 45233 + }, + { + "epoch": 0.45234, + "grad_norm": 0.8434223778168091, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 45234 + }, + { + "epoch": 0.45235, + "grad_norm": 0.6439780041287061, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 45235 + }, + { + "epoch": 0.45236, + "grad_norm": 0.659740226024848, + "learning_rate": 0.003, + "loss": 4.051, + "step": 45236 + }, + { + "epoch": 0.45237, + "grad_norm": 0.6711829833005296, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 45237 + }, + { + "epoch": 0.45238, + "grad_norm": 0.7636972271737785, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 45238 + }, + { + "epoch": 0.45239, + "grad_norm": 0.8140624344252014, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 45239 + }, + { + "epoch": 0.4524, + "grad_norm": 0.8413550194217922, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 45240 + }, + { + "epoch": 0.45241, + "grad_norm": 0.739390890139506, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 45241 + }, + { + "epoch": 0.45242, + "grad_norm": 0.6846162649259815, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 45242 + }, + { + "epoch": 0.45243, + "grad_norm": 0.7936277984983425, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 45243 + }, + { + "epoch": 0.45244, + "grad_norm": 0.8270452662999267, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 45244 + }, + { + "epoch": 0.45245, + "grad_norm": 0.6889451589481977, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 45245 + }, + { + "epoch": 0.45246, + "grad_norm": 0.6977231967566825, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 45246 + }, + { + "epoch": 0.45247, + "grad_norm": 0.7221744932216823, + "learning_rate": 0.003, + "loss": 4.03, + "step": 45247 + }, + { + "epoch": 0.45248, + "grad_norm": 0.7086458677472883, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 45248 + }, + { + "epoch": 0.45249, + "grad_norm": 0.7772757725661621, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 45249 + }, + { + "epoch": 0.4525, + "grad_norm": 0.8591858946220712, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 45250 + }, + { + "epoch": 0.45251, + "grad_norm": 0.8790378365875778, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 45251 + }, + { + "epoch": 0.45252, + "grad_norm": 1.1076820937197323, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 45252 + }, + { + "epoch": 0.45253, + "grad_norm": 1.112865281926109, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 45253 + }, + { + "epoch": 0.45254, + "grad_norm": 0.8986817536215849, + "learning_rate": 0.003, + "loss": 4.0603, + "step": 45254 + }, + { + "epoch": 0.45255, + "grad_norm": 0.8231447865482336, + "learning_rate": 0.003, + "loss": 4.009, + "step": 45255 + }, + { + "epoch": 0.45256, + "grad_norm": 0.7563200375408387, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 45256 + }, + { + "epoch": 0.45257, + "grad_norm": 0.8012720502741298, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 45257 + }, + { + "epoch": 0.45258, + "grad_norm": 0.8576010362934239, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 45258 + }, + { + "epoch": 0.45259, + "grad_norm": 0.9336672506987251, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 45259 + }, + { + "epoch": 0.4526, + "grad_norm": 1.0472527200904114, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 45260 + }, + { + "epoch": 0.45261, + "grad_norm": 0.9337474035910813, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 45261 + }, + { + "epoch": 0.45262, + "grad_norm": 0.9879778044094009, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 45262 + }, + { + "epoch": 0.45263, + "grad_norm": 0.9493107100093708, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 45263 + }, + { + "epoch": 0.45264, + "grad_norm": 0.9196407782038524, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 45264 + }, + { + "epoch": 0.45265, + "grad_norm": 0.8980758196888498, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 45265 + }, + { + "epoch": 0.45266, + "grad_norm": 0.8835957736145377, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 45266 + }, + { + "epoch": 0.45267, + "grad_norm": 1.15422703344409, + "learning_rate": 0.003, + "loss": 4.02, + "step": 45267 + }, + { + "epoch": 0.45268, + "grad_norm": 1.0557976626413705, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 45268 + }, + { + "epoch": 0.45269, + "grad_norm": 1.0314815651913654, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 45269 + }, + { + "epoch": 0.4527, + "grad_norm": 0.910416457464419, + "learning_rate": 0.003, + "loss": 4.05, + "step": 45270 + }, + { + "epoch": 0.45271, + "grad_norm": 0.7736587485356127, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 45271 + }, + { + "epoch": 0.45272, + "grad_norm": 0.6839616346101685, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 45272 + }, + { + "epoch": 0.45273, + "grad_norm": 0.8283486468167655, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 45273 + }, + { + "epoch": 0.45274, + "grad_norm": 0.8957178424373544, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 45274 + }, + { + "epoch": 0.45275, + "grad_norm": 0.9128896504732903, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 45275 + }, + { + "epoch": 0.45276, + "grad_norm": 0.8843603345347153, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 45276 + }, + { + "epoch": 0.45277, + "grad_norm": 0.7915086240073248, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 45277 + }, + { + "epoch": 0.45278, + "grad_norm": 0.7309715558187646, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 45278 + }, + { + "epoch": 0.45279, + "grad_norm": 0.8346451905010044, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 45279 + }, + { + "epoch": 0.4528, + "grad_norm": 0.9281977894205328, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 45280 + }, + { + "epoch": 0.45281, + "grad_norm": 0.975377411310574, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 45281 + }, + { + "epoch": 0.45282, + "grad_norm": 1.0992483557205126, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 45282 + }, + { + "epoch": 0.45283, + "grad_norm": 0.9276602466353108, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 45283 + }, + { + "epoch": 0.45284, + "grad_norm": 0.7377850800949248, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 45284 + }, + { + "epoch": 0.45285, + "grad_norm": 0.6443591140115356, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 45285 + }, + { + "epoch": 0.45286, + "grad_norm": 0.6027733986545081, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 45286 + }, + { + "epoch": 0.45287, + "grad_norm": 0.6408488195969431, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 45287 + }, + { + "epoch": 0.45288, + "grad_norm": 0.7152422355788384, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 45288 + }, + { + "epoch": 0.45289, + "grad_norm": 0.8398561064822544, + "learning_rate": 0.003, + "loss": 3.9889, + "step": 45289 + }, + { + "epoch": 0.4529, + "grad_norm": 0.966494351958308, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 45290 + }, + { + "epoch": 0.45291, + "grad_norm": 0.9474256845872675, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 45291 + }, + { + "epoch": 0.45292, + "grad_norm": 0.8561095248497337, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 45292 + }, + { + "epoch": 0.45293, + "grad_norm": 0.852909453041928, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 45293 + }, + { + "epoch": 0.45294, + "grad_norm": 0.8600791354059818, + "learning_rate": 0.003, + "loss": 4.0086, + "step": 45294 + }, + { + "epoch": 0.45295, + "grad_norm": 0.7766790501549421, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 45295 + }, + { + "epoch": 0.45296, + "grad_norm": 0.7052648639632482, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 45296 + }, + { + "epoch": 0.45297, + "grad_norm": 0.6726447689132824, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 45297 + }, + { + "epoch": 0.45298, + "grad_norm": 0.70358426365187, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 45298 + }, + { + "epoch": 0.45299, + "grad_norm": 0.7242214913878665, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 45299 + }, + { + "epoch": 0.453, + "grad_norm": 0.7167411585448191, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 45300 + }, + { + "epoch": 0.45301, + "grad_norm": 0.7764247996630754, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 45301 + }, + { + "epoch": 0.45302, + "grad_norm": 0.8590783412192006, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 45302 + }, + { + "epoch": 0.45303, + "grad_norm": 0.9195498150322811, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 45303 + }, + { + "epoch": 0.45304, + "grad_norm": 0.9808273468884391, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 45304 + }, + { + "epoch": 0.45305, + "grad_norm": 1.1731251252965917, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 45305 + }, + { + "epoch": 0.45306, + "grad_norm": 0.9351658447561682, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 45306 + }, + { + "epoch": 0.45307, + "grad_norm": 0.8136519578933257, + "learning_rate": 0.003, + "loss": 4.013, + "step": 45307 + }, + { + "epoch": 0.45308, + "grad_norm": 0.6650942911075444, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 45308 + }, + { + "epoch": 0.45309, + "grad_norm": 0.6478141355181395, + "learning_rate": 0.003, + "loss": 4.047, + "step": 45309 + }, + { + "epoch": 0.4531, + "grad_norm": 0.6272011906258071, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 45310 + }, + { + "epoch": 0.45311, + "grad_norm": 0.6145729690501417, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 45311 + }, + { + "epoch": 0.45312, + "grad_norm": 0.6705677890022037, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 45312 + }, + { + "epoch": 0.45313, + "grad_norm": 0.7940202480059221, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 45313 + }, + { + "epoch": 0.45314, + "grad_norm": 0.9080872696446429, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 45314 + }, + { + "epoch": 0.45315, + "grad_norm": 0.9324131137026064, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 45315 + }, + { + "epoch": 0.45316, + "grad_norm": 0.9247646307200156, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 45316 + }, + { + "epoch": 0.45317, + "grad_norm": 0.8218808756835281, + "learning_rate": 0.003, + "loss": 3.9916, + "step": 45317 + }, + { + "epoch": 0.45318, + "grad_norm": 0.8309449470256615, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 45318 + }, + { + "epoch": 0.45319, + "grad_norm": 0.8882761717027047, + "learning_rate": 0.003, + "loss": 4.0678, + "step": 45319 + }, + { + "epoch": 0.4532, + "grad_norm": 1.010831705997576, + "learning_rate": 0.003, + "loss": 4.0623, + "step": 45320 + }, + { + "epoch": 0.45321, + "grad_norm": 1.0595543764343487, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 45321 + }, + { + "epoch": 0.45322, + "grad_norm": 1.1169243531305533, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 45322 + }, + { + "epoch": 0.45323, + "grad_norm": 0.8959926048460136, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 45323 + }, + { + "epoch": 0.45324, + "grad_norm": 0.8836739552330163, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 45324 + }, + { + "epoch": 0.45325, + "grad_norm": 0.887055063875273, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 45325 + }, + { + "epoch": 0.45326, + "grad_norm": 0.8756543989733448, + "learning_rate": 0.003, + "loss": 3.9989, + "step": 45326 + }, + { + "epoch": 0.45327, + "grad_norm": 0.842646091216108, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 45327 + }, + { + "epoch": 0.45328, + "grad_norm": 0.9596059207180784, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 45328 + }, + { + "epoch": 0.45329, + "grad_norm": 1.0427987903808167, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 45329 + }, + { + "epoch": 0.4533, + "grad_norm": 1.0781982293741323, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 45330 + }, + { + "epoch": 0.45331, + "grad_norm": 0.7773480701265413, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 45331 + }, + { + "epoch": 0.45332, + "grad_norm": 0.7098408373999742, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 45332 + }, + { + "epoch": 0.45333, + "grad_norm": 0.7882821795214396, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 45333 + }, + { + "epoch": 0.45334, + "grad_norm": 0.8313383240411921, + "learning_rate": 0.003, + "loss": 3.9913, + "step": 45334 + }, + { + "epoch": 0.45335, + "grad_norm": 0.7180478247909079, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 45335 + }, + { + "epoch": 0.45336, + "grad_norm": 0.7492373846108838, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 45336 + }, + { + "epoch": 0.45337, + "grad_norm": 0.776846829076091, + "learning_rate": 0.003, + "loss": 4.0052, + "step": 45337 + }, + { + "epoch": 0.45338, + "grad_norm": 0.6866169143559375, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 45338 + }, + { + "epoch": 0.45339, + "grad_norm": 0.6854290992317674, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 45339 + }, + { + "epoch": 0.4534, + "grad_norm": 0.6361656734536711, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 45340 + }, + { + "epoch": 0.45341, + "grad_norm": 0.8406326318854689, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 45341 + }, + { + "epoch": 0.45342, + "grad_norm": 1.0261532931853514, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 45342 + }, + { + "epoch": 0.45343, + "grad_norm": 1.2119155147151053, + "learning_rate": 0.003, + "loss": 4.023, + "step": 45343 + }, + { + "epoch": 0.45344, + "grad_norm": 0.7754207813694066, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 45344 + }, + { + "epoch": 0.45345, + "grad_norm": 0.746826389901834, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 45345 + }, + { + "epoch": 0.45346, + "grad_norm": 0.7912622965095704, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 45346 + }, + { + "epoch": 0.45347, + "grad_norm": 0.7972820830120572, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 45347 + }, + { + "epoch": 0.45348, + "grad_norm": 0.7674776402219706, + "learning_rate": 0.003, + "loss": 3.9834, + "step": 45348 + }, + { + "epoch": 0.45349, + "grad_norm": 0.7627203610613078, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 45349 + }, + { + "epoch": 0.4535, + "grad_norm": 0.720908863068727, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 45350 + }, + { + "epoch": 0.45351, + "grad_norm": 0.7625571954819189, + "learning_rate": 0.003, + "loss": 4.036, + "step": 45351 + }, + { + "epoch": 0.45352, + "grad_norm": 0.7973269671229432, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 45352 + }, + { + "epoch": 0.45353, + "grad_norm": 0.6725875800235659, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 45353 + }, + { + "epoch": 0.45354, + "grad_norm": 0.6284405222023263, + "learning_rate": 0.003, + "loss": 3.9958, + "step": 45354 + }, + { + "epoch": 0.45355, + "grad_norm": 0.7932072724799876, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 45355 + }, + { + "epoch": 0.45356, + "grad_norm": 0.8953130335595655, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 45356 + }, + { + "epoch": 0.45357, + "grad_norm": 1.0018194726396488, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 45357 + }, + { + "epoch": 0.45358, + "grad_norm": 1.1826834931358554, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 45358 + }, + { + "epoch": 0.45359, + "grad_norm": 0.9479587597755593, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 45359 + }, + { + "epoch": 0.4536, + "grad_norm": 0.7970899827768594, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 45360 + }, + { + "epoch": 0.45361, + "grad_norm": 0.6677366004835993, + "learning_rate": 0.003, + "loss": 4.028, + "step": 45361 + }, + { + "epoch": 0.45362, + "grad_norm": 0.7054935284008395, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 45362 + }, + { + "epoch": 0.45363, + "grad_norm": 0.8665631996174551, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 45363 + }, + { + "epoch": 0.45364, + "grad_norm": 0.7909773889347228, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 45364 + }, + { + "epoch": 0.45365, + "grad_norm": 0.7478615475385401, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 45365 + }, + { + "epoch": 0.45366, + "grad_norm": 0.8225488812438091, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 45366 + }, + { + "epoch": 0.45367, + "grad_norm": 0.8099449160966457, + "learning_rate": 0.003, + "loss": 4.018, + "step": 45367 + }, + { + "epoch": 0.45368, + "grad_norm": 0.7336336932168146, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 45368 + }, + { + "epoch": 0.45369, + "grad_norm": 0.7196323553754008, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 45369 + }, + { + "epoch": 0.4537, + "grad_norm": 0.7042480263193166, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 45370 + }, + { + "epoch": 0.45371, + "grad_norm": 0.7345665068912323, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 45371 + }, + { + "epoch": 0.45372, + "grad_norm": 0.9141743136306792, + "learning_rate": 0.003, + "loss": 4.029, + "step": 45372 + }, + { + "epoch": 0.45373, + "grad_norm": 0.9121976203918208, + "learning_rate": 0.003, + "loss": 3.9946, + "step": 45373 + }, + { + "epoch": 0.45374, + "grad_norm": 0.969829881321605, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 45374 + }, + { + "epoch": 0.45375, + "grad_norm": 1.1998708582782414, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 45375 + }, + { + "epoch": 0.45376, + "grad_norm": 0.8914635424532543, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 45376 + }, + { + "epoch": 0.45377, + "grad_norm": 0.8352467700903811, + "learning_rate": 0.003, + "loss": 4.026, + "step": 45377 + }, + { + "epoch": 0.45378, + "grad_norm": 0.7354703926908045, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 45378 + }, + { + "epoch": 0.45379, + "grad_norm": 0.6254518033450769, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 45379 + }, + { + "epoch": 0.4538, + "grad_norm": 0.6589327400549888, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 45380 + }, + { + "epoch": 0.45381, + "grad_norm": 0.6034904730836785, + "learning_rate": 0.003, + "loss": 3.9742, + "step": 45381 + }, + { + "epoch": 0.45382, + "grad_norm": 0.5829238915803127, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 45382 + }, + { + "epoch": 0.45383, + "grad_norm": 0.5471508306195731, + "learning_rate": 0.003, + "loss": 3.9778, + "step": 45383 + }, + { + "epoch": 0.45384, + "grad_norm": 0.5135817943412851, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 45384 + }, + { + "epoch": 0.45385, + "grad_norm": 0.5959892677574793, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 45385 + }, + { + "epoch": 0.45386, + "grad_norm": 0.6777479743480775, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 45386 + }, + { + "epoch": 0.45387, + "grad_norm": 0.7632002814224836, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 45387 + }, + { + "epoch": 0.45388, + "grad_norm": 0.9359989492391715, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 45388 + }, + { + "epoch": 0.45389, + "grad_norm": 1.1208552359786295, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 45389 + }, + { + "epoch": 0.4539, + "grad_norm": 0.8134607319552946, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 45390 + }, + { + "epoch": 0.45391, + "grad_norm": 0.7347716301611891, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 45391 + }, + { + "epoch": 0.45392, + "grad_norm": 0.7296352827196358, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 45392 + }, + { + "epoch": 0.45393, + "grad_norm": 0.8149549044461281, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 45393 + }, + { + "epoch": 0.45394, + "grad_norm": 0.9986064329973046, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 45394 + }, + { + "epoch": 0.45395, + "grad_norm": 0.9801474872971154, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 45395 + }, + { + "epoch": 0.45396, + "grad_norm": 0.8419384751532067, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 45396 + }, + { + "epoch": 0.45397, + "grad_norm": 0.8380181690147234, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 45397 + }, + { + "epoch": 0.45398, + "grad_norm": 0.774777564024713, + "learning_rate": 0.003, + "loss": 3.9838, + "step": 45398 + }, + { + "epoch": 0.45399, + "grad_norm": 0.7560366276237787, + "learning_rate": 0.003, + "loss": 4.052, + "step": 45399 + }, + { + "epoch": 0.454, + "grad_norm": 0.7610305137711921, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 45400 + }, + { + "epoch": 0.45401, + "grad_norm": 0.9837098668328964, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 45401 + }, + { + "epoch": 0.45402, + "grad_norm": 1.2084938252951174, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 45402 + }, + { + "epoch": 0.45403, + "grad_norm": 0.9183152526645085, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 45403 + }, + { + "epoch": 0.45404, + "grad_norm": 0.8971219047538045, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 45404 + }, + { + "epoch": 0.45405, + "grad_norm": 0.8533816034202825, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 45405 + }, + { + "epoch": 0.45406, + "grad_norm": 0.7722684660022877, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 45406 + }, + { + "epoch": 0.45407, + "grad_norm": 0.8852022094903568, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 45407 + }, + { + "epoch": 0.45408, + "grad_norm": 1.0119382232311307, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 45408 + }, + { + "epoch": 0.45409, + "grad_norm": 1.1288864085792254, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 45409 + }, + { + "epoch": 0.4541, + "grad_norm": 1.0039254575423158, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 45410 + }, + { + "epoch": 0.45411, + "grad_norm": 1.04896516249944, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 45411 + }, + { + "epoch": 0.45412, + "grad_norm": 1.0746097497276423, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 45412 + }, + { + "epoch": 0.45413, + "grad_norm": 0.9558673801294872, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 45413 + }, + { + "epoch": 0.45414, + "grad_norm": 0.9676866410766161, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 45414 + }, + { + "epoch": 0.45415, + "grad_norm": 0.9733347565222177, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 45415 + }, + { + "epoch": 0.45416, + "grad_norm": 0.9695923629194035, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 45416 + }, + { + "epoch": 0.45417, + "grad_norm": 1.0938557036305907, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 45417 + }, + { + "epoch": 0.45418, + "grad_norm": 1.087385290832473, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 45418 + }, + { + "epoch": 0.45419, + "grad_norm": 0.883306393316532, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 45419 + }, + { + "epoch": 0.4542, + "grad_norm": 0.9102737970214008, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 45420 + }, + { + "epoch": 0.45421, + "grad_norm": 0.958247252795279, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 45421 + }, + { + "epoch": 0.45422, + "grad_norm": 1.0335044108799913, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 45422 + }, + { + "epoch": 0.45423, + "grad_norm": 1.0286050733446275, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 45423 + }, + { + "epoch": 0.45424, + "grad_norm": 0.8706634464318002, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 45424 + }, + { + "epoch": 0.45425, + "grad_norm": 0.8148885400337059, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 45425 + }, + { + "epoch": 0.45426, + "grad_norm": 0.8007655767491624, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 45426 + }, + { + "epoch": 0.45427, + "grad_norm": 0.7276370533465524, + "learning_rate": 0.003, + "loss": 4.0689, + "step": 45427 + }, + { + "epoch": 0.45428, + "grad_norm": 0.6412321415002404, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 45428 + }, + { + "epoch": 0.45429, + "grad_norm": 0.6521992373941747, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 45429 + }, + { + "epoch": 0.4543, + "grad_norm": 0.6177036050142585, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 45430 + }, + { + "epoch": 0.45431, + "grad_norm": 0.5695162678288466, + "learning_rate": 0.003, + "loss": 4.03, + "step": 45431 + }, + { + "epoch": 0.45432, + "grad_norm": 0.47374103486077856, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 45432 + }, + { + "epoch": 0.45433, + "grad_norm": 0.45977070690159133, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 45433 + }, + { + "epoch": 0.45434, + "grad_norm": 0.458714640427915, + "learning_rate": 0.003, + "loss": 3.9981, + "step": 45434 + }, + { + "epoch": 0.45435, + "grad_norm": 0.49255074137343746, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 45435 + }, + { + "epoch": 0.45436, + "grad_norm": 0.6301936179298345, + "learning_rate": 0.003, + "loss": 4.04, + "step": 45436 + }, + { + "epoch": 0.45437, + "grad_norm": 0.7943039148506419, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 45437 + }, + { + "epoch": 0.45438, + "grad_norm": 0.9830769291616797, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 45438 + }, + { + "epoch": 0.45439, + "grad_norm": 1.1594357031949396, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 45439 + }, + { + "epoch": 0.4544, + "grad_norm": 0.6517723209140454, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 45440 + }, + { + "epoch": 0.45441, + "grad_norm": 0.6193233844620009, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 45441 + }, + { + "epoch": 0.45442, + "grad_norm": 0.7320624481420769, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 45442 + }, + { + "epoch": 0.45443, + "grad_norm": 0.7668275608761693, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 45443 + }, + { + "epoch": 0.45444, + "grad_norm": 0.7884227253937081, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 45444 + }, + { + "epoch": 0.45445, + "grad_norm": 0.8453447586379707, + "learning_rate": 0.003, + "loss": 4.0, + "step": 45445 + }, + { + "epoch": 0.45446, + "grad_norm": 0.8898514809958369, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 45446 + }, + { + "epoch": 0.45447, + "grad_norm": 0.9979390588245193, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 45447 + }, + { + "epoch": 0.45448, + "grad_norm": 1.2476364704191538, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 45448 + }, + { + "epoch": 0.45449, + "grad_norm": 0.7972474308985367, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 45449 + }, + { + "epoch": 0.4545, + "grad_norm": 0.7755548452136086, + "learning_rate": 0.003, + "loss": 4.046, + "step": 45450 + }, + { + "epoch": 0.45451, + "grad_norm": 0.8689501708245361, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 45451 + }, + { + "epoch": 0.45452, + "grad_norm": 0.8893548447993759, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 45452 + }, + { + "epoch": 0.45453, + "grad_norm": 0.8354187606445271, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 45453 + }, + { + "epoch": 0.45454, + "grad_norm": 0.7599630475425808, + "learning_rate": 0.003, + "loss": 4.0671, + "step": 45454 + }, + { + "epoch": 0.45455, + "grad_norm": 0.8353662341704832, + "learning_rate": 0.003, + "loss": 4.041, + "step": 45455 + }, + { + "epoch": 0.45456, + "grad_norm": 0.9938809868880318, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 45456 + }, + { + "epoch": 0.45457, + "grad_norm": 1.1526748046335715, + "learning_rate": 0.003, + "loss": 4.0721, + "step": 45457 + }, + { + "epoch": 0.45458, + "grad_norm": 0.8481229072620786, + "learning_rate": 0.003, + "loss": 4.0497, + "step": 45458 + }, + { + "epoch": 0.45459, + "grad_norm": 0.7709637300391878, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 45459 + }, + { + "epoch": 0.4546, + "grad_norm": 0.6928583856407996, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 45460 + }, + { + "epoch": 0.45461, + "grad_norm": 0.5985527121171544, + "learning_rate": 0.003, + "loss": 3.9987, + "step": 45461 + }, + { + "epoch": 0.45462, + "grad_norm": 0.5468811850700163, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 45462 + }, + { + "epoch": 0.45463, + "grad_norm": 0.6396642157065423, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 45463 + }, + { + "epoch": 0.45464, + "grad_norm": 0.7919408248364381, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 45464 + }, + { + "epoch": 0.45465, + "grad_norm": 0.8745239731965201, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 45465 + }, + { + "epoch": 0.45466, + "grad_norm": 0.9488331597267481, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 45466 + }, + { + "epoch": 0.45467, + "grad_norm": 0.984820370779825, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 45467 + }, + { + "epoch": 0.45468, + "grad_norm": 0.9988612494188643, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 45468 + }, + { + "epoch": 0.45469, + "grad_norm": 0.9382139024605617, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 45469 + }, + { + "epoch": 0.4547, + "grad_norm": 0.9124132007387235, + "learning_rate": 0.003, + "loss": 3.9956, + "step": 45470 + }, + { + "epoch": 0.45471, + "grad_norm": 1.0346799838884182, + "learning_rate": 0.003, + "loss": 4.046, + "step": 45471 + }, + { + "epoch": 0.45472, + "grad_norm": 0.9139258902329228, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 45472 + }, + { + "epoch": 0.45473, + "grad_norm": 0.8961328122909866, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 45473 + }, + { + "epoch": 0.45474, + "grad_norm": 0.9910727839755126, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 45474 + }, + { + "epoch": 0.45475, + "grad_norm": 1.071609966399681, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 45475 + }, + { + "epoch": 0.45476, + "grad_norm": 0.8839260216286899, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 45476 + }, + { + "epoch": 0.45477, + "grad_norm": 0.7807095992906216, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 45477 + }, + { + "epoch": 0.45478, + "grad_norm": 0.7444501141343263, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 45478 + }, + { + "epoch": 0.45479, + "grad_norm": 0.7921140379698698, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 45479 + }, + { + "epoch": 0.4548, + "grad_norm": 1.0834608811035091, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 45480 + }, + { + "epoch": 0.45481, + "grad_norm": 1.1092346392961878, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 45481 + }, + { + "epoch": 0.45482, + "grad_norm": 0.9090133007622945, + "learning_rate": 0.003, + "loss": 4.0615, + "step": 45482 + }, + { + "epoch": 0.45483, + "grad_norm": 0.8365314557795088, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 45483 + }, + { + "epoch": 0.45484, + "grad_norm": 0.746283334970976, + "learning_rate": 0.003, + "loss": 4.022, + "step": 45484 + }, + { + "epoch": 0.45485, + "grad_norm": 0.6233341838599291, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 45485 + }, + { + "epoch": 0.45486, + "grad_norm": 0.6066942238056715, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 45486 + }, + { + "epoch": 0.45487, + "grad_norm": 0.649739392193358, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 45487 + }, + { + "epoch": 0.45488, + "grad_norm": 0.7192660419690886, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 45488 + }, + { + "epoch": 0.45489, + "grad_norm": 0.8987330135746161, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 45489 + }, + { + "epoch": 0.4549, + "grad_norm": 0.9005770426677466, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 45490 + }, + { + "epoch": 0.45491, + "grad_norm": 0.7940563900227477, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 45491 + }, + { + "epoch": 0.45492, + "grad_norm": 0.8593701102292788, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 45492 + }, + { + "epoch": 0.45493, + "grad_norm": 0.8743120347534116, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 45493 + }, + { + "epoch": 0.45494, + "grad_norm": 0.7747081072884167, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 45494 + }, + { + "epoch": 0.45495, + "grad_norm": 0.7795740436521645, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 45495 + }, + { + "epoch": 0.45496, + "grad_norm": 0.9283501141163042, + "learning_rate": 0.003, + "loss": 4.0543, + "step": 45496 + }, + { + "epoch": 0.45497, + "grad_norm": 0.9986183560086334, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 45497 + }, + { + "epoch": 0.45498, + "grad_norm": 0.9839981309729595, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 45498 + }, + { + "epoch": 0.45499, + "grad_norm": 0.9301479055060253, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 45499 + }, + { + "epoch": 0.455, + "grad_norm": 0.7927606784123753, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 45500 + }, + { + "epoch": 0.45501, + "grad_norm": 0.7720637577010243, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 45501 + }, + { + "epoch": 0.45502, + "grad_norm": 0.7398766998864964, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 45502 + }, + { + "epoch": 0.45503, + "grad_norm": 0.7586995649045102, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 45503 + }, + { + "epoch": 0.45504, + "grad_norm": 0.9486151161786276, + "learning_rate": 0.003, + "loss": 4.0669, + "step": 45504 + }, + { + "epoch": 0.45505, + "grad_norm": 0.9553607950113439, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 45505 + }, + { + "epoch": 0.45506, + "grad_norm": 1.0653374190865392, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 45506 + }, + { + "epoch": 0.45507, + "grad_norm": 0.9692772420999408, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 45507 + }, + { + "epoch": 0.45508, + "grad_norm": 0.9455346701623637, + "learning_rate": 0.003, + "loss": 4.038, + "step": 45508 + }, + { + "epoch": 0.45509, + "grad_norm": 0.9626714052717751, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 45509 + }, + { + "epoch": 0.4551, + "grad_norm": 0.8416484740431638, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 45510 + }, + { + "epoch": 0.45511, + "grad_norm": 0.7485650040118836, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 45511 + }, + { + "epoch": 0.45512, + "grad_norm": 0.771230606381515, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 45512 + }, + { + "epoch": 0.45513, + "grad_norm": 0.8186806956652269, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 45513 + }, + { + "epoch": 0.45514, + "grad_norm": 0.7790104458929109, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 45514 + }, + { + "epoch": 0.45515, + "grad_norm": 0.6789240367999735, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 45515 + }, + { + "epoch": 0.45516, + "grad_norm": 0.742379248474403, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 45516 + }, + { + "epoch": 0.45517, + "grad_norm": 0.7763023017680016, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 45517 + }, + { + "epoch": 0.45518, + "grad_norm": 0.8318147486914556, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 45518 + }, + { + "epoch": 0.45519, + "grad_norm": 0.7568657854976436, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 45519 + }, + { + "epoch": 0.4552, + "grad_norm": 0.667211514784072, + "learning_rate": 0.003, + "loss": 4.031, + "step": 45520 + }, + { + "epoch": 0.45521, + "grad_norm": 0.6491854058462977, + "learning_rate": 0.003, + "loss": 4.002, + "step": 45521 + }, + { + "epoch": 0.45522, + "grad_norm": 0.5585475429786427, + "learning_rate": 0.003, + "loss": 4.013, + "step": 45522 + }, + { + "epoch": 0.45523, + "grad_norm": 0.5557191406946306, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 45523 + }, + { + "epoch": 0.45524, + "grad_norm": 0.6195668926552719, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 45524 + }, + { + "epoch": 0.45525, + "grad_norm": 0.6654969686002143, + "learning_rate": 0.003, + "loss": 3.9962, + "step": 45525 + }, + { + "epoch": 0.45526, + "grad_norm": 0.8307988657293531, + "learning_rate": 0.003, + "loss": 3.9954, + "step": 45526 + }, + { + "epoch": 0.45527, + "grad_norm": 1.0514296101744531, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 45527 + }, + { + "epoch": 0.45528, + "grad_norm": 1.0094712878050824, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 45528 + }, + { + "epoch": 0.45529, + "grad_norm": 1.064642037229251, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 45529 + }, + { + "epoch": 0.4553, + "grad_norm": 0.9455412544082369, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 45530 + }, + { + "epoch": 0.45531, + "grad_norm": 0.6974577052847643, + "learning_rate": 0.003, + "loss": 4.015, + "step": 45531 + }, + { + "epoch": 0.45532, + "grad_norm": 0.6976044133526356, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 45532 + }, + { + "epoch": 0.45533, + "grad_norm": 0.7521036127162949, + "learning_rate": 0.003, + "loss": 4.027, + "step": 45533 + }, + { + "epoch": 0.45534, + "grad_norm": 0.7497353073294734, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 45534 + }, + { + "epoch": 0.45535, + "grad_norm": 0.7313011280874765, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 45535 + }, + { + "epoch": 0.45536, + "grad_norm": 0.7156362653521162, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 45536 + }, + { + "epoch": 0.45537, + "grad_norm": 0.759639883020969, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 45537 + }, + { + "epoch": 0.45538, + "grad_norm": 0.8030971204485885, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 45538 + }, + { + "epoch": 0.45539, + "grad_norm": 0.8529699044141846, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 45539 + }, + { + "epoch": 0.4554, + "grad_norm": 0.9200661799880123, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 45540 + }, + { + "epoch": 0.45541, + "grad_norm": 0.9061705946790726, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 45541 + }, + { + "epoch": 0.45542, + "grad_norm": 0.9594196796252497, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 45542 + }, + { + "epoch": 0.45543, + "grad_norm": 0.9791670629152284, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 45543 + }, + { + "epoch": 0.45544, + "grad_norm": 0.9908777713249977, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 45544 + }, + { + "epoch": 0.45545, + "grad_norm": 1.00042043621753, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 45545 + }, + { + "epoch": 0.45546, + "grad_norm": 0.9864119854845959, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 45546 + }, + { + "epoch": 0.45547, + "grad_norm": 0.9545883194858868, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 45547 + }, + { + "epoch": 0.45548, + "grad_norm": 0.8472913608063539, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 45548 + }, + { + "epoch": 0.45549, + "grad_norm": 0.6020153177381072, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 45549 + }, + { + "epoch": 0.4555, + "grad_norm": 0.6901446919675848, + "learning_rate": 0.003, + "loss": 4.045, + "step": 45550 + }, + { + "epoch": 0.45551, + "grad_norm": 0.8116784269952833, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 45551 + }, + { + "epoch": 0.45552, + "grad_norm": 0.8616011551846399, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 45552 + }, + { + "epoch": 0.45553, + "grad_norm": 0.8319064433543527, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 45553 + }, + { + "epoch": 0.45554, + "grad_norm": 0.907113697167661, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 45554 + }, + { + "epoch": 0.45555, + "grad_norm": 0.9826947788777765, + "learning_rate": 0.003, + "loss": 4.0048, + "step": 45555 + }, + { + "epoch": 0.45556, + "grad_norm": 0.9562908357564291, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 45556 + }, + { + "epoch": 0.45557, + "grad_norm": 0.8734200616612611, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 45557 + }, + { + "epoch": 0.45558, + "grad_norm": 0.767803270340384, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 45558 + }, + { + "epoch": 0.45559, + "grad_norm": 0.6870475743701959, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 45559 + }, + { + "epoch": 0.4556, + "grad_norm": 0.6987408053903333, + "learning_rate": 0.003, + "loss": 4.032, + "step": 45560 + }, + { + "epoch": 0.45561, + "grad_norm": 0.6264485762371859, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 45561 + }, + { + "epoch": 0.45562, + "grad_norm": 0.5761748067734493, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 45562 + }, + { + "epoch": 0.45563, + "grad_norm": 0.565588960948011, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 45563 + }, + { + "epoch": 0.45564, + "grad_norm": 0.557928276195087, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 45564 + }, + { + "epoch": 0.45565, + "grad_norm": 0.5882810097721547, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 45565 + }, + { + "epoch": 0.45566, + "grad_norm": 0.6483487338517503, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 45566 + }, + { + "epoch": 0.45567, + "grad_norm": 0.7741959791319246, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 45567 + }, + { + "epoch": 0.45568, + "grad_norm": 0.948872439151438, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 45568 + }, + { + "epoch": 0.45569, + "grad_norm": 1.2614153741674832, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 45569 + }, + { + "epoch": 0.4557, + "grad_norm": 0.971248829220503, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 45570 + }, + { + "epoch": 0.45571, + "grad_norm": 1.0433050614486379, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 45571 + }, + { + "epoch": 0.45572, + "grad_norm": 0.9376853583094393, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 45572 + }, + { + "epoch": 0.45573, + "grad_norm": 0.8821320277280997, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 45573 + }, + { + "epoch": 0.45574, + "grad_norm": 0.7966035654835659, + "learning_rate": 0.003, + "loss": 4.039, + "step": 45574 + }, + { + "epoch": 0.45575, + "grad_norm": 0.8507609905266768, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 45575 + }, + { + "epoch": 0.45576, + "grad_norm": 0.9500268257945214, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 45576 + }, + { + "epoch": 0.45577, + "grad_norm": 1.048356611301137, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 45577 + }, + { + "epoch": 0.45578, + "grad_norm": 1.0420325307216258, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 45578 + }, + { + "epoch": 0.45579, + "grad_norm": 1.0630547913107533, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 45579 + }, + { + "epoch": 0.4558, + "grad_norm": 1.0180146943288833, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 45580 + }, + { + "epoch": 0.45581, + "grad_norm": 0.901704617486308, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 45581 + }, + { + "epoch": 0.45582, + "grad_norm": 0.8862753394274732, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 45582 + }, + { + "epoch": 0.45583, + "grad_norm": 0.7706219380686751, + "learning_rate": 0.003, + "loss": 4.0182, + "step": 45583 + }, + { + "epoch": 0.45584, + "grad_norm": 0.63702662427909, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 45584 + }, + { + "epoch": 0.45585, + "grad_norm": 0.5355941999358379, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 45585 + }, + { + "epoch": 0.45586, + "grad_norm": 0.6194059203849233, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 45586 + }, + { + "epoch": 0.45587, + "grad_norm": 0.7164024138581827, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 45587 + }, + { + "epoch": 0.45588, + "grad_norm": 0.7412746691011954, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 45588 + }, + { + "epoch": 0.45589, + "grad_norm": 0.6630225110175442, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 45589 + }, + { + "epoch": 0.4559, + "grad_norm": 0.7566227304391777, + "learning_rate": 0.003, + "loss": 3.9886, + "step": 45590 + }, + { + "epoch": 0.45591, + "grad_norm": 0.9142929184480122, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 45591 + }, + { + "epoch": 0.45592, + "grad_norm": 0.8823387580764784, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 45592 + }, + { + "epoch": 0.45593, + "grad_norm": 0.7202745389312132, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 45593 + }, + { + "epoch": 0.45594, + "grad_norm": 0.6721642918200951, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 45594 + }, + { + "epoch": 0.45595, + "grad_norm": 0.7203956125238936, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 45595 + }, + { + "epoch": 0.45596, + "grad_norm": 0.7293199756465759, + "learning_rate": 0.003, + "loss": 4.0068, + "step": 45596 + }, + { + "epoch": 0.45597, + "grad_norm": 0.9019494096303808, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 45597 + }, + { + "epoch": 0.45598, + "grad_norm": 1.1577110031387867, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 45598 + }, + { + "epoch": 0.45599, + "grad_norm": 1.3109264848547613, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 45599 + }, + { + "epoch": 0.456, + "grad_norm": 0.6817898862249892, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 45600 + }, + { + "epoch": 0.45601, + "grad_norm": 0.7113185732684061, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 45601 + }, + { + "epoch": 0.45602, + "grad_norm": 0.8063229686139676, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 45602 + }, + { + "epoch": 0.45603, + "grad_norm": 0.8206409919614315, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 45603 + }, + { + "epoch": 0.45604, + "grad_norm": 0.8904033478453178, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 45604 + }, + { + "epoch": 0.45605, + "grad_norm": 0.9351125216211288, + "learning_rate": 0.003, + "loss": 4.051, + "step": 45605 + }, + { + "epoch": 0.45606, + "grad_norm": 0.9006089753266067, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 45606 + }, + { + "epoch": 0.45607, + "grad_norm": 0.7116924073627584, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 45607 + }, + { + "epoch": 0.45608, + "grad_norm": 0.6213886882866063, + "learning_rate": 0.003, + "loss": 3.9965, + "step": 45608 + }, + { + "epoch": 0.45609, + "grad_norm": 0.5344284455073741, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 45609 + }, + { + "epoch": 0.4561, + "grad_norm": 0.551279050252476, + "learning_rate": 0.003, + "loss": 3.9808, + "step": 45610 + }, + { + "epoch": 0.45611, + "grad_norm": 0.6251848996618282, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 45611 + }, + { + "epoch": 0.45612, + "grad_norm": 0.7128715776013879, + "learning_rate": 0.003, + "loss": 4.023, + "step": 45612 + }, + { + "epoch": 0.45613, + "grad_norm": 0.7586317107400435, + "learning_rate": 0.003, + "loss": 4.008, + "step": 45613 + }, + { + "epoch": 0.45614, + "grad_norm": 0.6829518408950923, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 45614 + }, + { + "epoch": 0.45615, + "grad_norm": 0.688463388268285, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 45615 + }, + { + "epoch": 0.45616, + "grad_norm": 0.7385714191058682, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 45616 + }, + { + "epoch": 0.45617, + "grad_norm": 0.7224854676559317, + "learning_rate": 0.003, + "loss": 4.047, + "step": 45617 + }, + { + "epoch": 0.45618, + "grad_norm": 0.8079634277092217, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 45618 + }, + { + "epoch": 0.45619, + "grad_norm": 1.0447943003840647, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 45619 + }, + { + "epoch": 0.4562, + "grad_norm": 1.1964633268996008, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 45620 + }, + { + "epoch": 0.45621, + "grad_norm": 0.8514727273850327, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 45621 + }, + { + "epoch": 0.45622, + "grad_norm": 0.7756124120539923, + "learning_rate": 0.003, + "loss": 4.0058, + "step": 45622 + }, + { + "epoch": 0.45623, + "grad_norm": 0.7578170179967519, + "learning_rate": 0.003, + "loss": 3.9826, + "step": 45623 + }, + { + "epoch": 0.45624, + "grad_norm": 0.8180303919131862, + "learning_rate": 0.003, + "loss": 3.9934, + "step": 45624 + }, + { + "epoch": 0.45625, + "grad_norm": 0.9647225746474363, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 45625 + }, + { + "epoch": 0.45626, + "grad_norm": 1.0536742693362633, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 45626 + }, + { + "epoch": 0.45627, + "grad_norm": 0.8567598932755611, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 45627 + }, + { + "epoch": 0.45628, + "grad_norm": 0.8781159076398959, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 45628 + }, + { + "epoch": 0.45629, + "grad_norm": 0.8357464911699556, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 45629 + }, + { + "epoch": 0.4563, + "grad_norm": 0.805481348946293, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 45630 + }, + { + "epoch": 0.45631, + "grad_norm": 0.902765005441729, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 45631 + }, + { + "epoch": 0.45632, + "grad_norm": 0.9392514551153002, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 45632 + }, + { + "epoch": 0.45633, + "grad_norm": 0.945114078475436, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 45633 + }, + { + "epoch": 0.45634, + "grad_norm": 0.9286429974913155, + "learning_rate": 0.003, + "loss": 4.034, + "step": 45634 + }, + { + "epoch": 0.45635, + "grad_norm": 0.9308148219928475, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 45635 + }, + { + "epoch": 0.45636, + "grad_norm": 1.0549780561995095, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 45636 + }, + { + "epoch": 0.45637, + "grad_norm": 1.1125690737307377, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 45637 + }, + { + "epoch": 0.45638, + "grad_norm": 0.8155001673085944, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 45638 + }, + { + "epoch": 0.45639, + "grad_norm": 0.656517319262082, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 45639 + }, + { + "epoch": 0.4564, + "grad_norm": 0.6706678947722391, + "learning_rate": 0.003, + "loss": 4.05, + "step": 45640 + }, + { + "epoch": 0.45641, + "grad_norm": 0.7842408061945897, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 45641 + }, + { + "epoch": 0.45642, + "grad_norm": 0.8950320081036915, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 45642 + }, + { + "epoch": 0.45643, + "grad_norm": 1.033367495618979, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 45643 + }, + { + "epoch": 0.45644, + "grad_norm": 0.9281274595113936, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 45644 + }, + { + "epoch": 0.45645, + "grad_norm": 0.9433113223076969, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 45645 + }, + { + "epoch": 0.45646, + "grad_norm": 0.9623450431402325, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 45646 + }, + { + "epoch": 0.45647, + "grad_norm": 1.0366526331852808, + "learning_rate": 0.003, + "loss": 4.0672, + "step": 45647 + }, + { + "epoch": 0.45648, + "grad_norm": 1.0777309651219298, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 45648 + }, + { + "epoch": 0.45649, + "grad_norm": 0.9599435362093072, + "learning_rate": 0.003, + "loss": 4.011, + "step": 45649 + }, + { + "epoch": 0.4565, + "grad_norm": 0.8752161164309897, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 45650 + }, + { + "epoch": 0.45651, + "grad_norm": 1.0068571975361709, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 45651 + }, + { + "epoch": 0.45652, + "grad_norm": 0.9900459747293798, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 45652 + }, + { + "epoch": 0.45653, + "grad_norm": 0.9420624747904544, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 45653 + }, + { + "epoch": 0.45654, + "grad_norm": 0.9789868990585011, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 45654 + }, + { + "epoch": 0.45655, + "grad_norm": 1.0140911790420304, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 45655 + }, + { + "epoch": 0.45656, + "grad_norm": 0.918751765151564, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 45656 + }, + { + "epoch": 0.45657, + "grad_norm": 0.7479639886729362, + "learning_rate": 0.003, + "loss": 4.0124, + "step": 45657 + }, + { + "epoch": 0.45658, + "grad_norm": 0.687854978295029, + "learning_rate": 0.003, + "loss": 4.0024, + "step": 45658 + }, + { + "epoch": 0.45659, + "grad_norm": 0.8055508703868801, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 45659 + }, + { + "epoch": 0.4566, + "grad_norm": 0.8549912079574731, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 45660 + }, + { + "epoch": 0.45661, + "grad_norm": 0.8209847604121142, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 45661 + }, + { + "epoch": 0.45662, + "grad_norm": 0.8730102362719403, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 45662 + }, + { + "epoch": 0.45663, + "grad_norm": 0.9655749508061086, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 45663 + }, + { + "epoch": 0.45664, + "grad_norm": 1.0391399656827114, + "learning_rate": 0.003, + "loss": 4.055, + "step": 45664 + }, + { + "epoch": 0.45665, + "grad_norm": 0.8422195092384677, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 45665 + }, + { + "epoch": 0.45666, + "grad_norm": 0.803930193356331, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 45666 + }, + { + "epoch": 0.45667, + "grad_norm": 0.7670991124731659, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 45667 + }, + { + "epoch": 0.45668, + "grad_norm": 0.8067805423180888, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 45668 + }, + { + "epoch": 0.45669, + "grad_norm": 0.8980194890546872, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 45669 + }, + { + "epoch": 0.4567, + "grad_norm": 0.9009081639549379, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 45670 + }, + { + "epoch": 0.45671, + "grad_norm": 0.8236899970089157, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 45671 + }, + { + "epoch": 0.45672, + "grad_norm": 0.7378002208152876, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 45672 + }, + { + "epoch": 0.45673, + "grad_norm": 0.6596684085935599, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 45673 + }, + { + "epoch": 0.45674, + "grad_norm": 0.602156044066984, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 45674 + }, + { + "epoch": 0.45675, + "grad_norm": 0.6104115307799634, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 45675 + }, + { + "epoch": 0.45676, + "grad_norm": 0.7865395780042448, + "learning_rate": 0.003, + "loss": 4.026, + "step": 45676 + }, + { + "epoch": 0.45677, + "grad_norm": 0.8887028538886982, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 45677 + }, + { + "epoch": 0.45678, + "grad_norm": 1.0675016988508959, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 45678 + }, + { + "epoch": 0.45679, + "grad_norm": 1.0158604724276448, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 45679 + }, + { + "epoch": 0.4568, + "grad_norm": 0.9627733595732502, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 45680 + }, + { + "epoch": 0.45681, + "grad_norm": 0.8767257421602539, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 45681 + }, + { + "epoch": 0.45682, + "grad_norm": 0.8826116249417618, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 45682 + }, + { + "epoch": 0.45683, + "grad_norm": 0.7987294613827792, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 45683 + }, + { + "epoch": 0.45684, + "grad_norm": 0.8931281488685875, + "learning_rate": 0.003, + "loss": 4.044, + "step": 45684 + }, + { + "epoch": 0.45685, + "grad_norm": 1.0260877322865776, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 45685 + }, + { + "epoch": 0.45686, + "grad_norm": 0.9868560034138975, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 45686 + }, + { + "epoch": 0.45687, + "grad_norm": 1.1828456474986009, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 45687 + }, + { + "epoch": 0.45688, + "grad_norm": 0.8479310966719661, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 45688 + }, + { + "epoch": 0.45689, + "grad_norm": 0.6673281196838954, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 45689 + }, + { + "epoch": 0.4569, + "grad_norm": 0.6422971652683213, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 45690 + }, + { + "epoch": 0.45691, + "grad_norm": 0.5801068435864924, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 45691 + }, + { + "epoch": 0.45692, + "grad_norm": 0.6092167599912038, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 45692 + }, + { + "epoch": 0.45693, + "grad_norm": 0.6271919366446225, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 45693 + }, + { + "epoch": 0.45694, + "grad_norm": 0.5564117194223818, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 45694 + }, + { + "epoch": 0.45695, + "grad_norm": 0.5107903701215368, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 45695 + }, + { + "epoch": 0.45696, + "grad_norm": 0.5992251684130128, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 45696 + }, + { + "epoch": 0.45697, + "grad_norm": 0.6415385557778087, + "learning_rate": 0.003, + "loss": 3.9931, + "step": 45697 + }, + { + "epoch": 0.45698, + "grad_norm": 0.7238573585405784, + "learning_rate": 0.003, + "loss": 3.9687, + "step": 45698 + }, + { + "epoch": 0.45699, + "grad_norm": 0.9577405234835955, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 45699 + }, + { + "epoch": 0.457, + "grad_norm": 1.2167374522421777, + "learning_rate": 0.003, + "loss": 4.046, + "step": 45700 + }, + { + "epoch": 0.45701, + "grad_norm": 0.9129891265538039, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 45701 + }, + { + "epoch": 0.45702, + "grad_norm": 0.9040412530125034, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 45702 + }, + { + "epoch": 0.45703, + "grad_norm": 0.8956478127124609, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 45703 + }, + { + "epoch": 0.45704, + "grad_norm": 0.9278730269310939, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 45704 + }, + { + "epoch": 0.45705, + "grad_norm": 1.0096891751666102, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 45705 + }, + { + "epoch": 0.45706, + "grad_norm": 0.9481919782384982, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 45706 + }, + { + "epoch": 0.45707, + "grad_norm": 0.8806761096984208, + "learning_rate": 0.003, + "loss": 4.0013, + "step": 45707 + }, + { + "epoch": 0.45708, + "grad_norm": 0.753968461384301, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 45708 + }, + { + "epoch": 0.45709, + "grad_norm": 0.6856087573215952, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 45709 + }, + { + "epoch": 0.4571, + "grad_norm": 0.7470449077418873, + "learning_rate": 0.003, + "loss": 4.041, + "step": 45710 + }, + { + "epoch": 0.45711, + "grad_norm": 0.8917934451950905, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 45711 + }, + { + "epoch": 0.45712, + "grad_norm": 1.0076125305434602, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 45712 + }, + { + "epoch": 0.45713, + "grad_norm": 0.9751428264134452, + "learning_rate": 0.003, + "loss": 4.014, + "step": 45713 + }, + { + "epoch": 0.45714, + "grad_norm": 0.8366908696091028, + "learning_rate": 0.003, + "loss": 4.012, + "step": 45714 + }, + { + "epoch": 0.45715, + "grad_norm": 0.77037016456216, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 45715 + }, + { + "epoch": 0.45716, + "grad_norm": 0.6887142723275849, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 45716 + }, + { + "epoch": 0.45717, + "grad_norm": 0.6145780441302576, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 45717 + }, + { + "epoch": 0.45718, + "grad_norm": 0.5985890914346652, + "learning_rate": 0.003, + "loss": 3.9961, + "step": 45718 + }, + { + "epoch": 0.45719, + "grad_norm": 0.6198249472030245, + "learning_rate": 0.003, + "loss": 4.026, + "step": 45719 + }, + { + "epoch": 0.4572, + "grad_norm": 0.6095626233386484, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 45720 + }, + { + "epoch": 0.45721, + "grad_norm": 0.7495574942317936, + "learning_rate": 0.003, + "loss": 4.0014, + "step": 45721 + }, + { + "epoch": 0.45722, + "grad_norm": 0.8700876370113926, + "learning_rate": 0.003, + "loss": 4.0061, + "step": 45722 + }, + { + "epoch": 0.45723, + "grad_norm": 0.9324817836151955, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 45723 + }, + { + "epoch": 0.45724, + "grad_norm": 1.0196605589304821, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 45724 + }, + { + "epoch": 0.45725, + "grad_norm": 0.9888696670121958, + "learning_rate": 0.003, + "loss": 4.044, + "step": 45725 + }, + { + "epoch": 0.45726, + "grad_norm": 0.8752824840132033, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 45726 + }, + { + "epoch": 0.45727, + "grad_norm": 0.8671183653075212, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 45727 + }, + { + "epoch": 0.45728, + "grad_norm": 0.9073850702941797, + "learning_rate": 0.003, + "loss": 4.074, + "step": 45728 + }, + { + "epoch": 0.45729, + "grad_norm": 0.8994364900977998, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 45729 + }, + { + "epoch": 0.4573, + "grad_norm": 0.8966392840504118, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 45730 + }, + { + "epoch": 0.45731, + "grad_norm": 1.0036549043004492, + "learning_rate": 0.003, + "loss": 4.033, + "step": 45731 + }, + { + "epoch": 0.45732, + "grad_norm": 0.9583564547325678, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 45732 + }, + { + "epoch": 0.45733, + "grad_norm": 0.8833521270997945, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 45733 + }, + { + "epoch": 0.45734, + "grad_norm": 0.7719565878431592, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 45734 + }, + { + "epoch": 0.45735, + "grad_norm": 0.8005974679234552, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 45735 + }, + { + "epoch": 0.45736, + "grad_norm": 0.9067717994195996, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 45736 + }, + { + "epoch": 0.45737, + "grad_norm": 0.9613327089963153, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 45737 + }, + { + "epoch": 0.45738, + "grad_norm": 1.1645605690691068, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 45738 + }, + { + "epoch": 0.45739, + "grad_norm": 1.1193938144454196, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 45739 + }, + { + "epoch": 0.4574, + "grad_norm": 1.0293438296249562, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 45740 + }, + { + "epoch": 0.45741, + "grad_norm": 0.900194156046003, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 45741 + }, + { + "epoch": 0.45742, + "grad_norm": 0.6865929960052959, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 45742 + }, + { + "epoch": 0.45743, + "grad_norm": 0.7102088267620075, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 45743 + }, + { + "epoch": 0.45744, + "grad_norm": 0.8629767900994081, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 45744 + }, + { + "epoch": 0.45745, + "grad_norm": 1.1483316360705074, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 45745 + }, + { + "epoch": 0.45746, + "grad_norm": 0.9445884865708238, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 45746 + }, + { + "epoch": 0.45747, + "grad_norm": 0.9410087123238225, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 45747 + }, + { + "epoch": 0.45748, + "grad_norm": 0.9294212581984868, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 45748 + }, + { + "epoch": 0.45749, + "grad_norm": 0.9299063915944115, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 45749 + }, + { + "epoch": 0.4575, + "grad_norm": 0.8774848486022494, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 45750 + }, + { + "epoch": 0.45751, + "grad_norm": 0.8364464173372352, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 45751 + }, + { + "epoch": 0.45752, + "grad_norm": 0.8469858026394662, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 45752 + }, + { + "epoch": 0.45753, + "grad_norm": 0.9118303904999503, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 45753 + }, + { + "epoch": 0.45754, + "grad_norm": 0.9950160953931687, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 45754 + }, + { + "epoch": 0.45755, + "grad_norm": 0.9667095434785097, + "learning_rate": 0.003, + "loss": 4.0646, + "step": 45755 + }, + { + "epoch": 0.45756, + "grad_norm": 0.9593841271178553, + "learning_rate": 0.003, + "loss": 4.045, + "step": 45756 + }, + { + "epoch": 0.45757, + "grad_norm": 0.8782057637528284, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 45757 + }, + { + "epoch": 0.45758, + "grad_norm": 0.8226644628536675, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 45758 + }, + { + "epoch": 0.45759, + "grad_norm": 0.7160309090666416, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 45759 + }, + { + "epoch": 0.4576, + "grad_norm": 0.6294653976816437, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 45760 + }, + { + "epoch": 0.45761, + "grad_norm": 0.5696414573102204, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 45761 + }, + { + "epoch": 0.45762, + "grad_norm": 0.5289360976111053, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 45762 + }, + { + "epoch": 0.45763, + "grad_norm": 0.5161647508889665, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 45763 + }, + { + "epoch": 0.45764, + "grad_norm": 0.4697470718973671, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 45764 + }, + { + "epoch": 0.45765, + "grad_norm": 0.5359828777229705, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 45765 + }, + { + "epoch": 0.45766, + "grad_norm": 0.6769255346220769, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 45766 + }, + { + "epoch": 0.45767, + "grad_norm": 0.8938811154300222, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 45767 + }, + { + "epoch": 0.45768, + "grad_norm": 1.0391522881273456, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 45768 + }, + { + "epoch": 0.45769, + "grad_norm": 0.937223469716404, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 45769 + }, + { + "epoch": 0.4577, + "grad_norm": 0.7663458141476365, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 45770 + }, + { + "epoch": 0.45771, + "grad_norm": 0.6833930188320141, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 45771 + }, + { + "epoch": 0.45772, + "grad_norm": 0.8082418939688866, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 45772 + }, + { + "epoch": 0.45773, + "grad_norm": 0.8348386992686129, + "learning_rate": 0.003, + "loss": 3.9906, + "step": 45773 + }, + { + "epoch": 0.45774, + "grad_norm": 0.7221485653340484, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 45774 + }, + { + "epoch": 0.45775, + "grad_norm": 0.6835725991285144, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 45775 + }, + { + "epoch": 0.45776, + "grad_norm": 0.707794231952861, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 45776 + }, + { + "epoch": 0.45777, + "grad_norm": 0.6734283956724523, + "learning_rate": 0.003, + "loss": 3.9966, + "step": 45777 + }, + { + "epoch": 0.45778, + "grad_norm": 0.6726039485787326, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 45778 + }, + { + "epoch": 0.45779, + "grad_norm": 0.7644033509336102, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 45779 + }, + { + "epoch": 0.4578, + "grad_norm": 0.8581736477096313, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 45780 + }, + { + "epoch": 0.45781, + "grad_norm": 0.9327742711667386, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 45781 + }, + { + "epoch": 0.45782, + "grad_norm": 0.9149398442668588, + "learning_rate": 0.003, + "loss": 4.052, + "step": 45782 + }, + { + "epoch": 0.45783, + "grad_norm": 1.0519229680754139, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 45783 + }, + { + "epoch": 0.45784, + "grad_norm": 1.1407836682884371, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 45784 + }, + { + "epoch": 0.45785, + "grad_norm": 0.9147718465308109, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 45785 + }, + { + "epoch": 0.45786, + "grad_norm": 0.9177371510875191, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 45786 + }, + { + "epoch": 0.45787, + "grad_norm": 0.9893074551722557, + "learning_rate": 0.003, + "loss": 4.059, + "step": 45787 + }, + { + "epoch": 0.45788, + "grad_norm": 0.88791552520174, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 45788 + }, + { + "epoch": 0.45789, + "grad_norm": 0.9228152222964849, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 45789 + }, + { + "epoch": 0.4579, + "grad_norm": 0.8307890581982842, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 45790 + }, + { + "epoch": 0.45791, + "grad_norm": 0.720969029879689, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 45791 + }, + { + "epoch": 0.45792, + "grad_norm": 0.7156067225842089, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 45792 + }, + { + "epoch": 0.45793, + "grad_norm": 0.8003883981437475, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 45793 + }, + { + "epoch": 0.45794, + "grad_norm": 0.8078581473630407, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 45794 + }, + { + "epoch": 0.45795, + "grad_norm": 0.9043963113797959, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 45795 + }, + { + "epoch": 0.45796, + "grad_norm": 1.0760129434675143, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 45796 + }, + { + "epoch": 0.45797, + "grad_norm": 1.0575214716110792, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 45797 + }, + { + "epoch": 0.45798, + "grad_norm": 0.9267937917517242, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 45798 + }, + { + "epoch": 0.45799, + "grad_norm": 0.7450197040096707, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 45799 + }, + { + "epoch": 0.458, + "grad_norm": 0.6744847317984404, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 45800 + }, + { + "epoch": 0.45801, + "grad_norm": 0.7571966091212818, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 45801 + }, + { + "epoch": 0.45802, + "grad_norm": 0.7295012423941821, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 45802 + }, + { + "epoch": 0.45803, + "grad_norm": 0.6741726988080806, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 45803 + }, + { + "epoch": 0.45804, + "grad_norm": 0.6172158631166483, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 45804 + }, + { + "epoch": 0.45805, + "grad_norm": 0.6521931333100813, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 45805 + }, + { + "epoch": 0.45806, + "grad_norm": 0.5881908552825337, + "learning_rate": 0.003, + "loss": 4.027, + "step": 45806 + }, + { + "epoch": 0.45807, + "grad_norm": 0.643283304003125, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 45807 + }, + { + "epoch": 0.45808, + "grad_norm": 0.7809839792395686, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 45808 + }, + { + "epoch": 0.45809, + "grad_norm": 1.0306165697198044, + "learning_rate": 0.003, + "loss": 4.033, + "step": 45809 + }, + { + "epoch": 0.4581, + "grad_norm": 1.126829300723892, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 45810 + }, + { + "epoch": 0.45811, + "grad_norm": 0.9319359511517921, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 45811 + }, + { + "epoch": 0.45812, + "grad_norm": 0.9807644069481718, + "learning_rate": 0.003, + "loss": 4.0568, + "step": 45812 + }, + { + "epoch": 0.45813, + "grad_norm": 0.9171175939579189, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 45813 + }, + { + "epoch": 0.45814, + "grad_norm": 1.0287943836035651, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 45814 + }, + { + "epoch": 0.45815, + "grad_norm": 0.8646844862416587, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 45815 + }, + { + "epoch": 0.45816, + "grad_norm": 0.8510866729321744, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 45816 + }, + { + "epoch": 0.45817, + "grad_norm": 0.8376783961411893, + "learning_rate": 0.003, + "loss": 4.0752, + "step": 45817 + }, + { + "epoch": 0.45818, + "grad_norm": 0.7613936607712952, + "learning_rate": 0.003, + "loss": 4.052, + "step": 45818 + }, + { + "epoch": 0.45819, + "grad_norm": 0.7235615043270677, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 45819 + }, + { + "epoch": 0.4582, + "grad_norm": 0.7515077823988452, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 45820 + }, + { + "epoch": 0.45821, + "grad_norm": 0.8157989505432688, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 45821 + }, + { + "epoch": 0.45822, + "grad_norm": 0.8198303018116322, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 45822 + }, + { + "epoch": 0.45823, + "grad_norm": 0.796398304203491, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 45823 + }, + { + "epoch": 0.45824, + "grad_norm": 0.8590237543165279, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 45824 + }, + { + "epoch": 0.45825, + "grad_norm": 0.8104691388451857, + "learning_rate": 0.003, + "loss": 4.022, + "step": 45825 + }, + { + "epoch": 0.45826, + "grad_norm": 0.8891062613571287, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 45826 + }, + { + "epoch": 0.45827, + "grad_norm": 1.0175862395636555, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 45827 + }, + { + "epoch": 0.45828, + "grad_norm": 1.1397932530305146, + "learning_rate": 0.003, + "loss": 4.042, + "step": 45828 + }, + { + "epoch": 0.45829, + "grad_norm": 0.7657061622308814, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 45829 + }, + { + "epoch": 0.4583, + "grad_norm": 0.6613703001823337, + "learning_rate": 0.003, + "loss": 3.9938, + "step": 45830 + }, + { + "epoch": 0.45831, + "grad_norm": 0.547677278051275, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 45831 + }, + { + "epoch": 0.45832, + "grad_norm": 0.601058196867938, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 45832 + }, + { + "epoch": 0.45833, + "grad_norm": 0.7135010481042681, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 45833 + }, + { + "epoch": 0.45834, + "grad_norm": 0.8915104935997855, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 45834 + }, + { + "epoch": 0.45835, + "grad_norm": 1.0278891932059562, + "learning_rate": 0.003, + "loss": 3.995, + "step": 45835 + }, + { + "epoch": 0.45836, + "grad_norm": 0.8431539738272275, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 45836 + }, + { + "epoch": 0.45837, + "grad_norm": 0.8635334318183848, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 45837 + }, + { + "epoch": 0.45838, + "grad_norm": 0.8663706134132951, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 45838 + }, + { + "epoch": 0.45839, + "grad_norm": 0.8401932850156375, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 45839 + }, + { + "epoch": 0.4584, + "grad_norm": 0.7726747395422978, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 45840 + }, + { + "epoch": 0.45841, + "grad_norm": 0.7627101139576009, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 45841 + }, + { + "epoch": 0.45842, + "grad_norm": 0.7952904298683913, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 45842 + }, + { + "epoch": 0.45843, + "grad_norm": 0.9119945810269203, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 45843 + }, + { + "epoch": 0.45844, + "grad_norm": 0.8949933709615175, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 45844 + }, + { + "epoch": 0.45845, + "grad_norm": 0.7664502483951965, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 45845 + }, + { + "epoch": 0.45846, + "grad_norm": 0.747893187527391, + "learning_rate": 0.003, + "loss": 4.0483, + "step": 45846 + }, + { + "epoch": 0.45847, + "grad_norm": 0.8345769453490114, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 45847 + }, + { + "epoch": 0.45848, + "grad_norm": 1.0864167169607022, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 45848 + }, + { + "epoch": 0.45849, + "grad_norm": 1.372841961137247, + "learning_rate": 0.003, + "loss": 4.05, + "step": 45849 + }, + { + "epoch": 0.4585, + "grad_norm": 0.7236294678490086, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 45850 + }, + { + "epoch": 0.45851, + "grad_norm": 0.6747199083552036, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 45851 + }, + { + "epoch": 0.45852, + "grad_norm": 0.8325857250207056, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 45852 + }, + { + "epoch": 0.45853, + "grad_norm": 0.9501715249327524, + "learning_rate": 0.003, + "loss": 3.9964, + "step": 45853 + }, + { + "epoch": 0.45854, + "grad_norm": 0.9863464099151913, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 45854 + }, + { + "epoch": 0.45855, + "grad_norm": 0.9646365414295242, + "learning_rate": 0.003, + "loss": 4.026, + "step": 45855 + }, + { + "epoch": 0.45856, + "grad_norm": 0.8800129334570769, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 45856 + }, + { + "epoch": 0.45857, + "grad_norm": 0.7658762151515085, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 45857 + }, + { + "epoch": 0.45858, + "grad_norm": 0.7039214675058957, + "learning_rate": 0.003, + "loss": 3.9949, + "step": 45858 + }, + { + "epoch": 0.45859, + "grad_norm": 0.6937781900951961, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 45859 + }, + { + "epoch": 0.4586, + "grad_norm": 0.6760579549383237, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 45860 + }, + { + "epoch": 0.45861, + "grad_norm": 0.7143142036204769, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 45861 + }, + { + "epoch": 0.45862, + "grad_norm": 0.7761571033165544, + "learning_rate": 0.003, + "loss": 3.9993, + "step": 45862 + }, + { + "epoch": 0.45863, + "grad_norm": 0.8330442893523846, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 45863 + }, + { + "epoch": 0.45864, + "grad_norm": 1.1729552240322536, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 45864 + }, + { + "epoch": 0.45865, + "grad_norm": 1.1921095625103606, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 45865 + }, + { + "epoch": 0.45866, + "grad_norm": 0.7363255903223376, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 45866 + }, + { + "epoch": 0.45867, + "grad_norm": 0.7122413182842151, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 45867 + }, + { + "epoch": 0.45868, + "grad_norm": 0.7857534886542193, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 45868 + }, + { + "epoch": 0.45869, + "grad_norm": 0.8364048214301106, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 45869 + }, + { + "epoch": 0.4587, + "grad_norm": 0.826962372614985, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 45870 + }, + { + "epoch": 0.45871, + "grad_norm": 0.8188970496801261, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 45871 + }, + { + "epoch": 0.45872, + "grad_norm": 0.8300589886273395, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 45872 + }, + { + "epoch": 0.45873, + "grad_norm": 0.8148997578272075, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 45873 + }, + { + "epoch": 0.45874, + "grad_norm": 0.7995197153184525, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 45874 + }, + { + "epoch": 0.45875, + "grad_norm": 0.7664685948172243, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 45875 + }, + { + "epoch": 0.45876, + "grad_norm": 0.7605282262805102, + "learning_rate": 0.003, + "loss": 4.0587, + "step": 45876 + }, + { + "epoch": 0.45877, + "grad_norm": 0.8425154971643312, + "learning_rate": 0.003, + "loss": 4.0313, + "step": 45877 + }, + { + "epoch": 0.45878, + "grad_norm": 0.9187242349497327, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 45878 + }, + { + "epoch": 0.45879, + "grad_norm": 1.017951548700173, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 45879 + }, + { + "epoch": 0.4588, + "grad_norm": 0.9434264683155619, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 45880 + }, + { + "epoch": 0.45881, + "grad_norm": 0.704156831219182, + "learning_rate": 0.003, + "loss": 4.0134, + "step": 45881 + }, + { + "epoch": 0.45882, + "grad_norm": 0.6546957718410645, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 45882 + }, + { + "epoch": 0.45883, + "grad_norm": 0.6330241964800897, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 45883 + }, + { + "epoch": 0.45884, + "grad_norm": 0.6718559042930617, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 45884 + }, + { + "epoch": 0.45885, + "grad_norm": 0.7192211962276244, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 45885 + }, + { + "epoch": 0.45886, + "grad_norm": 0.8617655072911343, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 45886 + }, + { + "epoch": 0.45887, + "grad_norm": 1.038656342906339, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 45887 + }, + { + "epoch": 0.45888, + "grad_norm": 1.1915111573130552, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 45888 + }, + { + "epoch": 0.45889, + "grad_norm": 0.9243157132668446, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 45889 + }, + { + "epoch": 0.4589, + "grad_norm": 0.9342529545926288, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 45890 + }, + { + "epoch": 0.45891, + "grad_norm": 0.9485382425131498, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 45891 + }, + { + "epoch": 0.45892, + "grad_norm": 1.0409305377232125, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 45892 + }, + { + "epoch": 0.45893, + "grad_norm": 0.9962315274100223, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 45893 + }, + { + "epoch": 0.45894, + "grad_norm": 1.066079121022366, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 45894 + }, + { + "epoch": 0.45895, + "grad_norm": 0.791794800791947, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 45895 + }, + { + "epoch": 0.45896, + "grad_norm": 0.7988382102919458, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 45896 + }, + { + "epoch": 0.45897, + "grad_norm": 0.780623238243102, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 45897 + }, + { + "epoch": 0.45898, + "grad_norm": 0.8044874089046685, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 45898 + }, + { + "epoch": 0.45899, + "grad_norm": 0.8169615797061069, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 45899 + }, + { + "epoch": 0.459, + "grad_norm": 0.7660464748470878, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 45900 + }, + { + "epoch": 0.45901, + "grad_norm": 0.7468716556967704, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 45901 + }, + { + "epoch": 0.45902, + "grad_norm": 0.7587873582283127, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 45902 + }, + { + "epoch": 0.45903, + "grad_norm": 0.8585147510990933, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 45903 + }, + { + "epoch": 0.45904, + "grad_norm": 1.010376761732812, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 45904 + }, + { + "epoch": 0.45905, + "grad_norm": 1.1115729653378135, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 45905 + }, + { + "epoch": 0.45906, + "grad_norm": 0.878199080569954, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 45906 + }, + { + "epoch": 0.45907, + "grad_norm": 0.7470253528523036, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 45907 + }, + { + "epoch": 0.45908, + "grad_norm": 0.6487757629799854, + "learning_rate": 0.003, + "loss": 4.0016, + "step": 45908 + }, + { + "epoch": 0.45909, + "grad_norm": 0.6801293405566895, + "learning_rate": 0.003, + "loss": 3.971, + "step": 45909 + }, + { + "epoch": 0.4591, + "grad_norm": 0.6916928977527351, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 45910 + }, + { + "epoch": 0.45911, + "grad_norm": 0.76615286504268, + "learning_rate": 0.003, + "loss": 3.995, + "step": 45911 + }, + { + "epoch": 0.45912, + "grad_norm": 0.8795425856285417, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 45912 + }, + { + "epoch": 0.45913, + "grad_norm": 0.9890875080012677, + "learning_rate": 0.003, + "loss": 3.9951, + "step": 45913 + }, + { + "epoch": 0.45914, + "grad_norm": 1.108869514789031, + "learning_rate": 0.003, + "loss": 4.0613, + "step": 45914 + }, + { + "epoch": 0.45915, + "grad_norm": 0.9861002437858808, + "learning_rate": 0.003, + "loss": 4.006, + "step": 45915 + }, + { + "epoch": 0.45916, + "grad_norm": 1.0145630279158864, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 45916 + }, + { + "epoch": 0.45917, + "grad_norm": 0.9299686814521305, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 45917 + }, + { + "epoch": 0.45918, + "grad_norm": 0.8570444248684954, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 45918 + }, + { + "epoch": 0.45919, + "grad_norm": 0.759308219999056, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 45919 + }, + { + "epoch": 0.4592, + "grad_norm": 0.6973386654306825, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 45920 + }, + { + "epoch": 0.45921, + "grad_norm": 0.6979460299281489, + "learning_rate": 0.003, + "loss": 4.044, + "step": 45921 + }, + { + "epoch": 0.45922, + "grad_norm": 0.743645945615913, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 45922 + }, + { + "epoch": 0.45923, + "grad_norm": 0.6951605094556949, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 45923 + }, + { + "epoch": 0.45924, + "grad_norm": 0.6850385190273786, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 45924 + }, + { + "epoch": 0.45925, + "grad_norm": 0.7560126443494505, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 45925 + }, + { + "epoch": 0.45926, + "grad_norm": 0.910254614527342, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 45926 + }, + { + "epoch": 0.45927, + "grad_norm": 1.0398111820037699, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 45927 + }, + { + "epoch": 0.45928, + "grad_norm": 0.9662161260463238, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 45928 + }, + { + "epoch": 0.45929, + "grad_norm": 0.8811043113182766, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 45929 + }, + { + "epoch": 0.4593, + "grad_norm": 0.844375457373046, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 45930 + }, + { + "epoch": 0.45931, + "grad_norm": 0.7523577249627031, + "learning_rate": 0.003, + "loss": 4.053, + "step": 45931 + }, + { + "epoch": 0.45932, + "grad_norm": 0.7795624966794473, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 45932 + }, + { + "epoch": 0.45933, + "grad_norm": 0.7358512638930544, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 45933 + }, + { + "epoch": 0.45934, + "grad_norm": 0.7721807471181563, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 45934 + }, + { + "epoch": 0.45935, + "grad_norm": 0.7876761037689769, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 45935 + }, + { + "epoch": 0.45936, + "grad_norm": 0.7701436785479491, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 45936 + }, + { + "epoch": 0.45937, + "grad_norm": 0.7621389124086476, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 45937 + }, + { + "epoch": 0.45938, + "grad_norm": 0.8173134384161513, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 45938 + }, + { + "epoch": 0.45939, + "grad_norm": 0.9445634214753815, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 45939 + }, + { + "epoch": 0.4594, + "grad_norm": 0.9514895376409607, + "learning_rate": 0.003, + "loss": 4.02, + "step": 45940 + }, + { + "epoch": 0.45941, + "grad_norm": 0.9279427976804395, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 45941 + }, + { + "epoch": 0.45942, + "grad_norm": 0.9835341180626104, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 45942 + }, + { + "epoch": 0.45943, + "grad_norm": 0.8967045225460442, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 45943 + }, + { + "epoch": 0.45944, + "grad_norm": 1.0177964676596847, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 45944 + }, + { + "epoch": 0.45945, + "grad_norm": 1.1312171113924319, + "learning_rate": 0.003, + "loss": 4.0461, + "step": 45945 + }, + { + "epoch": 0.45946, + "grad_norm": 0.7781595134869311, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 45946 + }, + { + "epoch": 0.45947, + "grad_norm": 0.8239567921369834, + "learning_rate": 0.003, + "loss": 4.065, + "step": 45947 + }, + { + "epoch": 0.45948, + "grad_norm": 0.8260287458102293, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 45948 + }, + { + "epoch": 0.45949, + "grad_norm": 0.874742829908873, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 45949 + }, + { + "epoch": 0.4595, + "grad_norm": 1.0727745529661963, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 45950 + }, + { + "epoch": 0.45951, + "grad_norm": 1.2381959903814523, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 45951 + }, + { + "epoch": 0.45952, + "grad_norm": 0.8033226759672965, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 45952 + }, + { + "epoch": 0.45953, + "grad_norm": 0.8119464781763498, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 45953 + }, + { + "epoch": 0.45954, + "grad_norm": 0.7972601370398406, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 45954 + }, + { + "epoch": 0.45955, + "grad_norm": 0.9508692036317131, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 45955 + }, + { + "epoch": 0.45956, + "grad_norm": 0.9651098253276503, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 45956 + }, + { + "epoch": 0.45957, + "grad_norm": 0.8418850937653181, + "learning_rate": 0.003, + "loss": 4.043, + "step": 45957 + }, + { + "epoch": 0.45958, + "grad_norm": 0.7348536305744401, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 45958 + }, + { + "epoch": 0.45959, + "grad_norm": 0.6405247819502439, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 45959 + }, + { + "epoch": 0.4596, + "grad_norm": 0.7323529063184906, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 45960 + }, + { + "epoch": 0.45961, + "grad_norm": 0.794564372904993, + "learning_rate": 0.003, + "loss": 4.008, + "step": 45961 + }, + { + "epoch": 0.45962, + "grad_norm": 0.8105267826097204, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 45962 + }, + { + "epoch": 0.45963, + "grad_norm": 0.8544013444636326, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 45963 + }, + { + "epoch": 0.45964, + "grad_norm": 0.9270317407360094, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 45964 + }, + { + "epoch": 0.45965, + "grad_norm": 1.0257649523905126, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 45965 + }, + { + "epoch": 0.45966, + "grad_norm": 0.9479461644486248, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 45966 + }, + { + "epoch": 0.45967, + "grad_norm": 0.9985151516824355, + "learning_rate": 0.003, + "loss": 4.062, + "step": 45967 + }, + { + "epoch": 0.45968, + "grad_norm": 0.9489944657741631, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 45968 + }, + { + "epoch": 0.45969, + "grad_norm": 0.8239961151605136, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 45969 + }, + { + "epoch": 0.4597, + "grad_norm": 0.7378639391873214, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 45970 + }, + { + "epoch": 0.45971, + "grad_norm": 0.744703678400459, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 45971 + }, + { + "epoch": 0.45972, + "grad_norm": 0.6661090136700727, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 45972 + }, + { + "epoch": 0.45973, + "grad_norm": 0.5690321870003924, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 45973 + }, + { + "epoch": 0.45974, + "grad_norm": 0.5914876639976845, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 45974 + }, + { + "epoch": 0.45975, + "grad_norm": 0.6894238486257898, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 45975 + }, + { + "epoch": 0.45976, + "grad_norm": 0.795394890287583, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 45976 + }, + { + "epoch": 0.45977, + "grad_norm": 0.9231845028049535, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 45977 + }, + { + "epoch": 0.45978, + "grad_norm": 0.884998831928674, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 45978 + }, + { + "epoch": 0.45979, + "grad_norm": 0.8731328412704323, + "learning_rate": 0.003, + "loss": 4.014, + "step": 45979 + }, + { + "epoch": 0.4598, + "grad_norm": 0.9341304982727942, + "learning_rate": 0.003, + "loss": 3.9992, + "step": 45980 + }, + { + "epoch": 0.45981, + "grad_norm": 1.0464392694513864, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 45981 + }, + { + "epoch": 0.45982, + "grad_norm": 0.8794439937022922, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 45982 + }, + { + "epoch": 0.45983, + "grad_norm": 0.7070054507292762, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 45983 + }, + { + "epoch": 0.45984, + "grad_norm": 0.6754807010021519, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 45984 + }, + { + "epoch": 0.45985, + "grad_norm": 0.7023903751951396, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 45985 + }, + { + "epoch": 0.45986, + "grad_norm": 0.6402757510527911, + "learning_rate": 0.003, + "loss": 3.9957, + "step": 45986 + }, + { + "epoch": 0.45987, + "grad_norm": 0.6502910779281191, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 45987 + }, + { + "epoch": 0.45988, + "grad_norm": 0.6813489506555198, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 45988 + }, + { + "epoch": 0.45989, + "grad_norm": 0.7567158756248097, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 45989 + }, + { + "epoch": 0.4599, + "grad_norm": 0.927967785814004, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 45990 + }, + { + "epoch": 0.45991, + "grad_norm": 1.0221278907037115, + "learning_rate": 0.003, + "loss": 4.0344, + "step": 45991 + }, + { + "epoch": 0.45992, + "grad_norm": 0.979841466573488, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 45992 + }, + { + "epoch": 0.45993, + "grad_norm": 0.9250575106167268, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 45993 + }, + { + "epoch": 0.45994, + "grad_norm": 0.9010658854325054, + "learning_rate": 0.003, + "loss": 4.0706, + "step": 45994 + }, + { + "epoch": 0.45995, + "grad_norm": 0.9575603654124396, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 45995 + }, + { + "epoch": 0.45996, + "grad_norm": 0.9464278579695817, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 45996 + }, + { + "epoch": 0.45997, + "grad_norm": 0.8082048645609452, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 45997 + }, + { + "epoch": 0.45998, + "grad_norm": 0.7561264220069237, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 45998 + }, + { + "epoch": 0.45999, + "grad_norm": 0.7997397370992118, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 45999 + }, + { + "epoch": 0.46, + "grad_norm": 1.0164695214171413, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 46000 + }, + { + "epoch": 0.46001, + "grad_norm": 1.1949048865029968, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 46001 + }, + { + "epoch": 0.46002, + "grad_norm": 1.0051062974394707, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 46002 + }, + { + "epoch": 0.46003, + "grad_norm": 1.2430232444581708, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 46003 + }, + { + "epoch": 0.46004, + "grad_norm": 0.859902909339549, + "learning_rate": 0.003, + "loss": 4.0607, + "step": 46004 + }, + { + "epoch": 0.46005, + "grad_norm": 0.8437936141364527, + "learning_rate": 0.003, + "loss": 4.035, + "step": 46005 + }, + { + "epoch": 0.46006, + "grad_norm": 0.8408726306369416, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 46006 + }, + { + "epoch": 0.46007, + "grad_norm": 0.8945088762026552, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 46007 + }, + { + "epoch": 0.46008, + "grad_norm": 1.0282607045603174, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 46008 + }, + { + "epoch": 0.46009, + "grad_norm": 1.0143614173651294, + "learning_rate": 0.003, + "loss": 4.065, + "step": 46009 + }, + { + "epoch": 0.4601, + "grad_norm": 0.9530607989066668, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 46010 + }, + { + "epoch": 0.46011, + "grad_norm": 0.8647286089160091, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 46011 + }, + { + "epoch": 0.46012, + "grad_norm": 0.7029830132932446, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 46012 + }, + { + "epoch": 0.46013, + "grad_norm": 0.5883689064715342, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 46013 + }, + { + "epoch": 0.46014, + "grad_norm": 0.5967324031037974, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 46014 + }, + { + "epoch": 0.46015, + "grad_norm": 0.7179889970594677, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 46015 + }, + { + "epoch": 0.46016, + "grad_norm": 0.8584858582535756, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 46016 + }, + { + "epoch": 0.46017, + "grad_norm": 0.8631356995647745, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 46017 + }, + { + "epoch": 0.46018, + "grad_norm": 0.7812859250586637, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 46018 + }, + { + "epoch": 0.46019, + "grad_norm": 0.7343071842726564, + "learning_rate": 0.003, + "loss": 4.0601, + "step": 46019 + }, + { + "epoch": 0.4602, + "grad_norm": 0.7462634845757877, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 46020 + }, + { + "epoch": 0.46021, + "grad_norm": 0.7054321956198001, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 46021 + }, + { + "epoch": 0.46022, + "grad_norm": 0.6896697690089657, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 46022 + }, + { + "epoch": 0.46023, + "grad_norm": 0.6573700963547157, + "learning_rate": 0.003, + "loss": 3.9804, + "step": 46023 + }, + { + "epoch": 0.46024, + "grad_norm": 0.7228049084633488, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 46024 + }, + { + "epoch": 0.46025, + "grad_norm": 0.7187108465834939, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 46025 + }, + { + "epoch": 0.46026, + "grad_norm": 0.6872784962878212, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 46026 + }, + { + "epoch": 0.46027, + "grad_norm": 0.6760181386449572, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 46027 + }, + { + "epoch": 0.46028, + "grad_norm": 0.6926920952971933, + "learning_rate": 0.003, + "loss": 4.0161, + "step": 46028 + }, + { + "epoch": 0.46029, + "grad_norm": 0.5790438742996634, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 46029 + }, + { + "epoch": 0.4603, + "grad_norm": 0.5914874027096777, + "learning_rate": 0.003, + "loss": 3.9969, + "step": 46030 + }, + { + "epoch": 0.46031, + "grad_norm": 0.6568690229474702, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 46031 + }, + { + "epoch": 0.46032, + "grad_norm": 0.6370999313851703, + "learning_rate": 0.003, + "loss": 3.9877, + "step": 46032 + }, + { + "epoch": 0.46033, + "grad_norm": 0.7725480733376371, + "learning_rate": 0.003, + "loss": 3.9766, + "step": 46033 + }, + { + "epoch": 0.46034, + "grad_norm": 1.231909908652787, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 46034 + }, + { + "epoch": 0.46035, + "grad_norm": 0.8257771697893617, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 46035 + }, + { + "epoch": 0.46036, + "grad_norm": 0.6484714389321707, + "learning_rate": 0.003, + "loss": 3.9991, + "step": 46036 + }, + { + "epoch": 0.46037, + "grad_norm": 0.6760945565390964, + "learning_rate": 0.003, + "loss": 3.9885, + "step": 46037 + }, + { + "epoch": 0.46038, + "grad_norm": 0.7176292958918139, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 46038 + }, + { + "epoch": 0.46039, + "grad_norm": 0.8450559507440704, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 46039 + }, + { + "epoch": 0.4604, + "grad_norm": 1.1217319794316327, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 46040 + }, + { + "epoch": 0.46041, + "grad_norm": 1.0820480847783633, + "learning_rate": 0.003, + "loss": 4.05, + "step": 46041 + }, + { + "epoch": 0.46042, + "grad_norm": 1.0769144570359002, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 46042 + }, + { + "epoch": 0.46043, + "grad_norm": 0.9367057642608076, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 46043 + }, + { + "epoch": 0.46044, + "grad_norm": 0.9007416148811478, + "learning_rate": 0.003, + "loss": 4.041, + "step": 46044 + }, + { + "epoch": 0.46045, + "grad_norm": 0.9291259558047138, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 46045 + }, + { + "epoch": 0.46046, + "grad_norm": 1.0406698134958023, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 46046 + }, + { + "epoch": 0.46047, + "grad_norm": 0.9724291975054772, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 46047 + }, + { + "epoch": 0.46048, + "grad_norm": 1.0360441851158826, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 46048 + }, + { + "epoch": 0.46049, + "grad_norm": 0.96002715814896, + "learning_rate": 0.003, + "loss": 4.0614, + "step": 46049 + }, + { + "epoch": 0.4605, + "grad_norm": 1.0250654644711183, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 46050 + }, + { + "epoch": 0.46051, + "grad_norm": 1.0637413130128637, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 46051 + }, + { + "epoch": 0.46052, + "grad_norm": 0.9912586699979113, + "learning_rate": 0.003, + "loss": 4.0774, + "step": 46052 + }, + { + "epoch": 0.46053, + "grad_norm": 0.987076162606616, + "learning_rate": 0.003, + "loss": 4.044, + "step": 46053 + }, + { + "epoch": 0.46054, + "grad_norm": 1.0308502747658168, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 46054 + }, + { + "epoch": 0.46055, + "grad_norm": 0.9823725040500637, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 46055 + }, + { + "epoch": 0.46056, + "grad_norm": 0.9331954927877698, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 46056 + }, + { + "epoch": 0.46057, + "grad_norm": 1.1355389084161045, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 46057 + }, + { + "epoch": 0.46058, + "grad_norm": 1.0183421483843744, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 46058 + }, + { + "epoch": 0.46059, + "grad_norm": 0.8535627288714498, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 46059 + }, + { + "epoch": 0.4606, + "grad_norm": 0.7708731772300144, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 46060 + }, + { + "epoch": 0.46061, + "grad_norm": 0.7542729842137759, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 46061 + }, + { + "epoch": 0.46062, + "grad_norm": 0.6692755053902189, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 46062 + }, + { + "epoch": 0.46063, + "grad_norm": 0.5443827238697168, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 46063 + }, + { + "epoch": 0.46064, + "grad_norm": 0.519051251342372, + "learning_rate": 0.003, + "loss": 4.015, + "step": 46064 + }, + { + "epoch": 0.46065, + "grad_norm": 0.5844162925184645, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 46065 + }, + { + "epoch": 0.46066, + "grad_norm": 0.5891460286598565, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 46066 + }, + { + "epoch": 0.46067, + "grad_norm": 0.6281846716636892, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 46067 + }, + { + "epoch": 0.46068, + "grad_norm": 0.7315039501857639, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 46068 + }, + { + "epoch": 0.46069, + "grad_norm": 0.7999542454215974, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 46069 + }, + { + "epoch": 0.4607, + "grad_norm": 0.9512124986099822, + "learning_rate": 0.003, + "loss": 3.9744, + "step": 46070 + }, + { + "epoch": 0.46071, + "grad_norm": 1.0631070410129873, + "learning_rate": 0.003, + "loss": 3.9905, + "step": 46071 + }, + { + "epoch": 0.46072, + "grad_norm": 0.9201033994046461, + "learning_rate": 0.003, + "loss": 4.043, + "step": 46072 + }, + { + "epoch": 0.46073, + "grad_norm": 0.8569461443282155, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 46073 + }, + { + "epoch": 0.46074, + "grad_norm": 0.8421822129962998, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 46074 + }, + { + "epoch": 0.46075, + "grad_norm": 0.724585586328339, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 46075 + }, + { + "epoch": 0.46076, + "grad_norm": 0.6086366386468555, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 46076 + }, + { + "epoch": 0.46077, + "grad_norm": 0.6740227618441473, + "learning_rate": 0.003, + "loss": 4.0521, + "step": 46077 + }, + { + "epoch": 0.46078, + "grad_norm": 0.7200338651765199, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 46078 + }, + { + "epoch": 0.46079, + "grad_norm": 0.7420997639219644, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 46079 + }, + { + "epoch": 0.4608, + "grad_norm": 0.7413070141820646, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 46080 + }, + { + "epoch": 0.46081, + "grad_norm": 0.7071875062764178, + "learning_rate": 0.003, + "loss": 3.9941, + "step": 46081 + }, + { + "epoch": 0.46082, + "grad_norm": 0.6270044881128825, + "learning_rate": 0.003, + "loss": 4.037, + "step": 46082 + }, + { + "epoch": 0.46083, + "grad_norm": 0.67654227657279, + "learning_rate": 0.003, + "loss": 4.023, + "step": 46083 + }, + { + "epoch": 0.46084, + "grad_norm": 0.7656807753038063, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 46084 + }, + { + "epoch": 0.46085, + "grad_norm": 0.8585496452178103, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 46085 + }, + { + "epoch": 0.46086, + "grad_norm": 0.9690087058536601, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 46086 + }, + { + "epoch": 0.46087, + "grad_norm": 1.0039074915252906, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 46087 + }, + { + "epoch": 0.46088, + "grad_norm": 1.0049255177521674, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 46088 + }, + { + "epoch": 0.46089, + "grad_norm": 0.8873942410228447, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 46089 + }, + { + "epoch": 0.4609, + "grad_norm": 0.7270038800101601, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 46090 + }, + { + "epoch": 0.46091, + "grad_norm": 0.7154181726602783, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 46091 + }, + { + "epoch": 0.46092, + "grad_norm": 0.8723470313017301, + "learning_rate": 0.003, + "loss": 4.0261, + "step": 46092 + }, + { + "epoch": 0.46093, + "grad_norm": 0.8029671826504419, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 46093 + }, + { + "epoch": 0.46094, + "grad_norm": 0.7109722387304273, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 46094 + }, + { + "epoch": 0.46095, + "grad_norm": 0.635436554896374, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 46095 + }, + { + "epoch": 0.46096, + "grad_norm": 0.6898639037239844, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 46096 + }, + { + "epoch": 0.46097, + "grad_norm": 0.7159022389751776, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 46097 + }, + { + "epoch": 0.46098, + "grad_norm": 0.8293673592444509, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 46098 + }, + { + "epoch": 0.46099, + "grad_norm": 0.8941122151563946, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 46099 + }, + { + "epoch": 0.461, + "grad_norm": 0.720470683003848, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 46100 + }, + { + "epoch": 0.46101, + "grad_norm": 0.599386416105854, + "learning_rate": 0.003, + "loss": 3.9989, + "step": 46101 + }, + { + "epoch": 0.46102, + "grad_norm": 0.6307295548323012, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 46102 + }, + { + "epoch": 0.46103, + "grad_norm": 0.7850036798297465, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 46103 + }, + { + "epoch": 0.46104, + "grad_norm": 1.1306483592781025, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 46104 + }, + { + "epoch": 0.46105, + "grad_norm": 1.093031282231485, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 46105 + }, + { + "epoch": 0.46106, + "grad_norm": 0.9844102461657637, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 46106 + }, + { + "epoch": 0.46107, + "grad_norm": 0.8613586465250785, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 46107 + }, + { + "epoch": 0.46108, + "grad_norm": 0.8721669925274296, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 46108 + }, + { + "epoch": 0.46109, + "grad_norm": 0.9780269148870169, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 46109 + }, + { + "epoch": 0.4611, + "grad_norm": 1.1423292390880588, + "learning_rate": 0.003, + "loss": 4.0557, + "step": 46110 + }, + { + "epoch": 0.46111, + "grad_norm": 0.9259047708936748, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 46111 + }, + { + "epoch": 0.46112, + "grad_norm": 0.8000032069206896, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 46112 + }, + { + "epoch": 0.46113, + "grad_norm": 0.670647806350879, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 46113 + }, + { + "epoch": 0.46114, + "grad_norm": 0.6103725635458227, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 46114 + }, + { + "epoch": 0.46115, + "grad_norm": 0.6443528624391623, + "learning_rate": 0.003, + "loss": 4.048, + "step": 46115 + }, + { + "epoch": 0.46116, + "grad_norm": 0.6983357333494633, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 46116 + }, + { + "epoch": 0.46117, + "grad_norm": 0.7171205267446449, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 46117 + }, + { + "epoch": 0.46118, + "grad_norm": 0.7343897986707858, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 46118 + }, + { + "epoch": 0.46119, + "grad_norm": 0.7601381734329841, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 46119 + }, + { + "epoch": 0.4612, + "grad_norm": 0.8930033007384656, + "learning_rate": 0.003, + "loss": 4.05, + "step": 46120 + }, + { + "epoch": 0.46121, + "grad_norm": 1.1814838562445722, + "learning_rate": 0.003, + "loss": 4.0776, + "step": 46121 + }, + { + "epoch": 0.46122, + "grad_norm": 1.0351765010694074, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 46122 + }, + { + "epoch": 0.46123, + "grad_norm": 0.9899904690962861, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 46123 + }, + { + "epoch": 0.46124, + "grad_norm": 0.9253500731856412, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 46124 + }, + { + "epoch": 0.46125, + "grad_norm": 0.944863498630749, + "learning_rate": 0.003, + "loss": 4.0059, + "step": 46125 + }, + { + "epoch": 0.46126, + "grad_norm": 0.9389777569915918, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 46126 + }, + { + "epoch": 0.46127, + "grad_norm": 0.9406846757982759, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 46127 + }, + { + "epoch": 0.46128, + "grad_norm": 1.0005382102977578, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 46128 + }, + { + "epoch": 0.46129, + "grad_norm": 1.0846410179512052, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 46129 + }, + { + "epoch": 0.4613, + "grad_norm": 1.0616168186191781, + "learning_rate": 0.003, + "loss": 4.0693, + "step": 46130 + }, + { + "epoch": 0.46131, + "grad_norm": 0.9568015036318829, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 46131 + }, + { + "epoch": 0.46132, + "grad_norm": 0.8743429010726312, + "learning_rate": 0.003, + "loss": 4.041, + "step": 46132 + }, + { + "epoch": 0.46133, + "grad_norm": 0.8457603181632779, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 46133 + }, + { + "epoch": 0.46134, + "grad_norm": 0.929676666565223, + "learning_rate": 0.003, + "loss": 4.073, + "step": 46134 + }, + { + "epoch": 0.46135, + "grad_norm": 0.9476897967551962, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 46135 + }, + { + "epoch": 0.46136, + "grad_norm": 0.9303929027874654, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 46136 + }, + { + "epoch": 0.46137, + "grad_norm": 0.87760248217994, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 46137 + }, + { + "epoch": 0.46138, + "grad_norm": 0.8304946254061273, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 46138 + }, + { + "epoch": 0.46139, + "grad_norm": 0.720872597979093, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 46139 + }, + { + "epoch": 0.4614, + "grad_norm": 0.6898766591184963, + "learning_rate": 0.003, + "loss": 4.047, + "step": 46140 + }, + { + "epoch": 0.46141, + "grad_norm": 0.7376382095241647, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 46141 + }, + { + "epoch": 0.46142, + "grad_norm": 0.7902686972098957, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 46142 + }, + { + "epoch": 0.46143, + "grad_norm": 0.8875576132180095, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 46143 + }, + { + "epoch": 0.46144, + "grad_norm": 1.064812790406111, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 46144 + }, + { + "epoch": 0.46145, + "grad_norm": 0.7573921273766229, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 46145 + }, + { + "epoch": 0.46146, + "grad_norm": 0.5843144271543637, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 46146 + }, + { + "epoch": 0.46147, + "grad_norm": 0.7211663187714427, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 46147 + }, + { + "epoch": 0.46148, + "grad_norm": 0.8738074983513608, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 46148 + }, + { + "epoch": 0.46149, + "grad_norm": 0.9902235286951027, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 46149 + }, + { + "epoch": 0.4615, + "grad_norm": 1.0889913930272124, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 46150 + }, + { + "epoch": 0.46151, + "grad_norm": 0.7396269220735584, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 46151 + }, + { + "epoch": 0.46152, + "grad_norm": 0.6561306193227476, + "learning_rate": 0.003, + "loss": 4.033, + "step": 46152 + }, + { + "epoch": 0.46153, + "grad_norm": 0.6845777289904528, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 46153 + }, + { + "epoch": 0.46154, + "grad_norm": 0.7794461729480893, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 46154 + }, + { + "epoch": 0.46155, + "grad_norm": 0.7343079955611338, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 46155 + }, + { + "epoch": 0.46156, + "grad_norm": 0.8247845817735421, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 46156 + }, + { + "epoch": 0.46157, + "grad_norm": 0.8918704041576482, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 46157 + }, + { + "epoch": 0.46158, + "grad_norm": 0.930256327355946, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 46158 + }, + { + "epoch": 0.46159, + "grad_norm": 1.0421060956203447, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 46159 + }, + { + "epoch": 0.4616, + "grad_norm": 1.2471646835598016, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 46160 + }, + { + "epoch": 0.46161, + "grad_norm": 0.7614227081207788, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 46161 + }, + { + "epoch": 0.46162, + "grad_norm": 0.7620845724472116, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 46162 + }, + { + "epoch": 0.46163, + "grad_norm": 0.9129099476270683, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 46163 + }, + { + "epoch": 0.46164, + "grad_norm": 0.9540675265012689, + "learning_rate": 0.003, + "loss": 4.0771, + "step": 46164 + }, + { + "epoch": 0.46165, + "grad_norm": 1.1489061313689861, + "learning_rate": 0.003, + "loss": 4.0575, + "step": 46165 + }, + { + "epoch": 0.46166, + "grad_norm": 0.8347164590768146, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 46166 + }, + { + "epoch": 0.46167, + "grad_norm": 0.7258947916299054, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 46167 + }, + { + "epoch": 0.46168, + "grad_norm": 0.6671957349998189, + "learning_rate": 0.003, + "loss": 4.0007, + "step": 46168 + }, + { + "epoch": 0.46169, + "grad_norm": 0.5671200935850844, + "learning_rate": 0.003, + "loss": 4.033, + "step": 46169 + }, + { + "epoch": 0.4617, + "grad_norm": 0.6332797802256224, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 46170 + }, + { + "epoch": 0.46171, + "grad_norm": 0.7773997353713543, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 46171 + }, + { + "epoch": 0.46172, + "grad_norm": 1.0806019427754086, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 46172 + }, + { + "epoch": 0.46173, + "grad_norm": 1.0537364036928245, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 46173 + }, + { + "epoch": 0.46174, + "grad_norm": 0.896962508624949, + "learning_rate": 0.003, + "loss": 4.014, + "step": 46174 + }, + { + "epoch": 0.46175, + "grad_norm": 0.8646200874447173, + "learning_rate": 0.003, + "loss": 3.9825, + "step": 46175 + }, + { + "epoch": 0.46176, + "grad_norm": 0.7570128587318332, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 46176 + }, + { + "epoch": 0.46177, + "grad_norm": 0.777611723737064, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 46177 + }, + { + "epoch": 0.46178, + "grad_norm": 0.8255319801870853, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 46178 + }, + { + "epoch": 0.46179, + "grad_norm": 0.8496195885189697, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 46179 + }, + { + "epoch": 0.4618, + "grad_norm": 0.8222318037585575, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 46180 + }, + { + "epoch": 0.46181, + "grad_norm": 0.736431304080499, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 46181 + }, + { + "epoch": 0.46182, + "grad_norm": 0.694218461601657, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 46182 + }, + { + "epoch": 0.46183, + "grad_norm": 0.6968411803276778, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 46183 + }, + { + "epoch": 0.46184, + "grad_norm": 0.8276245815530725, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 46184 + }, + { + "epoch": 0.46185, + "grad_norm": 0.9662875762873242, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 46185 + }, + { + "epoch": 0.46186, + "grad_norm": 0.9968131440180086, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 46186 + }, + { + "epoch": 0.46187, + "grad_norm": 0.9190120776921169, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 46187 + }, + { + "epoch": 0.46188, + "grad_norm": 0.9428429761190321, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 46188 + }, + { + "epoch": 0.46189, + "grad_norm": 0.93914125218085, + "learning_rate": 0.003, + "loss": 4.028, + "step": 46189 + }, + { + "epoch": 0.4619, + "grad_norm": 0.9864736681604441, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 46190 + }, + { + "epoch": 0.46191, + "grad_norm": 1.009201113519832, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 46191 + }, + { + "epoch": 0.46192, + "grad_norm": 1.0387276596508148, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 46192 + }, + { + "epoch": 0.46193, + "grad_norm": 0.8821475172071436, + "learning_rate": 0.003, + "loss": 4.034, + "step": 46193 + }, + { + "epoch": 0.46194, + "grad_norm": 0.8250294528627273, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 46194 + }, + { + "epoch": 0.46195, + "grad_norm": 0.7789575002368206, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 46195 + }, + { + "epoch": 0.46196, + "grad_norm": 0.8543957475068908, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 46196 + }, + { + "epoch": 0.46197, + "grad_norm": 0.6943304952805267, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 46197 + }, + { + "epoch": 0.46198, + "grad_norm": 0.76467341118441, + "learning_rate": 0.003, + "loss": 4.0766, + "step": 46198 + }, + { + "epoch": 0.46199, + "grad_norm": 0.8639360506393432, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 46199 + }, + { + "epoch": 0.462, + "grad_norm": 0.7967473818474088, + "learning_rate": 0.003, + "loss": 4.0205, + "step": 46200 + }, + { + "epoch": 0.46201, + "grad_norm": 0.7403124488146081, + "learning_rate": 0.003, + "loss": 4.0238, + "step": 46201 + }, + { + "epoch": 0.46202, + "grad_norm": 0.8185042629992522, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 46202 + }, + { + "epoch": 0.46203, + "grad_norm": 1.0751482612149554, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 46203 + }, + { + "epoch": 0.46204, + "grad_norm": 1.2553132327134384, + "learning_rate": 0.003, + "loss": 4.0484, + "step": 46204 + }, + { + "epoch": 0.46205, + "grad_norm": 0.7381182009640882, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 46205 + }, + { + "epoch": 0.46206, + "grad_norm": 0.6921867356376514, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 46206 + }, + { + "epoch": 0.46207, + "grad_norm": 0.742787251186372, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 46207 + }, + { + "epoch": 0.46208, + "grad_norm": 0.6990201025296802, + "learning_rate": 0.003, + "loss": 4.0038, + "step": 46208 + }, + { + "epoch": 0.46209, + "grad_norm": 0.6604988402280135, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 46209 + }, + { + "epoch": 0.4621, + "grad_norm": 0.761318594470623, + "learning_rate": 0.003, + "loss": 3.9994, + "step": 46210 + }, + { + "epoch": 0.46211, + "grad_norm": 0.9229558301537862, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 46211 + }, + { + "epoch": 0.46212, + "grad_norm": 0.9713680040337206, + "learning_rate": 0.003, + "loss": 4.071, + "step": 46212 + }, + { + "epoch": 0.46213, + "grad_norm": 1.083846013131475, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 46213 + }, + { + "epoch": 0.46214, + "grad_norm": 0.9032147732860963, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 46214 + }, + { + "epoch": 0.46215, + "grad_norm": 0.9562134739452067, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 46215 + }, + { + "epoch": 0.46216, + "grad_norm": 1.0537195365438723, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 46216 + }, + { + "epoch": 0.46217, + "grad_norm": 0.7840736835619311, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 46217 + }, + { + "epoch": 0.46218, + "grad_norm": 0.6413996035734468, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 46218 + }, + { + "epoch": 0.46219, + "grad_norm": 0.5760553512989451, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 46219 + }, + { + "epoch": 0.4622, + "grad_norm": 0.5507676384586936, + "learning_rate": 0.003, + "loss": 4.017, + "step": 46220 + }, + { + "epoch": 0.46221, + "grad_norm": 0.5543095123515853, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 46221 + }, + { + "epoch": 0.46222, + "grad_norm": 0.5428458290585713, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 46222 + }, + { + "epoch": 0.46223, + "grad_norm": 0.6801547712311382, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 46223 + }, + { + "epoch": 0.46224, + "grad_norm": 0.8911292916270389, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 46224 + }, + { + "epoch": 0.46225, + "grad_norm": 1.1445926661449481, + "learning_rate": 0.003, + "loss": 4.0211, + "step": 46225 + }, + { + "epoch": 0.46226, + "grad_norm": 0.7999124599511733, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 46226 + }, + { + "epoch": 0.46227, + "grad_norm": 0.6843991132145855, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 46227 + }, + { + "epoch": 0.46228, + "grad_norm": 0.718567043408044, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 46228 + }, + { + "epoch": 0.46229, + "grad_norm": 0.7088190259871461, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 46229 + }, + { + "epoch": 0.4623, + "grad_norm": 0.6205546408193884, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 46230 + }, + { + "epoch": 0.46231, + "grad_norm": 0.7199907342157714, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 46231 + }, + { + "epoch": 0.46232, + "grad_norm": 0.8407727925694753, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 46232 + }, + { + "epoch": 0.46233, + "grad_norm": 0.8780643987241786, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 46233 + }, + { + "epoch": 0.46234, + "grad_norm": 0.842089153751086, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 46234 + }, + { + "epoch": 0.46235, + "grad_norm": 0.7419327163980562, + "learning_rate": 0.003, + "loss": 4.013, + "step": 46235 + }, + { + "epoch": 0.46236, + "grad_norm": 0.7362624687639888, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 46236 + }, + { + "epoch": 0.46237, + "grad_norm": 0.8304478925125051, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 46237 + }, + { + "epoch": 0.46238, + "grad_norm": 1.0739561988869533, + "learning_rate": 0.003, + "loss": 4.03, + "step": 46238 + }, + { + "epoch": 0.46239, + "grad_norm": 1.0780049672607506, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 46239 + }, + { + "epoch": 0.4624, + "grad_norm": 0.9357510108264264, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 46240 + }, + { + "epoch": 0.46241, + "grad_norm": 1.0198323095893675, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 46241 + }, + { + "epoch": 0.46242, + "grad_norm": 1.072207684006313, + "learning_rate": 0.003, + "loss": 4.04, + "step": 46242 + }, + { + "epoch": 0.46243, + "grad_norm": 1.0724021274940727, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 46243 + }, + { + "epoch": 0.46244, + "grad_norm": 0.908401897638193, + "learning_rate": 0.003, + "loss": 4.015, + "step": 46244 + }, + { + "epoch": 0.46245, + "grad_norm": 0.7934945600711042, + "learning_rate": 0.003, + "loss": 3.9842, + "step": 46245 + }, + { + "epoch": 0.46246, + "grad_norm": 0.6244466327518918, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 46246 + }, + { + "epoch": 0.46247, + "grad_norm": 0.6734673125793695, + "learning_rate": 0.003, + "loss": 3.9753, + "step": 46247 + }, + { + "epoch": 0.46248, + "grad_norm": 0.6691080512525449, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 46248 + }, + { + "epoch": 0.46249, + "grad_norm": 0.6854659195283286, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 46249 + }, + { + "epoch": 0.4625, + "grad_norm": 0.681153463506551, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 46250 + }, + { + "epoch": 0.46251, + "grad_norm": 0.8114100835551407, + "learning_rate": 0.003, + "loss": 4.0111, + "step": 46251 + }, + { + "epoch": 0.46252, + "grad_norm": 0.9762223188112513, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 46252 + }, + { + "epoch": 0.46253, + "grad_norm": 1.17338730335195, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 46253 + }, + { + "epoch": 0.46254, + "grad_norm": 0.8505465069629373, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 46254 + }, + { + "epoch": 0.46255, + "grad_norm": 0.7182349679602554, + "learning_rate": 0.003, + "loss": 4.046, + "step": 46255 + }, + { + "epoch": 0.46256, + "grad_norm": 0.6268622139897901, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 46256 + }, + { + "epoch": 0.46257, + "grad_norm": 0.648432648331287, + "learning_rate": 0.003, + "loss": 4.024, + "step": 46257 + }, + { + "epoch": 0.46258, + "grad_norm": 0.6754893500371423, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 46258 + }, + { + "epoch": 0.46259, + "grad_norm": 0.7260765126547909, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 46259 + }, + { + "epoch": 0.4626, + "grad_norm": 0.8287167380260924, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 46260 + }, + { + "epoch": 0.46261, + "grad_norm": 1.0044526989721942, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 46261 + }, + { + "epoch": 0.46262, + "grad_norm": 1.0837384329300426, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 46262 + }, + { + "epoch": 0.46263, + "grad_norm": 1.0242041419344226, + "learning_rate": 0.003, + "loss": 4.032, + "step": 46263 + }, + { + "epoch": 0.46264, + "grad_norm": 1.0173512107237843, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 46264 + }, + { + "epoch": 0.46265, + "grad_norm": 0.8221822353912865, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 46265 + }, + { + "epoch": 0.46266, + "grad_norm": 0.762294711231529, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 46266 + }, + { + "epoch": 0.46267, + "grad_norm": 0.8336204740143801, + "learning_rate": 0.003, + "loss": 4.014, + "step": 46267 + }, + { + "epoch": 0.46268, + "grad_norm": 0.8636758141442872, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 46268 + }, + { + "epoch": 0.46269, + "grad_norm": 0.936477689725008, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 46269 + }, + { + "epoch": 0.4627, + "grad_norm": 1.1396853181597904, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 46270 + }, + { + "epoch": 0.46271, + "grad_norm": 1.064988124774106, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 46271 + }, + { + "epoch": 0.46272, + "grad_norm": 0.9017935347417363, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 46272 + }, + { + "epoch": 0.46273, + "grad_norm": 0.7657578931673193, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 46273 + }, + { + "epoch": 0.46274, + "grad_norm": 0.8883462971356677, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 46274 + }, + { + "epoch": 0.46275, + "grad_norm": 1.0790211660788875, + "learning_rate": 0.003, + "loss": 4.0283, + "step": 46275 + }, + { + "epoch": 0.46276, + "grad_norm": 0.8617649451279037, + "learning_rate": 0.003, + "loss": 4.0747, + "step": 46276 + }, + { + "epoch": 0.46277, + "grad_norm": 0.6991888869916422, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 46277 + }, + { + "epoch": 0.46278, + "grad_norm": 0.5964822348556642, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 46278 + }, + { + "epoch": 0.46279, + "grad_norm": 0.6405864385320087, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 46279 + }, + { + "epoch": 0.4628, + "grad_norm": 0.6855074504112286, + "learning_rate": 0.003, + "loss": 3.998, + "step": 46280 + }, + { + "epoch": 0.46281, + "grad_norm": 0.7873747792440758, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 46281 + }, + { + "epoch": 0.46282, + "grad_norm": 0.8295590544268089, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 46282 + }, + { + "epoch": 0.46283, + "grad_norm": 0.8832471315669038, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 46283 + }, + { + "epoch": 0.46284, + "grad_norm": 0.8269823303435527, + "learning_rate": 0.003, + "loss": 4.015, + "step": 46284 + }, + { + "epoch": 0.46285, + "grad_norm": 0.7075391565826896, + "learning_rate": 0.003, + "loss": 3.9915, + "step": 46285 + }, + { + "epoch": 0.46286, + "grad_norm": 0.8405436577557026, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 46286 + }, + { + "epoch": 0.46287, + "grad_norm": 0.9787090471472152, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 46287 + }, + { + "epoch": 0.46288, + "grad_norm": 1.0456048222345893, + "learning_rate": 0.003, + "loss": 4.0371, + "step": 46288 + }, + { + "epoch": 0.46289, + "grad_norm": 0.91838069894333, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 46289 + }, + { + "epoch": 0.4629, + "grad_norm": 0.8175990200966323, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 46290 + }, + { + "epoch": 0.46291, + "grad_norm": 0.8092532648168562, + "learning_rate": 0.003, + "loss": 4.014, + "step": 46291 + }, + { + "epoch": 0.46292, + "grad_norm": 0.8050775469653609, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 46292 + }, + { + "epoch": 0.46293, + "grad_norm": 0.6833549899291522, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 46293 + }, + { + "epoch": 0.46294, + "grad_norm": 0.7541186049806375, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 46294 + }, + { + "epoch": 0.46295, + "grad_norm": 0.7610607438259435, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 46295 + }, + { + "epoch": 0.46296, + "grad_norm": 0.7301731987718261, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 46296 + }, + { + "epoch": 0.46297, + "grad_norm": 0.842556712257868, + "learning_rate": 0.003, + "loss": 4.0647, + "step": 46297 + }, + { + "epoch": 0.46298, + "grad_norm": 0.9089621689124832, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 46298 + }, + { + "epoch": 0.46299, + "grad_norm": 0.914914221527164, + "learning_rate": 0.003, + "loss": 4.057, + "step": 46299 + }, + { + "epoch": 0.463, + "grad_norm": 1.0195605908391105, + "learning_rate": 0.003, + "loss": 4.015, + "step": 46300 + }, + { + "epoch": 0.46301, + "grad_norm": 0.9807370069596412, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 46301 + }, + { + "epoch": 0.46302, + "grad_norm": 1.1280603778884832, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 46302 + }, + { + "epoch": 0.46303, + "grad_norm": 1.0536342356177344, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 46303 + }, + { + "epoch": 0.46304, + "grad_norm": 1.0294428148391115, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 46304 + }, + { + "epoch": 0.46305, + "grad_norm": 0.9439431105081942, + "learning_rate": 0.003, + "loss": 4.0027, + "step": 46305 + }, + { + "epoch": 0.46306, + "grad_norm": 0.8598066884039575, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 46306 + }, + { + "epoch": 0.46307, + "grad_norm": 0.9256266752677262, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 46307 + }, + { + "epoch": 0.46308, + "grad_norm": 0.9528507719153114, + "learning_rate": 0.003, + "loss": 4.0682, + "step": 46308 + }, + { + "epoch": 0.46309, + "grad_norm": 0.9088870088932397, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 46309 + }, + { + "epoch": 0.4631, + "grad_norm": 0.9009724274799096, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 46310 + }, + { + "epoch": 0.46311, + "grad_norm": 0.909652862579712, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 46311 + }, + { + "epoch": 0.46312, + "grad_norm": 0.885623807157337, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 46312 + }, + { + "epoch": 0.46313, + "grad_norm": 0.8815465080962245, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 46313 + }, + { + "epoch": 0.46314, + "grad_norm": 0.7974546370143314, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 46314 + }, + { + "epoch": 0.46315, + "grad_norm": 0.7204469204461, + "learning_rate": 0.003, + "loss": 3.994, + "step": 46315 + }, + { + "epoch": 0.46316, + "grad_norm": 0.7175589928384198, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 46316 + }, + { + "epoch": 0.46317, + "grad_norm": 0.8115451036491405, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 46317 + }, + { + "epoch": 0.46318, + "grad_norm": 0.749500585391513, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 46318 + }, + { + "epoch": 0.46319, + "grad_norm": 0.7606061175035285, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 46319 + }, + { + "epoch": 0.4632, + "grad_norm": 0.8488603047739502, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 46320 + }, + { + "epoch": 0.46321, + "grad_norm": 0.7121328802321754, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 46321 + }, + { + "epoch": 0.46322, + "grad_norm": 0.7339957483206545, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 46322 + }, + { + "epoch": 0.46323, + "grad_norm": 0.74086396712644, + "learning_rate": 0.003, + "loss": 4.053, + "step": 46323 + }, + { + "epoch": 0.46324, + "grad_norm": 0.7398410431144017, + "learning_rate": 0.003, + "loss": 4.042, + "step": 46324 + }, + { + "epoch": 0.46325, + "grad_norm": 0.7494944891856823, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 46325 + }, + { + "epoch": 0.46326, + "grad_norm": 0.8797164499022688, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 46326 + }, + { + "epoch": 0.46327, + "grad_norm": 1.2065250234500056, + "learning_rate": 0.003, + "loss": 4.0046, + "step": 46327 + }, + { + "epoch": 0.46328, + "grad_norm": 1.1422946091376138, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 46328 + }, + { + "epoch": 0.46329, + "grad_norm": 0.8773488677889079, + "learning_rate": 0.003, + "loss": 3.9886, + "step": 46329 + }, + { + "epoch": 0.4633, + "grad_norm": 0.8418951585279226, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 46330 + }, + { + "epoch": 0.46331, + "grad_norm": 0.911952612707433, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 46331 + }, + { + "epoch": 0.46332, + "grad_norm": 0.9359978499635173, + "learning_rate": 0.003, + "loss": 4.003, + "step": 46332 + }, + { + "epoch": 0.46333, + "grad_norm": 0.9441470866997066, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 46333 + }, + { + "epoch": 0.46334, + "grad_norm": 0.951365238063992, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 46334 + }, + { + "epoch": 0.46335, + "grad_norm": 0.8902724728231562, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 46335 + }, + { + "epoch": 0.46336, + "grad_norm": 0.7833695176162409, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 46336 + }, + { + "epoch": 0.46337, + "grad_norm": 0.6597876360647896, + "learning_rate": 0.003, + "loss": 4.0018, + "step": 46337 + }, + { + "epoch": 0.46338, + "grad_norm": 0.5469112366311131, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 46338 + }, + { + "epoch": 0.46339, + "grad_norm": 0.6199779039972891, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 46339 + }, + { + "epoch": 0.4634, + "grad_norm": 0.6491214753925978, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 46340 + }, + { + "epoch": 0.46341, + "grad_norm": 0.8150679686806384, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 46341 + }, + { + "epoch": 0.46342, + "grad_norm": 0.8554631304572534, + "learning_rate": 0.003, + "loss": 4.029, + "step": 46342 + }, + { + "epoch": 0.46343, + "grad_norm": 0.8383461988933065, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 46343 + }, + { + "epoch": 0.46344, + "grad_norm": 0.8682129454252676, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 46344 + }, + { + "epoch": 0.46345, + "grad_norm": 0.9993004526379001, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 46345 + }, + { + "epoch": 0.46346, + "grad_norm": 1.0746651716009785, + "learning_rate": 0.003, + "loss": 4.0554, + "step": 46346 + }, + { + "epoch": 0.46347, + "grad_norm": 0.784789617055859, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 46347 + }, + { + "epoch": 0.46348, + "grad_norm": 0.8057815579366576, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 46348 + }, + { + "epoch": 0.46349, + "grad_norm": 0.8804603854780602, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 46349 + }, + { + "epoch": 0.4635, + "grad_norm": 0.9268815801533254, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 46350 + }, + { + "epoch": 0.46351, + "grad_norm": 0.9831657454235541, + "learning_rate": 0.003, + "loss": 3.9917, + "step": 46351 + }, + { + "epoch": 0.46352, + "grad_norm": 1.1626703656196373, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 46352 + }, + { + "epoch": 0.46353, + "grad_norm": 1.1087273431463862, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 46353 + }, + { + "epoch": 0.46354, + "grad_norm": 0.8533082263516474, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 46354 + }, + { + "epoch": 0.46355, + "grad_norm": 0.7371964763823922, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 46355 + }, + { + "epoch": 0.46356, + "grad_norm": 0.6462960026878596, + "learning_rate": 0.003, + "loss": 4.023, + "step": 46356 + }, + { + "epoch": 0.46357, + "grad_norm": 0.7002357383578134, + "learning_rate": 0.003, + "loss": 4.019, + "step": 46357 + }, + { + "epoch": 0.46358, + "grad_norm": 0.7887033196679345, + "learning_rate": 0.003, + "loss": 4.0489, + "step": 46358 + }, + { + "epoch": 0.46359, + "grad_norm": 0.938017390427733, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 46359 + }, + { + "epoch": 0.4636, + "grad_norm": 0.9745079761828702, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 46360 + }, + { + "epoch": 0.46361, + "grad_norm": 0.8957893678781024, + "learning_rate": 0.003, + "loss": 4.0578, + "step": 46361 + }, + { + "epoch": 0.46362, + "grad_norm": 0.9067736753217919, + "learning_rate": 0.003, + "loss": 4.0477, + "step": 46362 + }, + { + "epoch": 0.46363, + "grad_norm": 0.897948604578224, + "learning_rate": 0.003, + "loss": 4.057, + "step": 46363 + }, + { + "epoch": 0.46364, + "grad_norm": 0.9769256085259311, + "learning_rate": 0.003, + "loss": 4.0591, + "step": 46364 + }, + { + "epoch": 0.46365, + "grad_norm": 1.1032898286908701, + "learning_rate": 0.003, + "loss": 4.0546, + "step": 46365 + }, + { + "epoch": 0.46366, + "grad_norm": 0.8764102471929965, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 46366 + }, + { + "epoch": 0.46367, + "grad_norm": 0.8934674468377058, + "learning_rate": 0.003, + "loss": 4.0114, + "step": 46367 + }, + { + "epoch": 0.46368, + "grad_norm": 0.9371293080831669, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 46368 + }, + { + "epoch": 0.46369, + "grad_norm": 0.9829338773115486, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 46369 + }, + { + "epoch": 0.4637, + "grad_norm": 1.0426003749050312, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 46370 + }, + { + "epoch": 0.46371, + "grad_norm": 0.8391666819699682, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 46371 + }, + { + "epoch": 0.46372, + "grad_norm": 0.959976279219079, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 46372 + }, + { + "epoch": 0.46373, + "grad_norm": 0.9937890349916816, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 46373 + }, + { + "epoch": 0.46374, + "grad_norm": 0.9849799092445101, + "learning_rate": 0.003, + "loss": 4.0705, + "step": 46374 + }, + { + "epoch": 0.46375, + "grad_norm": 1.1724195602970668, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 46375 + }, + { + "epoch": 0.46376, + "grad_norm": 0.9160932461046858, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 46376 + }, + { + "epoch": 0.46377, + "grad_norm": 0.7526528745064458, + "learning_rate": 0.003, + "loss": 4.051, + "step": 46377 + }, + { + "epoch": 0.46378, + "grad_norm": 0.6768661263322632, + "learning_rate": 0.003, + "loss": 4.038, + "step": 46378 + }, + { + "epoch": 0.46379, + "grad_norm": 0.6207549541087898, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 46379 + }, + { + "epoch": 0.4638, + "grad_norm": 0.5349895006242884, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 46380 + }, + { + "epoch": 0.46381, + "grad_norm": 0.4692515973660449, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 46381 + }, + { + "epoch": 0.46382, + "grad_norm": 0.44594094521489275, + "learning_rate": 0.003, + "loss": 3.9988, + "step": 46382 + }, + { + "epoch": 0.46383, + "grad_norm": 0.42591596154936745, + "learning_rate": 0.003, + "loss": 3.9821, + "step": 46383 + }, + { + "epoch": 0.46384, + "grad_norm": 0.41350591759402855, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 46384 + }, + { + "epoch": 0.46385, + "grad_norm": 0.43733514608887347, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 46385 + }, + { + "epoch": 0.46386, + "grad_norm": 0.43135841480769255, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 46386 + }, + { + "epoch": 0.46387, + "grad_norm": 0.552624107274668, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 46387 + }, + { + "epoch": 0.46388, + "grad_norm": 0.7904853679242687, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 46388 + }, + { + "epoch": 0.46389, + "grad_norm": 1.1952166122626202, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 46389 + }, + { + "epoch": 0.4639, + "grad_norm": 0.8856077037383212, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 46390 + }, + { + "epoch": 0.46391, + "grad_norm": 0.7386913516211827, + "learning_rate": 0.003, + "loss": 4.0183, + "step": 46391 + }, + { + "epoch": 0.46392, + "grad_norm": 0.7799679249601884, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 46392 + }, + { + "epoch": 0.46393, + "grad_norm": 0.7363105336694539, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 46393 + }, + { + "epoch": 0.46394, + "grad_norm": 0.714042942059662, + "learning_rate": 0.003, + "loss": 3.9849, + "step": 46394 + }, + { + "epoch": 0.46395, + "grad_norm": 0.732838665226248, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 46395 + }, + { + "epoch": 0.46396, + "grad_norm": 0.767437505732331, + "learning_rate": 0.003, + "loss": 3.9966, + "step": 46396 + }, + { + "epoch": 0.46397, + "grad_norm": 0.9772476569530263, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 46397 + }, + { + "epoch": 0.46398, + "grad_norm": 1.2195468310902997, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 46398 + }, + { + "epoch": 0.46399, + "grad_norm": 0.7357797930166404, + "learning_rate": 0.003, + "loss": 3.9905, + "step": 46399 + }, + { + "epoch": 0.464, + "grad_norm": 0.7385896444416761, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 46400 + }, + { + "epoch": 0.46401, + "grad_norm": 0.8288129248135525, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 46401 + }, + { + "epoch": 0.46402, + "grad_norm": 0.9013708805822089, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 46402 + }, + { + "epoch": 0.46403, + "grad_norm": 1.0347595439732693, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 46403 + }, + { + "epoch": 0.46404, + "grad_norm": 0.8642511514538186, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 46404 + }, + { + "epoch": 0.46405, + "grad_norm": 0.8782836775353824, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 46405 + }, + { + "epoch": 0.46406, + "grad_norm": 0.9093673808073844, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 46406 + }, + { + "epoch": 0.46407, + "grad_norm": 0.8726422976387074, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 46407 + }, + { + "epoch": 0.46408, + "grad_norm": 0.9361546368763466, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 46408 + }, + { + "epoch": 0.46409, + "grad_norm": 0.8969298723667004, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 46409 + }, + { + "epoch": 0.4641, + "grad_norm": 0.8810599863153029, + "learning_rate": 0.003, + "loss": 3.9994, + "step": 46410 + }, + { + "epoch": 0.46411, + "grad_norm": 1.004795267309244, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 46411 + }, + { + "epoch": 0.46412, + "grad_norm": 1.281242531600743, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 46412 + }, + { + "epoch": 0.46413, + "grad_norm": 0.8760384545921219, + "learning_rate": 0.003, + "loss": 4.0154, + "step": 46413 + }, + { + "epoch": 0.46414, + "grad_norm": 0.66842642792383, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 46414 + }, + { + "epoch": 0.46415, + "grad_norm": 0.6442184257563132, + "learning_rate": 0.003, + "loss": 3.9943, + "step": 46415 + }, + { + "epoch": 0.46416, + "grad_norm": 0.7364137766659836, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 46416 + }, + { + "epoch": 0.46417, + "grad_norm": 0.8955279392677615, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 46417 + }, + { + "epoch": 0.46418, + "grad_norm": 1.0708139967811408, + "learning_rate": 0.003, + "loss": 4.0722, + "step": 46418 + }, + { + "epoch": 0.46419, + "grad_norm": 0.9303819197203195, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 46419 + }, + { + "epoch": 0.4642, + "grad_norm": 0.9701790714733705, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 46420 + }, + { + "epoch": 0.46421, + "grad_norm": 1.0345017142670965, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 46421 + }, + { + "epoch": 0.46422, + "grad_norm": 0.9436486604993661, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 46422 + }, + { + "epoch": 0.46423, + "grad_norm": 0.8181442678708092, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 46423 + }, + { + "epoch": 0.46424, + "grad_norm": 0.738639054175846, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 46424 + }, + { + "epoch": 0.46425, + "grad_norm": 0.7460899486579666, + "learning_rate": 0.003, + "loss": 3.9896, + "step": 46425 + }, + { + "epoch": 0.46426, + "grad_norm": 0.7843905802674426, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 46426 + }, + { + "epoch": 0.46427, + "grad_norm": 0.8198779074761153, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 46427 + }, + { + "epoch": 0.46428, + "grad_norm": 0.864858789823859, + "learning_rate": 0.003, + "loss": 4.017, + "step": 46428 + }, + { + "epoch": 0.46429, + "grad_norm": 1.0335866905213709, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 46429 + }, + { + "epoch": 0.4643, + "grad_norm": 1.1301302868268448, + "learning_rate": 0.003, + "loss": 4.0564, + "step": 46430 + }, + { + "epoch": 0.46431, + "grad_norm": 0.8092554559975845, + "learning_rate": 0.003, + "loss": 3.9978, + "step": 46431 + }, + { + "epoch": 0.46432, + "grad_norm": 0.8173778970704458, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 46432 + }, + { + "epoch": 0.46433, + "grad_norm": 0.8388069879426638, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 46433 + }, + { + "epoch": 0.46434, + "grad_norm": 0.8839946885633112, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 46434 + }, + { + "epoch": 0.46435, + "grad_norm": 0.8190927214813677, + "learning_rate": 0.003, + "loss": 4.009, + "step": 46435 + }, + { + "epoch": 0.46436, + "grad_norm": 0.8292754685972715, + "learning_rate": 0.003, + "loss": 4.03, + "step": 46436 + }, + { + "epoch": 0.46437, + "grad_norm": 0.8892425821969563, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 46437 + }, + { + "epoch": 0.46438, + "grad_norm": 0.7884513137201242, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 46438 + }, + { + "epoch": 0.46439, + "grad_norm": 0.7714004658563789, + "learning_rate": 0.003, + "loss": 4.0602, + "step": 46439 + }, + { + "epoch": 0.4644, + "grad_norm": 0.9166799750615587, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 46440 + }, + { + "epoch": 0.46441, + "grad_norm": 1.0712993887031417, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 46441 + }, + { + "epoch": 0.46442, + "grad_norm": 0.9521097325463063, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 46442 + }, + { + "epoch": 0.46443, + "grad_norm": 0.7856491410809329, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 46443 + }, + { + "epoch": 0.46444, + "grad_norm": 0.6329500270780433, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 46444 + }, + { + "epoch": 0.46445, + "grad_norm": 0.6496642014374315, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 46445 + }, + { + "epoch": 0.46446, + "grad_norm": 0.7031300816740554, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 46446 + }, + { + "epoch": 0.46447, + "grad_norm": 0.7164966284014741, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 46447 + }, + { + "epoch": 0.46448, + "grad_norm": 0.5730172860837055, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 46448 + }, + { + "epoch": 0.46449, + "grad_norm": 0.46142931189032, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 46449 + }, + { + "epoch": 0.4645, + "grad_norm": 0.4519213551776824, + "learning_rate": 0.003, + "loss": 4.028, + "step": 46450 + }, + { + "epoch": 0.46451, + "grad_norm": 0.5553348600865474, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 46451 + }, + { + "epoch": 0.46452, + "grad_norm": 0.6389672135363037, + "learning_rate": 0.003, + "loss": 4.011, + "step": 46452 + }, + { + "epoch": 0.46453, + "grad_norm": 0.7695279002869477, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 46453 + }, + { + "epoch": 0.46454, + "grad_norm": 0.8518309161439785, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 46454 + }, + { + "epoch": 0.46455, + "grad_norm": 0.7455566679012071, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 46455 + }, + { + "epoch": 0.46456, + "grad_norm": 0.7604653655271018, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 46456 + }, + { + "epoch": 0.46457, + "grad_norm": 0.9439201499483785, + "learning_rate": 0.003, + "loss": 4.068, + "step": 46457 + }, + { + "epoch": 0.46458, + "grad_norm": 1.2894025290661082, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 46458 + }, + { + "epoch": 0.46459, + "grad_norm": 0.7391427762138572, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 46459 + }, + { + "epoch": 0.4646, + "grad_norm": 0.827381505035817, + "learning_rate": 0.003, + "loss": 4.0251, + "step": 46460 + }, + { + "epoch": 0.46461, + "grad_norm": 0.9792216599922331, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 46461 + }, + { + "epoch": 0.46462, + "grad_norm": 1.1008545842967652, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 46462 + }, + { + "epoch": 0.46463, + "grad_norm": 0.9875598154268098, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 46463 + }, + { + "epoch": 0.46464, + "grad_norm": 0.9050574363307825, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 46464 + }, + { + "epoch": 0.46465, + "grad_norm": 0.7416124967063503, + "learning_rate": 0.003, + "loss": 3.9984, + "step": 46465 + }, + { + "epoch": 0.46466, + "grad_norm": 0.796212040062973, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 46466 + }, + { + "epoch": 0.46467, + "grad_norm": 0.9533205463672589, + "learning_rate": 0.003, + "loss": 4.0016, + "step": 46467 + }, + { + "epoch": 0.46468, + "grad_norm": 1.060416013350727, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 46468 + }, + { + "epoch": 0.46469, + "grad_norm": 0.9128416022482121, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 46469 + }, + { + "epoch": 0.4647, + "grad_norm": 0.8735198218194803, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 46470 + }, + { + "epoch": 0.46471, + "grad_norm": 0.9110773211248566, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 46471 + }, + { + "epoch": 0.46472, + "grad_norm": 0.9442938118006178, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 46472 + }, + { + "epoch": 0.46473, + "grad_norm": 0.8916904957863145, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 46473 + }, + { + "epoch": 0.46474, + "grad_norm": 0.8598223673337505, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 46474 + }, + { + "epoch": 0.46475, + "grad_norm": 0.9529340411730199, + "learning_rate": 0.003, + "loss": 4.0469, + "step": 46475 + }, + { + "epoch": 0.46476, + "grad_norm": 1.0691263672941673, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 46476 + }, + { + "epoch": 0.46477, + "grad_norm": 0.9189267829599178, + "learning_rate": 0.003, + "loss": 4.031, + "step": 46477 + }, + { + "epoch": 0.46478, + "grad_norm": 1.0090835978983852, + "learning_rate": 0.003, + "loss": 4.0529, + "step": 46478 + }, + { + "epoch": 0.46479, + "grad_norm": 1.1430035917345376, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 46479 + }, + { + "epoch": 0.4648, + "grad_norm": 0.7701445380583174, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 46480 + }, + { + "epoch": 0.46481, + "grad_norm": 0.7921172909441834, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 46481 + }, + { + "epoch": 0.46482, + "grad_norm": 0.8438614902512477, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 46482 + }, + { + "epoch": 0.46483, + "grad_norm": 0.9011969462515323, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 46483 + }, + { + "epoch": 0.46484, + "grad_norm": 1.0039516076811057, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 46484 + }, + { + "epoch": 0.46485, + "grad_norm": 1.0502717507531667, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 46485 + }, + { + "epoch": 0.46486, + "grad_norm": 0.9143133467549247, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 46486 + }, + { + "epoch": 0.46487, + "grad_norm": 0.9061888385244152, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 46487 + }, + { + "epoch": 0.46488, + "grad_norm": 0.9388780408550259, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 46488 + }, + { + "epoch": 0.46489, + "grad_norm": 0.9696300395071262, + "learning_rate": 0.003, + "loss": 4.0735, + "step": 46489 + }, + { + "epoch": 0.4649, + "grad_norm": 0.9000724444700415, + "learning_rate": 0.003, + "loss": 4.0644, + "step": 46490 + }, + { + "epoch": 0.46491, + "grad_norm": 0.845114352827696, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 46491 + }, + { + "epoch": 0.46492, + "grad_norm": 0.8838560409226139, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 46492 + }, + { + "epoch": 0.46493, + "grad_norm": 0.8785917308754017, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 46493 + }, + { + "epoch": 0.46494, + "grad_norm": 0.928279125685672, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 46494 + }, + { + "epoch": 0.46495, + "grad_norm": 0.8587041148081022, + "learning_rate": 0.003, + "loss": 4.0035, + "step": 46495 + }, + { + "epoch": 0.46496, + "grad_norm": 0.7453556535222955, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 46496 + }, + { + "epoch": 0.46497, + "grad_norm": 0.8039338800182365, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 46497 + }, + { + "epoch": 0.46498, + "grad_norm": 0.805661878016546, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 46498 + }, + { + "epoch": 0.46499, + "grad_norm": 0.8028204129173034, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 46499 + }, + { + "epoch": 0.465, + "grad_norm": 0.8793503486181188, + "learning_rate": 0.003, + "loss": 4.0073, + "step": 46500 + }, + { + "epoch": 0.46501, + "grad_norm": 0.7561817450471168, + "learning_rate": 0.003, + "loss": 3.9892, + "step": 46501 + }, + { + "epoch": 0.46502, + "grad_norm": 0.5908120129622508, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 46502 + }, + { + "epoch": 0.46503, + "grad_norm": 0.5724406188856267, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 46503 + }, + { + "epoch": 0.46504, + "grad_norm": 0.602580688040959, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 46504 + }, + { + "epoch": 0.46505, + "grad_norm": 0.7011338840488095, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 46505 + }, + { + "epoch": 0.46506, + "grad_norm": 0.7613778157422862, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 46506 + }, + { + "epoch": 0.46507, + "grad_norm": 0.727354346971855, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 46507 + }, + { + "epoch": 0.46508, + "grad_norm": 0.7171674724662631, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 46508 + }, + { + "epoch": 0.46509, + "grad_norm": 0.6803549914284548, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 46509 + }, + { + "epoch": 0.4651, + "grad_norm": 0.6871278763257261, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 46510 + }, + { + "epoch": 0.46511, + "grad_norm": 0.7413829534058237, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 46511 + }, + { + "epoch": 0.46512, + "grad_norm": 0.7589098275582723, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 46512 + }, + { + "epoch": 0.46513, + "grad_norm": 0.894722008758898, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 46513 + }, + { + "epoch": 0.46514, + "grad_norm": 0.9975295132969789, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 46514 + }, + { + "epoch": 0.46515, + "grad_norm": 1.067846549885119, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 46515 + }, + { + "epoch": 0.46516, + "grad_norm": 1.1048295732609752, + "learning_rate": 0.003, + "loss": 4.0363, + "step": 46516 + }, + { + "epoch": 0.46517, + "grad_norm": 0.9932113240738987, + "learning_rate": 0.003, + "loss": 4.0049, + "step": 46517 + }, + { + "epoch": 0.46518, + "grad_norm": 0.9137126567274287, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 46518 + }, + { + "epoch": 0.46519, + "grad_norm": 0.8917592766891562, + "learning_rate": 0.003, + "loss": 4.031, + "step": 46519 + }, + { + "epoch": 0.4652, + "grad_norm": 0.849377613839144, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 46520 + }, + { + "epoch": 0.46521, + "grad_norm": 0.8819160498835044, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 46521 + }, + { + "epoch": 0.46522, + "grad_norm": 0.8019054863727728, + "learning_rate": 0.003, + "loss": 4.0239, + "step": 46522 + }, + { + "epoch": 0.46523, + "grad_norm": 0.8113707193165234, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 46523 + }, + { + "epoch": 0.46524, + "grad_norm": 0.8170107411493411, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 46524 + }, + { + "epoch": 0.46525, + "grad_norm": 0.8803803231659841, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 46525 + }, + { + "epoch": 0.46526, + "grad_norm": 0.8320312631125553, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 46526 + }, + { + "epoch": 0.46527, + "grad_norm": 0.7837153157241861, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 46527 + }, + { + "epoch": 0.46528, + "grad_norm": 0.6502705285565219, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 46528 + }, + { + "epoch": 0.46529, + "grad_norm": 0.6286262614017378, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 46529 + }, + { + "epoch": 0.4653, + "grad_norm": 0.6173312613058988, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 46530 + }, + { + "epoch": 0.46531, + "grad_norm": 0.599213429472896, + "learning_rate": 0.003, + "loss": 3.9983, + "step": 46531 + }, + { + "epoch": 0.46532, + "grad_norm": 0.6477051044356764, + "learning_rate": 0.003, + "loss": 4.018, + "step": 46532 + }, + { + "epoch": 0.46533, + "grad_norm": 0.6818575317489531, + "learning_rate": 0.003, + "loss": 3.9995, + "step": 46533 + }, + { + "epoch": 0.46534, + "grad_norm": 0.7701509964032247, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 46534 + }, + { + "epoch": 0.46535, + "grad_norm": 0.897387194485018, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 46535 + }, + { + "epoch": 0.46536, + "grad_norm": 0.9063499597734597, + "learning_rate": 0.003, + "loss": 4.0703, + "step": 46536 + }, + { + "epoch": 0.46537, + "grad_norm": 0.891507684935295, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 46537 + }, + { + "epoch": 0.46538, + "grad_norm": 0.8900092502712611, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 46538 + }, + { + "epoch": 0.46539, + "grad_norm": 0.9591184355552095, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 46539 + }, + { + "epoch": 0.4654, + "grad_norm": 1.0590297705331109, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 46540 + }, + { + "epoch": 0.46541, + "grad_norm": 0.9582515798422848, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 46541 + }, + { + "epoch": 0.46542, + "grad_norm": 0.9834413235467563, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 46542 + }, + { + "epoch": 0.46543, + "grad_norm": 0.9848534157051329, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 46543 + }, + { + "epoch": 0.46544, + "grad_norm": 0.797765943200676, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 46544 + }, + { + "epoch": 0.46545, + "grad_norm": 0.6652460018106552, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 46545 + }, + { + "epoch": 0.46546, + "grad_norm": 0.6386852197414211, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 46546 + }, + { + "epoch": 0.46547, + "grad_norm": 0.6418127277071, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 46547 + }, + { + "epoch": 0.46548, + "grad_norm": 0.6157964661045882, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 46548 + }, + { + "epoch": 0.46549, + "grad_norm": 0.5861459339473559, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 46549 + }, + { + "epoch": 0.4655, + "grad_norm": 0.5869393639702718, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 46550 + }, + { + "epoch": 0.46551, + "grad_norm": 0.6785122755596337, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 46551 + }, + { + "epoch": 0.46552, + "grad_norm": 0.8229489353060028, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 46552 + }, + { + "epoch": 0.46553, + "grad_norm": 1.067139065564125, + "learning_rate": 0.003, + "loss": 4.0222, + "step": 46553 + }, + { + "epoch": 0.46554, + "grad_norm": 1.017719015360712, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 46554 + }, + { + "epoch": 0.46555, + "grad_norm": 0.9061655517090484, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 46555 + }, + { + "epoch": 0.46556, + "grad_norm": 0.9450286287665156, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 46556 + }, + { + "epoch": 0.46557, + "grad_norm": 0.8756165836247698, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 46557 + }, + { + "epoch": 0.46558, + "grad_norm": 0.9552983900764382, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 46558 + }, + { + "epoch": 0.46559, + "grad_norm": 0.9392656757092516, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 46559 + }, + { + "epoch": 0.4656, + "grad_norm": 0.8562504709201149, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 46560 + }, + { + "epoch": 0.46561, + "grad_norm": 0.8813399381776447, + "learning_rate": 0.003, + "loss": 3.9989, + "step": 46561 + }, + { + "epoch": 0.46562, + "grad_norm": 0.8650759857735965, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 46562 + }, + { + "epoch": 0.46563, + "grad_norm": 1.0547714174807599, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 46563 + }, + { + "epoch": 0.46564, + "grad_norm": 1.1057098278817399, + "learning_rate": 0.003, + "loss": 4.048, + "step": 46564 + }, + { + "epoch": 0.46565, + "grad_norm": 0.9899366637946992, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 46565 + }, + { + "epoch": 0.46566, + "grad_norm": 1.0874380109059174, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 46566 + }, + { + "epoch": 0.46567, + "grad_norm": 0.9302924260261546, + "learning_rate": 0.003, + "loss": 4.0146, + "step": 46567 + }, + { + "epoch": 0.46568, + "grad_norm": 0.8801292618502715, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 46568 + }, + { + "epoch": 0.46569, + "grad_norm": 0.7552634035502656, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 46569 + }, + { + "epoch": 0.4657, + "grad_norm": 0.6889288875805697, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 46570 + }, + { + "epoch": 0.46571, + "grad_norm": 0.6275160815822144, + "learning_rate": 0.003, + "loss": 4.063, + "step": 46571 + }, + { + "epoch": 0.46572, + "grad_norm": 0.6114429393288775, + "learning_rate": 0.003, + "loss": 4.0064, + "step": 46572 + }, + { + "epoch": 0.46573, + "grad_norm": 0.5904280394766357, + "learning_rate": 0.003, + "loss": 3.9841, + "step": 46573 + }, + { + "epoch": 0.46574, + "grad_norm": 0.6332850229048953, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 46574 + }, + { + "epoch": 0.46575, + "grad_norm": 0.8089449232586866, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 46575 + }, + { + "epoch": 0.46576, + "grad_norm": 1.0155448268969718, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 46576 + }, + { + "epoch": 0.46577, + "grad_norm": 1.2211735340384817, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 46577 + }, + { + "epoch": 0.46578, + "grad_norm": 0.6067301568993204, + "learning_rate": 0.003, + "loss": 4.028, + "step": 46578 + }, + { + "epoch": 0.46579, + "grad_norm": 0.6850282409540263, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 46579 + }, + { + "epoch": 0.4658, + "grad_norm": 0.7702128436173441, + "learning_rate": 0.003, + "loss": 4.0041, + "step": 46580 + }, + { + "epoch": 0.46581, + "grad_norm": 0.7814831420922242, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 46581 + }, + { + "epoch": 0.46582, + "grad_norm": 0.7666810635244142, + "learning_rate": 0.003, + "loss": 3.9996, + "step": 46582 + }, + { + "epoch": 0.46583, + "grad_norm": 0.7478912830655584, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 46583 + }, + { + "epoch": 0.46584, + "grad_norm": 0.7681951252151482, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 46584 + }, + { + "epoch": 0.46585, + "grad_norm": 0.8153490081636545, + "learning_rate": 0.003, + "loss": 4.007, + "step": 46585 + }, + { + "epoch": 0.46586, + "grad_norm": 0.8669468299345785, + "learning_rate": 0.003, + "loss": 4.022, + "step": 46586 + }, + { + "epoch": 0.46587, + "grad_norm": 0.9526820061729624, + "learning_rate": 0.003, + "loss": 4.005, + "step": 46587 + }, + { + "epoch": 0.46588, + "grad_norm": 0.9383629757765779, + "learning_rate": 0.003, + "loss": 4.006, + "step": 46588 + }, + { + "epoch": 0.46589, + "grad_norm": 0.928640825467375, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 46589 + }, + { + "epoch": 0.4659, + "grad_norm": 1.0337324972442914, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 46590 + }, + { + "epoch": 0.46591, + "grad_norm": 1.1112911544212523, + "learning_rate": 0.003, + "loss": 4.043, + "step": 46591 + }, + { + "epoch": 0.46592, + "grad_norm": 0.8483156461403132, + "learning_rate": 0.003, + "loss": 4.0013, + "step": 46592 + }, + { + "epoch": 0.46593, + "grad_norm": 0.8347345191762641, + "learning_rate": 0.003, + "loss": 4.038, + "step": 46593 + }, + { + "epoch": 0.46594, + "grad_norm": 0.8530191762550065, + "learning_rate": 0.003, + "loss": 4.0025, + "step": 46594 + }, + { + "epoch": 0.46595, + "grad_norm": 0.8332829112973863, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 46595 + }, + { + "epoch": 0.46596, + "grad_norm": 0.9983398224007533, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 46596 + }, + { + "epoch": 0.46597, + "grad_norm": 1.2682552073066393, + "learning_rate": 0.003, + "loss": 4.0455, + "step": 46597 + }, + { + "epoch": 0.46598, + "grad_norm": 0.9190740236840669, + "learning_rate": 0.003, + "loss": 4.0668, + "step": 46598 + }, + { + "epoch": 0.46599, + "grad_norm": 0.9949933156938496, + "learning_rate": 0.003, + "loss": 4.0137, + "step": 46599 + }, + { + "epoch": 0.466, + "grad_norm": 1.1161127986754935, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 46600 + }, + { + "epoch": 0.46601, + "grad_norm": 0.9379591813415887, + "learning_rate": 0.003, + "loss": 4.041, + "step": 46601 + }, + { + "epoch": 0.46602, + "grad_norm": 0.8301019606276606, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 46602 + }, + { + "epoch": 0.46603, + "grad_norm": 0.8622389356186498, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 46603 + }, + { + "epoch": 0.46604, + "grad_norm": 0.8851535030929799, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 46604 + }, + { + "epoch": 0.46605, + "grad_norm": 0.8910674638237772, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 46605 + }, + { + "epoch": 0.46606, + "grad_norm": 0.8391915942834008, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 46606 + }, + { + "epoch": 0.46607, + "grad_norm": 0.9617616798982158, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 46607 + }, + { + "epoch": 0.46608, + "grad_norm": 1.1786988819428155, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 46608 + }, + { + "epoch": 0.46609, + "grad_norm": 0.8951928388770072, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 46609 + }, + { + "epoch": 0.4661, + "grad_norm": 0.790101326212214, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 46610 + }, + { + "epoch": 0.46611, + "grad_norm": 0.8296579228436927, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 46611 + }, + { + "epoch": 0.46612, + "grad_norm": 1.0312834745004882, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 46612 + }, + { + "epoch": 0.46613, + "grad_norm": 0.9944493973153424, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 46613 + }, + { + "epoch": 0.46614, + "grad_norm": 0.9023282743382602, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 46614 + }, + { + "epoch": 0.46615, + "grad_norm": 0.8106472162910812, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 46615 + }, + { + "epoch": 0.46616, + "grad_norm": 0.7070494210613761, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 46616 + }, + { + "epoch": 0.46617, + "grad_norm": 0.686952993818861, + "learning_rate": 0.003, + "loss": 4.035, + "step": 46617 + }, + { + "epoch": 0.46618, + "grad_norm": 0.7468686134468588, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 46618 + }, + { + "epoch": 0.46619, + "grad_norm": 0.8986508339580351, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 46619 + }, + { + "epoch": 0.4662, + "grad_norm": 0.913652864254396, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 46620 + }, + { + "epoch": 0.46621, + "grad_norm": 0.8273666250126378, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 46621 + }, + { + "epoch": 0.46622, + "grad_norm": 0.7723724124904038, + "learning_rate": 0.003, + "loss": 3.99, + "step": 46622 + }, + { + "epoch": 0.46623, + "grad_norm": 0.7333079993001487, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 46623 + }, + { + "epoch": 0.46624, + "grad_norm": 0.6399189504288553, + "learning_rate": 0.003, + "loss": 4.032, + "step": 46624 + }, + { + "epoch": 0.46625, + "grad_norm": 0.595856148147675, + "learning_rate": 0.003, + "loss": 4.04, + "step": 46625 + }, + { + "epoch": 0.46626, + "grad_norm": 0.5601951591776982, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 46626 + }, + { + "epoch": 0.46627, + "grad_norm": 0.5242658372867304, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 46627 + }, + { + "epoch": 0.46628, + "grad_norm": 0.5998404577951109, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 46628 + }, + { + "epoch": 0.46629, + "grad_norm": 0.6240395668289896, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 46629 + }, + { + "epoch": 0.4663, + "grad_norm": 0.5279645064699376, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 46630 + }, + { + "epoch": 0.46631, + "grad_norm": 0.5136511802398203, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 46631 + }, + { + "epoch": 0.46632, + "grad_norm": 0.5541865779390402, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 46632 + }, + { + "epoch": 0.46633, + "grad_norm": 0.6180439074839699, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 46633 + }, + { + "epoch": 0.46634, + "grad_norm": 0.8055278931002922, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 46634 + }, + { + "epoch": 0.46635, + "grad_norm": 1.0452096677180356, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 46635 + }, + { + "epoch": 0.46636, + "grad_norm": 1.1956918060405668, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 46636 + }, + { + "epoch": 0.46637, + "grad_norm": 0.978349348651029, + "learning_rate": 0.003, + "loss": 4.015, + "step": 46637 + }, + { + "epoch": 0.46638, + "grad_norm": 0.9465026886408177, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 46638 + }, + { + "epoch": 0.46639, + "grad_norm": 0.8716306905328997, + "learning_rate": 0.003, + "loss": 3.9979, + "step": 46639 + }, + { + "epoch": 0.4664, + "grad_norm": 0.7524742835994197, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 46640 + }, + { + "epoch": 0.46641, + "grad_norm": 0.646377203307349, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 46641 + }, + { + "epoch": 0.46642, + "grad_norm": 0.6331192640326411, + "learning_rate": 0.003, + "loss": 4.0178, + "step": 46642 + }, + { + "epoch": 0.46643, + "grad_norm": 0.6815782359634949, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 46643 + }, + { + "epoch": 0.46644, + "grad_norm": 0.6757964674672595, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 46644 + }, + { + "epoch": 0.46645, + "grad_norm": 0.7277878440238605, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 46645 + }, + { + "epoch": 0.46646, + "grad_norm": 0.6352764638890589, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 46646 + }, + { + "epoch": 0.46647, + "grad_norm": 0.6276936136377683, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 46647 + }, + { + "epoch": 0.46648, + "grad_norm": 0.6008671753366808, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 46648 + }, + { + "epoch": 0.46649, + "grad_norm": 0.6614804196366113, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 46649 + }, + { + "epoch": 0.4665, + "grad_norm": 0.7816686494379854, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 46650 + }, + { + "epoch": 0.46651, + "grad_norm": 0.8597091819350292, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 46651 + }, + { + "epoch": 0.46652, + "grad_norm": 1.167987196517224, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 46652 + }, + { + "epoch": 0.46653, + "grad_norm": 1.2795638744153988, + "learning_rate": 0.003, + "loss": 4.0233, + "step": 46653 + }, + { + "epoch": 0.46654, + "grad_norm": 0.8275234427597759, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 46654 + }, + { + "epoch": 0.46655, + "grad_norm": 0.817259598260389, + "learning_rate": 0.003, + "loss": 4.037, + "step": 46655 + }, + { + "epoch": 0.46656, + "grad_norm": 1.0600408743513372, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 46656 + }, + { + "epoch": 0.46657, + "grad_norm": 1.2559462854893952, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 46657 + }, + { + "epoch": 0.46658, + "grad_norm": 1.0884815964015333, + "learning_rate": 0.003, + "loss": 4.044, + "step": 46658 + }, + { + "epoch": 0.46659, + "grad_norm": 0.8453111884611354, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 46659 + }, + { + "epoch": 0.4666, + "grad_norm": 0.785304877670272, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 46660 + }, + { + "epoch": 0.46661, + "grad_norm": 0.796913371019885, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 46661 + }, + { + "epoch": 0.46662, + "grad_norm": 0.980387109662072, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 46662 + }, + { + "epoch": 0.46663, + "grad_norm": 1.1709985997135337, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 46663 + }, + { + "epoch": 0.46664, + "grad_norm": 0.9793995227920035, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 46664 + }, + { + "epoch": 0.46665, + "grad_norm": 0.9546393644134361, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 46665 + }, + { + "epoch": 0.46666, + "grad_norm": 0.8924664104138644, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 46666 + }, + { + "epoch": 0.46667, + "grad_norm": 0.8589662273795552, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 46667 + }, + { + "epoch": 0.46668, + "grad_norm": 0.8682095637777282, + "learning_rate": 0.003, + "loss": 4.0604, + "step": 46668 + }, + { + "epoch": 0.46669, + "grad_norm": 0.8552919090568368, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 46669 + }, + { + "epoch": 0.4667, + "grad_norm": 0.8431909338948509, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 46670 + }, + { + "epoch": 0.46671, + "grad_norm": 0.8629604004384017, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 46671 + }, + { + "epoch": 0.46672, + "grad_norm": 0.8918859244192086, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 46672 + }, + { + "epoch": 0.46673, + "grad_norm": 0.8831729500174925, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 46673 + }, + { + "epoch": 0.46674, + "grad_norm": 0.88582678178138, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 46674 + }, + { + "epoch": 0.46675, + "grad_norm": 1.0075534279094072, + "learning_rate": 0.003, + "loss": 4.0407, + "step": 46675 + }, + { + "epoch": 0.46676, + "grad_norm": 1.0463020981122795, + "learning_rate": 0.003, + "loss": 4.0289, + "step": 46676 + }, + { + "epoch": 0.46677, + "grad_norm": 1.0544669669844353, + "learning_rate": 0.003, + "loss": 4.0123, + "step": 46677 + }, + { + "epoch": 0.46678, + "grad_norm": 1.1360050668531763, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 46678 + }, + { + "epoch": 0.46679, + "grad_norm": 0.8324994697490496, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 46679 + }, + { + "epoch": 0.4668, + "grad_norm": 0.7155896611939283, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 46680 + }, + { + "epoch": 0.46681, + "grad_norm": 0.6584923202307353, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 46681 + }, + { + "epoch": 0.46682, + "grad_norm": 0.6217197259625833, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 46682 + }, + { + "epoch": 0.46683, + "grad_norm": 0.5961247288052555, + "learning_rate": 0.003, + "loss": 3.9987, + "step": 46683 + }, + { + "epoch": 0.46684, + "grad_norm": 0.6280668356682624, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 46684 + }, + { + "epoch": 0.46685, + "grad_norm": 0.8086391623717891, + "learning_rate": 0.003, + "loss": 4.0264, + "step": 46685 + }, + { + "epoch": 0.46686, + "grad_norm": 1.0608956341646907, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 46686 + }, + { + "epoch": 0.46687, + "grad_norm": 1.0502104308180833, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 46687 + }, + { + "epoch": 0.46688, + "grad_norm": 0.9975205941837413, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 46688 + }, + { + "epoch": 0.46689, + "grad_norm": 1.0114823383933218, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 46689 + }, + { + "epoch": 0.4669, + "grad_norm": 0.7754566350301918, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 46690 + }, + { + "epoch": 0.46691, + "grad_norm": 0.6923274611417372, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 46691 + }, + { + "epoch": 0.46692, + "grad_norm": 0.7063484521911123, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 46692 + }, + { + "epoch": 0.46693, + "grad_norm": 0.7365772381909358, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 46693 + }, + { + "epoch": 0.46694, + "grad_norm": 0.7876175145192645, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 46694 + }, + { + "epoch": 0.46695, + "grad_norm": 0.8506732791510069, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 46695 + }, + { + "epoch": 0.46696, + "grad_norm": 1.022403617202645, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 46696 + }, + { + "epoch": 0.46697, + "grad_norm": 1.0754729655281325, + "learning_rate": 0.003, + "loss": 4.0256, + "step": 46697 + }, + { + "epoch": 0.46698, + "grad_norm": 0.8101265723897065, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 46698 + }, + { + "epoch": 0.46699, + "grad_norm": 0.7497975661237157, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 46699 + }, + { + "epoch": 0.467, + "grad_norm": 0.7065594683687613, + "learning_rate": 0.003, + "loss": 4.007, + "step": 46700 + }, + { + "epoch": 0.46701, + "grad_norm": 0.7092052691405287, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 46701 + }, + { + "epoch": 0.46702, + "grad_norm": 0.7692212197753748, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 46702 + }, + { + "epoch": 0.46703, + "grad_norm": 0.8515504578217022, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 46703 + }, + { + "epoch": 0.46704, + "grad_norm": 0.9995702085905509, + "learning_rate": 0.003, + "loss": 4.052, + "step": 46704 + }, + { + "epoch": 0.46705, + "grad_norm": 0.9708731985290355, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 46705 + }, + { + "epoch": 0.46706, + "grad_norm": 0.8607754366429085, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 46706 + }, + { + "epoch": 0.46707, + "grad_norm": 0.7084321019727093, + "learning_rate": 0.003, + "loss": 3.9803, + "step": 46707 + }, + { + "epoch": 0.46708, + "grad_norm": 0.6432763996637288, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 46708 + }, + { + "epoch": 0.46709, + "grad_norm": 0.6575243034663206, + "learning_rate": 0.003, + "loss": 3.9992, + "step": 46709 + }, + { + "epoch": 0.4671, + "grad_norm": 0.6460411113864065, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 46710 + }, + { + "epoch": 0.46711, + "grad_norm": 0.6792530158034382, + "learning_rate": 0.003, + "loss": 4.0143, + "step": 46711 + }, + { + "epoch": 0.46712, + "grad_norm": 0.6668041371134589, + "learning_rate": 0.003, + "loss": 4.0164, + "step": 46712 + }, + { + "epoch": 0.46713, + "grad_norm": 0.7107890895728852, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 46713 + }, + { + "epoch": 0.46714, + "grad_norm": 0.7816765955596603, + "learning_rate": 0.003, + "loss": 4.024, + "step": 46714 + }, + { + "epoch": 0.46715, + "grad_norm": 0.7785135431047248, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 46715 + }, + { + "epoch": 0.46716, + "grad_norm": 0.6682919505173217, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 46716 + }, + { + "epoch": 0.46717, + "grad_norm": 0.7627767919743226, + "learning_rate": 0.003, + "loss": 4.0652, + "step": 46717 + }, + { + "epoch": 0.46718, + "grad_norm": 0.859618632664127, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 46718 + }, + { + "epoch": 0.46719, + "grad_norm": 1.1015307913704941, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 46719 + }, + { + "epoch": 0.4672, + "grad_norm": 1.4089926042328655, + "learning_rate": 0.003, + "loss": 4.029, + "step": 46720 + }, + { + "epoch": 0.46721, + "grad_norm": 0.9833338898356583, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 46721 + }, + { + "epoch": 0.46722, + "grad_norm": 1.0451029991122198, + "learning_rate": 0.003, + "loss": 4.0506, + "step": 46722 + }, + { + "epoch": 0.46723, + "grad_norm": 1.0306303162717276, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 46723 + }, + { + "epoch": 0.46724, + "grad_norm": 1.1736977085101747, + "learning_rate": 0.003, + "loss": 4.0399, + "step": 46724 + }, + { + "epoch": 0.46725, + "grad_norm": 1.0748650951069751, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 46725 + }, + { + "epoch": 0.46726, + "grad_norm": 0.9176088369660058, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 46726 + }, + { + "epoch": 0.46727, + "grad_norm": 0.6923783843388778, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 46727 + }, + { + "epoch": 0.46728, + "grad_norm": 0.5743441381926488, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 46728 + }, + { + "epoch": 0.46729, + "grad_norm": 0.6027737169177171, + "learning_rate": 0.003, + "loss": 4.013, + "step": 46729 + }, + { + "epoch": 0.4673, + "grad_norm": 0.6352171369522414, + "learning_rate": 0.003, + "loss": 4.0359, + "step": 46730 + }, + { + "epoch": 0.46731, + "grad_norm": 0.6985875311563452, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 46731 + }, + { + "epoch": 0.46732, + "grad_norm": 0.6900683521251929, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 46732 + }, + { + "epoch": 0.46733, + "grad_norm": 0.616262943650443, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 46733 + }, + { + "epoch": 0.46734, + "grad_norm": 0.629697300250982, + "learning_rate": 0.003, + "loss": 3.9946, + "step": 46734 + }, + { + "epoch": 0.46735, + "grad_norm": 0.7656249115486041, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 46735 + }, + { + "epoch": 0.46736, + "grad_norm": 0.8862454506894805, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 46736 + }, + { + "epoch": 0.46737, + "grad_norm": 1.1485404424946741, + "learning_rate": 0.003, + "loss": 4.0541, + "step": 46737 + }, + { + "epoch": 0.46738, + "grad_norm": 0.8731619704034759, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 46738 + }, + { + "epoch": 0.46739, + "grad_norm": 0.7218974491481273, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 46739 + }, + { + "epoch": 0.4674, + "grad_norm": 0.7082653861656876, + "learning_rate": 0.003, + "loss": 4.0163, + "step": 46740 + }, + { + "epoch": 0.46741, + "grad_norm": 0.7834310501192564, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 46741 + }, + { + "epoch": 0.46742, + "grad_norm": 0.7139901596656789, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 46742 + }, + { + "epoch": 0.46743, + "grad_norm": 0.8122749184822348, + "learning_rate": 0.003, + "loss": 4.0464, + "step": 46743 + }, + { + "epoch": 0.46744, + "grad_norm": 0.9555507089162764, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 46744 + }, + { + "epoch": 0.46745, + "grad_norm": 0.9828158159181021, + "learning_rate": 0.003, + "loss": 4.043, + "step": 46745 + }, + { + "epoch": 0.46746, + "grad_norm": 1.0312829868702444, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 46746 + }, + { + "epoch": 0.46747, + "grad_norm": 0.9776547176245187, + "learning_rate": 0.003, + "loss": 4.0361, + "step": 46747 + }, + { + "epoch": 0.46748, + "grad_norm": 0.976642228129924, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 46748 + }, + { + "epoch": 0.46749, + "grad_norm": 1.032570636474651, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 46749 + }, + { + "epoch": 0.4675, + "grad_norm": 1.0410368069388254, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 46750 + }, + { + "epoch": 0.46751, + "grad_norm": 0.9149640156620393, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 46751 + }, + { + "epoch": 0.46752, + "grad_norm": 0.8316861103775116, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 46752 + }, + { + "epoch": 0.46753, + "grad_norm": 0.856613665932664, + "learning_rate": 0.003, + "loss": 4.0186, + "step": 46753 + }, + { + "epoch": 0.46754, + "grad_norm": 0.9759746701880588, + "learning_rate": 0.003, + "loss": 4.048, + "step": 46754 + }, + { + "epoch": 0.46755, + "grad_norm": 0.9544051375407943, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 46755 + }, + { + "epoch": 0.46756, + "grad_norm": 0.9270458068941494, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 46756 + }, + { + "epoch": 0.46757, + "grad_norm": 0.8112141060764779, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 46757 + }, + { + "epoch": 0.46758, + "grad_norm": 0.8879890097929034, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 46758 + }, + { + "epoch": 0.46759, + "grad_norm": 0.8536085765225739, + "learning_rate": 0.003, + "loss": 4.0616, + "step": 46759 + }, + { + "epoch": 0.4676, + "grad_norm": 0.9779495673194996, + "learning_rate": 0.003, + "loss": 4.029, + "step": 46760 + }, + { + "epoch": 0.46761, + "grad_norm": 0.9130358039099614, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 46761 + }, + { + "epoch": 0.46762, + "grad_norm": 0.8790207434647793, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 46762 + }, + { + "epoch": 0.46763, + "grad_norm": 0.9466267162301418, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 46763 + }, + { + "epoch": 0.46764, + "grad_norm": 1.059525705574738, + "learning_rate": 0.003, + "loss": 4.0572, + "step": 46764 + }, + { + "epoch": 0.46765, + "grad_norm": 1.0201070404173747, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 46765 + }, + { + "epoch": 0.46766, + "grad_norm": 0.9701654810349891, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 46766 + }, + { + "epoch": 0.46767, + "grad_norm": 0.9301626787500692, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 46767 + }, + { + "epoch": 0.46768, + "grad_norm": 0.8912404322187821, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 46768 + }, + { + "epoch": 0.46769, + "grad_norm": 0.8656658583813853, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 46769 + }, + { + "epoch": 0.4677, + "grad_norm": 0.7275656732059965, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 46770 + }, + { + "epoch": 0.46771, + "grad_norm": 0.6760702167204121, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 46771 + }, + { + "epoch": 0.46772, + "grad_norm": 0.7787152445739524, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 46772 + }, + { + "epoch": 0.46773, + "grad_norm": 0.856361137820173, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 46773 + }, + { + "epoch": 0.46774, + "grad_norm": 0.9572990846942915, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 46774 + }, + { + "epoch": 0.46775, + "grad_norm": 0.9571202522741908, + "learning_rate": 0.003, + "loss": 4.0085, + "step": 46775 + }, + { + "epoch": 0.46776, + "grad_norm": 0.8666733993729144, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 46776 + }, + { + "epoch": 0.46777, + "grad_norm": 0.7919843707738028, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 46777 + }, + { + "epoch": 0.46778, + "grad_norm": 0.6949148281237549, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 46778 + }, + { + "epoch": 0.46779, + "grad_norm": 0.6000937409048003, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 46779 + }, + { + "epoch": 0.4678, + "grad_norm": 0.5642090003868292, + "learning_rate": 0.003, + "loss": 4.012, + "step": 46780 + }, + { + "epoch": 0.46781, + "grad_norm": 0.5574592134192307, + "learning_rate": 0.003, + "loss": 3.9863, + "step": 46781 + }, + { + "epoch": 0.46782, + "grad_norm": 0.6466155083892717, + "learning_rate": 0.003, + "loss": 4.0113, + "step": 46782 + }, + { + "epoch": 0.46783, + "grad_norm": 0.7695334799454043, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 46783 + }, + { + "epoch": 0.46784, + "grad_norm": 0.9036231239724642, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 46784 + }, + { + "epoch": 0.46785, + "grad_norm": 0.8779542951206575, + "learning_rate": 0.003, + "loss": 4.0221, + "step": 46785 + }, + { + "epoch": 0.46786, + "grad_norm": 0.7841041546268539, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 46786 + }, + { + "epoch": 0.46787, + "grad_norm": 0.7439277591536074, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 46787 + }, + { + "epoch": 0.46788, + "grad_norm": 0.7682068757261931, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 46788 + }, + { + "epoch": 0.46789, + "grad_norm": 0.8405303172103233, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 46789 + }, + { + "epoch": 0.4679, + "grad_norm": 0.8191696338384024, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 46790 + }, + { + "epoch": 0.46791, + "grad_norm": 0.8860143717871926, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 46791 + }, + { + "epoch": 0.46792, + "grad_norm": 0.8757400715601894, + "learning_rate": 0.003, + "loss": 4.0029, + "step": 46792 + }, + { + "epoch": 0.46793, + "grad_norm": 0.7277258894130746, + "learning_rate": 0.003, + "loss": 4.0043, + "step": 46793 + }, + { + "epoch": 0.46794, + "grad_norm": 0.6455528653089018, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 46794 + }, + { + "epoch": 0.46795, + "grad_norm": 0.6165415209550155, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 46795 + }, + { + "epoch": 0.46796, + "grad_norm": 0.5716780623559898, + "learning_rate": 0.003, + "loss": 4.032, + "step": 46796 + }, + { + "epoch": 0.46797, + "grad_norm": 0.5651465317273365, + "learning_rate": 0.003, + "loss": 3.9922, + "step": 46797 + }, + { + "epoch": 0.46798, + "grad_norm": 0.697715512500837, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 46798 + }, + { + "epoch": 0.46799, + "grad_norm": 0.8757956904338171, + "learning_rate": 0.003, + "loss": 4.033, + "step": 46799 + }, + { + "epoch": 0.468, + "grad_norm": 1.0677961544342114, + "learning_rate": 0.003, + "loss": 4.0442, + "step": 46800 + }, + { + "epoch": 0.46801, + "grad_norm": 0.9331013623979418, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 46801 + }, + { + "epoch": 0.46802, + "grad_norm": 0.9216983164543542, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 46802 + }, + { + "epoch": 0.46803, + "grad_norm": 0.9104346496839903, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 46803 + }, + { + "epoch": 0.46804, + "grad_norm": 0.9330657875776734, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 46804 + }, + { + "epoch": 0.46805, + "grad_norm": 1.1093396502353876, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 46805 + }, + { + "epoch": 0.46806, + "grad_norm": 1.140470462613382, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 46806 + }, + { + "epoch": 0.46807, + "grad_norm": 0.9247638884993862, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 46807 + }, + { + "epoch": 0.46808, + "grad_norm": 0.8486398124191112, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 46808 + }, + { + "epoch": 0.46809, + "grad_norm": 0.7354017341397274, + "learning_rate": 0.003, + "loss": 4.042, + "step": 46809 + }, + { + "epoch": 0.4681, + "grad_norm": 0.7388513547538501, + "learning_rate": 0.003, + "loss": 4.0024, + "step": 46810 + }, + { + "epoch": 0.46811, + "grad_norm": 0.6680182048720665, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 46811 + }, + { + "epoch": 0.46812, + "grad_norm": 0.6287547162785825, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 46812 + }, + { + "epoch": 0.46813, + "grad_norm": 0.6781337464821099, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 46813 + }, + { + "epoch": 0.46814, + "grad_norm": 0.7957726982782537, + "learning_rate": 0.003, + "loss": 4.0024, + "step": 46814 + }, + { + "epoch": 0.46815, + "grad_norm": 0.7321376831666095, + "learning_rate": 0.003, + "loss": 4.0057, + "step": 46815 + }, + { + "epoch": 0.46816, + "grad_norm": 0.7947744954391992, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 46816 + }, + { + "epoch": 0.46817, + "grad_norm": 1.0419825621279248, + "learning_rate": 0.003, + "loss": 3.9979, + "step": 46817 + }, + { + "epoch": 0.46818, + "grad_norm": 1.0604256951653, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 46818 + }, + { + "epoch": 0.46819, + "grad_norm": 0.6762564864133439, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 46819 + }, + { + "epoch": 0.4682, + "grad_norm": 0.6135077958117651, + "learning_rate": 0.003, + "loss": 3.9979, + "step": 46820 + }, + { + "epoch": 0.46821, + "grad_norm": 0.7545990805231974, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 46821 + }, + { + "epoch": 0.46822, + "grad_norm": 0.8110170413992368, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 46822 + }, + { + "epoch": 0.46823, + "grad_norm": 0.8633463216306939, + "learning_rate": 0.003, + "loss": 4.03, + "step": 46823 + }, + { + "epoch": 0.46824, + "grad_norm": 0.8789206812431072, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 46824 + }, + { + "epoch": 0.46825, + "grad_norm": 0.8176634381185465, + "learning_rate": 0.003, + "loss": 4.0456, + "step": 46825 + }, + { + "epoch": 0.46826, + "grad_norm": 0.8570492090764184, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 46826 + }, + { + "epoch": 0.46827, + "grad_norm": 0.8075146734548255, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 46827 + }, + { + "epoch": 0.46828, + "grad_norm": 0.7814920653535985, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 46828 + }, + { + "epoch": 0.46829, + "grad_norm": 0.8142598981583368, + "learning_rate": 0.003, + "loss": 4.031, + "step": 46829 + }, + { + "epoch": 0.4683, + "grad_norm": 0.8840686365434566, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 46830 + }, + { + "epoch": 0.46831, + "grad_norm": 0.9740283243979423, + "learning_rate": 0.003, + "loss": 4.0593, + "step": 46831 + }, + { + "epoch": 0.46832, + "grad_norm": 0.987290996762544, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 46832 + }, + { + "epoch": 0.46833, + "grad_norm": 1.1742904075185014, + "learning_rate": 0.003, + "loss": 4.061, + "step": 46833 + }, + { + "epoch": 0.46834, + "grad_norm": 0.9114261357338375, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 46834 + }, + { + "epoch": 0.46835, + "grad_norm": 0.8211439259655008, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 46835 + }, + { + "epoch": 0.46836, + "grad_norm": 0.8668242205578494, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 46836 + }, + { + "epoch": 0.46837, + "grad_norm": 0.8135801576207806, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 46837 + }, + { + "epoch": 0.46838, + "grad_norm": 0.7238109683561909, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 46838 + }, + { + "epoch": 0.46839, + "grad_norm": 0.6233832504116006, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 46839 + }, + { + "epoch": 0.4684, + "grad_norm": 0.6569711872727526, + "learning_rate": 0.003, + "loss": 4.0252, + "step": 46840 + }, + { + "epoch": 0.46841, + "grad_norm": 0.7093870945338988, + "learning_rate": 0.003, + "loss": 4.0083, + "step": 46841 + }, + { + "epoch": 0.46842, + "grad_norm": 0.743411908620197, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 46842 + }, + { + "epoch": 0.46843, + "grad_norm": 0.6927217260491055, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 46843 + }, + { + "epoch": 0.46844, + "grad_norm": 0.6864764104640431, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 46844 + }, + { + "epoch": 0.46845, + "grad_norm": 0.7659779950225815, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 46845 + }, + { + "epoch": 0.46846, + "grad_norm": 0.8034653817519509, + "learning_rate": 0.003, + "loss": 4.0026, + "step": 46846 + }, + { + "epoch": 0.46847, + "grad_norm": 1.0853718287315683, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 46847 + }, + { + "epoch": 0.46848, + "grad_norm": 1.3615464108009734, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 46848 + }, + { + "epoch": 0.46849, + "grad_norm": 0.8577806947936873, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 46849 + }, + { + "epoch": 0.4685, + "grad_norm": 0.7121646192789605, + "learning_rate": 0.003, + "loss": 4.0522, + "step": 46850 + }, + { + "epoch": 0.46851, + "grad_norm": 0.6852394783735053, + "learning_rate": 0.003, + "loss": 4.0608, + "step": 46851 + }, + { + "epoch": 0.46852, + "grad_norm": 0.7600565324872235, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 46852 + }, + { + "epoch": 0.46853, + "grad_norm": 0.7220101360509337, + "learning_rate": 0.003, + "loss": 4.0314, + "step": 46853 + }, + { + "epoch": 0.46854, + "grad_norm": 0.701467741355677, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 46854 + }, + { + "epoch": 0.46855, + "grad_norm": 0.6905139615050646, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 46855 + }, + { + "epoch": 0.46856, + "grad_norm": 0.7425527158751547, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 46856 + }, + { + "epoch": 0.46857, + "grad_norm": 0.9772363216042058, + "learning_rate": 0.003, + "loss": 4.048, + "step": 46857 + }, + { + "epoch": 0.46858, + "grad_norm": 1.258267075027417, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 46858 + }, + { + "epoch": 0.46859, + "grad_norm": 0.842960650028682, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 46859 + }, + { + "epoch": 0.4686, + "grad_norm": 0.8394757014594622, + "learning_rate": 0.003, + "loss": 4.0131, + "step": 46860 + }, + { + "epoch": 0.46861, + "grad_norm": 0.8282704563220994, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 46861 + }, + { + "epoch": 0.46862, + "grad_norm": 0.7856410999155988, + "learning_rate": 0.003, + "loss": 4.0065, + "step": 46862 + }, + { + "epoch": 0.46863, + "grad_norm": 0.8392723033720553, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 46863 + }, + { + "epoch": 0.46864, + "grad_norm": 0.7739679111996055, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 46864 + }, + { + "epoch": 0.46865, + "grad_norm": 0.7765045009638698, + "learning_rate": 0.003, + "loss": 4.0188, + "step": 46865 + }, + { + "epoch": 0.46866, + "grad_norm": 0.9423798050095795, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 46866 + }, + { + "epoch": 0.46867, + "grad_norm": 1.0840918524261074, + "learning_rate": 0.003, + "loss": 4.056, + "step": 46867 + }, + { + "epoch": 0.46868, + "grad_norm": 1.0732598835100784, + "learning_rate": 0.003, + "loss": 4.0686, + "step": 46868 + }, + { + "epoch": 0.46869, + "grad_norm": 1.0240345652504321, + "learning_rate": 0.003, + "loss": 4.032, + "step": 46869 + }, + { + "epoch": 0.4687, + "grad_norm": 1.2297880126600014, + "learning_rate": 0.003, + "loss": 4.0631, + "step": 46870 + }, + { + "epoch": 0.46871, + "grad_norm": 1.0858940013028837, + "learning_rate": 0.003, + "loss": 4.0704, + "step": 46871 + }, + { + "epoch": 0.46872, + "grad_norm": 0.9289733043026002, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 46872 + }, + { + "epoch": 0.46873, + "grad_norm": 1.0716909654156834, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 46873 + }, + { + "epoch": 0.46874, + "grad_norm": 1.1656350436786198, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 46874 + }, + { + "epoch": 0.46875, + "grad_norm": 0.9158874620847411, + "learning_rate": 0.003, + "loss": 4.028, + "step": 46875 + }, + { + "epoch": 0.46876, + "grad_norm": 0.7586707205937265, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 46876 + }, + { + "epoch": 0.46877, + "grad_norm": 0.8074166062744039, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 46877 + }, + { + "epoch": 0.46878, + "grad_norm": 0.8954935446395084, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 46878 + }, + { + "epoch": 0.46879, + "grad_norm": 0.9859768073047656, + "learning_rate": 0.003, + "loss": 4.032, + "step": 46879 + }, + { + "epoch": 0.4688, + "grad_norm": 0.9225548609491377, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 46880 + }, + { + "epoch": 0.46881, + "grad_norm": 1.0047994316097004, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 46881 + }, + { + "epoch": 0.46882, + "grad_norm": 1.0755894072884291, + "learning_rate": 0.003, + "loss": 4.032, + "step": 46882 + }, + { + "epoch": 0.46883, + "grad_norm": 0.9197743997058515, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 46883 + }, + { + "epoch": 0.46884, + "grad_norm": 0.8430622329763482, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 46884 + }, + { + "epoch": 0.46885, + "grad_norm": 0.9013751995109435, + "learning_rate": 0.003, + "loss": 4.0759, + "step": 46885 + }, + { + "epoch": 0.46886, + "grad_norm": 0.9211167825830701, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 46886 + }, + { + "epoch": 0.46887, + "grad_norm": 0.7883760614397114, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 46887 + }, + { + "epoch": 0.46888, + "grad_norm": 0.7505538017752085, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 46888 + }, + { + "epoch": 0.46889, + "grad_norm": 0.6914982133261871, + "learning_rate": 0.003, + "loss": 4.0728, + "step": 46889 + }, + { + "epoch": 0.4689, + "grad_norm": 0.6128349750142476, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 46890 + }, + { + "epoch": 0.46891, + "grad_norm": 0.6567290953416383, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 46891 + }, + { + "epoch": 0.46892, + "grad_norm": 0.7114323282840684, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 46892 + }, + { + "epoch": 0.46893, + "grad_norm": 0.7811134898059756, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 46893 + }, + { + "epoch": 0.46894, + "grad_norm": 0.9322112900422825, + "learning_rate": 0.003, + "loss": 4.041, + "step": 46894 + }, + { + "epoch": 0.46895, + "grad_norm": 1.0623042432661378, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 46895 + }, + { + "epoch": 0.46896, + "grad_norm": 0.9316347415355711, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 46896 + }, + { + "epoch": 0.46897, + "grad_norm": 0.8417486540943063, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 46897 + }, + { + "epoch": 0.46898, + "grad_norm": 0.7012364187171632, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 46898 + }, + { + "epoch": 0.46899, + "grad_norm": 0.6448715770667961, + "learning_rate": 0.003, + "loss": 4.0198, + "step": 46899 + }, + { + "epoch": 0.469, + "grad_norm": 0.5738839637289923, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 46900 + }, + { + "epoch": 0.46901, + "grad_norm": 0.6054986989495219, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 46901 + }, + { + "epoch": 0.46902, + "grad_norm": 0.7304896247742196, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 46902 + }, + { + "epoch": 0.46903, + "grad_norm": 0.8720013224068797, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 46903 + }, + { + "epoch": 0.46904, + "grad_norm": 1.1218122476658847, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 46904 + }, + { + "epoch": 0.46905, + "grad_norm": 0.9610862871204865, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 46905 + }, + { + "epoch": 0.46906, + "grad_norm": 1.1006298819140787, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 46906 + }, + { + "epoch": 0.46907, + "grad_norm": 1.0188098557354002, + "learning_rate": 0.003, + "loss": 4.0496, + "step": 46907 + }, + { + "epoch": 0.46908, + "grad_norm": 0.8517959406828609, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 46908 + }, + { + "epoch": 0.46909, + "grad_norm": 0.6750911700231558, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 46909 + }, + { + "epoch": 0.4691, + "grad_norm": 0.6097945364929283, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 46910 + }, + { + "epoch": 0.46911, + "grad_norm": 0.6183803823622873, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 46911 + }, + { + "epoch": 0.46912, + "grad_norm": 0.5946348710327081, + "learning_rate": 0.003, + "loss": 4.0473, + "step": 46912 + }, + { + "epoch": 0.46913, + "grad_norm": 0.5810741863878217, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 46913 + }, + { + "epoch": 0.46914, + "grad_norm": 0.6477959164053166, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 46914 + }, + { + "epoch": 0.46915, + "grad_norm": 0.6419434780913298, + "learning_rate": 0.003, + "loss": 4.0075, + "step": 46915 + }, + { + "epoch": 0.46916, + "grad_norm": 0.7734183856970326, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 46916 + }, + { + "epoch": 0.46917, + "grad_norm": 0.8882679507989217, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 46917 + }, + { + "epoch": 0.46918, + "grad_norm": 0.9000850722739345, + "learning_rate": 0.003, + "loss": 4.021, + "step": 46918 + }, + { + "epoch": 0.46919, + "grad_norm": 0.7540650106130276, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 46919 + }, + { + "epoch": 0.4692, + "grad_norm": 0.6989589488374122, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 46920 + }, + { + "epoch": 0.46921, + "grad_norm": 0.7839500223660865, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 46921 + }, + { + "epoch": 0.46922, + "grad_norm": 0.9213998686248926, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 46922 + }, + { + "epoch": 0.46923, + "grad_norm": 1.0481199912781536, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 46923 + }, + { + "epoch": 0.46924, + "grad_norm": 0.8936285223331174, + "learning_rate": 0.003, + "loss": 4.0106, + "step": 46924 + }, + { + "epoch": 0.46925, + "grad_norm": 0.9906920049576498, + "learning_rate": 0.003, + "loss": 3.9941, + "step": 46925 + }, + { + "epoch": 0.46926, + "grad_norm": 1.1155592133890264, + "learning_rate": 0.003, + "loss": 4.0626, + "step": 46926 + }, + { + "epoch": 0.46927, + "grad_norm": 1.0670743318176774, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 46927 + }, + { + "epoch": 0.46928, + "grad_norm": 1.0344994218916583, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 46928 + }, + { + "epoch": 0.46929, + "grad_norm": 1.0344589466872944, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 46929 + }, + { + "epoch": 0.4693, + "grad_norm": 1.1510034229346202, + "learning_rate": 0.003, + "loss": 4.0636, + "step": 46930 + }, + { + "epoch": 0.46931, + "grad_norm": 0.9908875823010672, + "learning_rate": 0.003, + "loss": 4.009, + "step": 46931 + }, + { + "epoch": 0.46932, + "grad_norm": 1.04723906371038, + "learning_rate": 0.003, + "loss": 4.0746, + "step": 46932 + }, + { + "epoch": 0.46933, + "grad_norm": 1.1905506100429666, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 46933 + }, + { + "epoch": 0.46934, + "grad_norm": 0.9753942378231495, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 46934 + }, + { + "epoch": 0.46935, + "grad_norm": 1.065887897767436, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 46935 + }, + { + "epoch": 0.46936, + "grad_norm": 0.926694458140416, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 46936 + }, + { + "epoch": 0.46937, + "grad_norm": 0.9416700908897593, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 46937 + }, + { + "epoch": 0.46938, + "grad_norm": 0.8723361905025948, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 46938 + }, + { + "epoch": 0.46939, + "grad_norm": 0.8641386061354775, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 46939 + }, + { + "epoch": 0.4694, + "grad_norm": 0.8161978666931897, + "learning_rate": 0.003, + "loss": 4.0433, + "step": 46940 + }, + { + "epoch": 0.46941, + "grad_norm": 0.8002903016605932, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 46941 + }, + { + "epoch": 0.46942, + "grad_norm": 0.8917956994158688, + "learning_rate": 0.003, + "loss": 4.0655, + "step": 46942 + }, + { + "epoch": 0.46943, + "grad_norm": 0.8126612120452126, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 46943 + }, + { + "epoch": 0.46944, + "grad_norm": 0.6701474347102139, + "learning_rate": 0.003, + "loss": 4.0589, + "step": 46944 + }, + { + "epoch": 0.46945, + "grad_norm": 0.6327473818857593, + "learning_rate": 0.003, + "loss": 4.0048, + "step": 46945 + }, + { + "epoch": 0.46946, + "grad_norm": 0.5870972699680503, + "learning_rate": 0.003, + "loss": 4.0253, + "step": 46946 + }, + { + "epoch": 0.46947, + "grad_norm": 0.6240647841649751, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 46947 + }, + { + "epoch": 0.46948, + "grad_norm": 0.7969334172192216, + "learning_rate": 0.003, + "loss": 4.052, + "step": 46948 + }, + { + "epoch": 0.46949, + "grad_norm": 1.0516387805207443, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 46949 + }, + { + "epoch": 0.4695, + "grad_norm": 1.100536186090554, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 46950 + }, + { + "epoch": 0.46951, + "grad_norm": 0.7214781570084493, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 46951 + }, + { + "epoch": 0.46952, + "grad_norm": 0.5483545436126509, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 46952 + }, + { + "epoch": 0.46953, + "grad_norm": 0.6695010991262857, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 46953 + }, + { + "epoch": 0.46954, + "grad_norm": 0.8310012413139028, + "learning_rate": 0.003, + "loss": 4.0046, + "step": 46954 + }, + { + "epoch": 0.46955, + "grad_norm": 0.9970766023650792, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 46955 + }, + { + "epoch": 0.46956, + "grad_norm": 0.982752687705891, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 46956 + }, + { + "epoch": 0.46957, + "grad_norm": 0.9412437443726244, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 46957 + }, + { + "epoch": 0.46958, + "grad_norm": 0.9220691923878196, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 46958 + }, + { + "epoch": 0.46959, + "grad_norm": 0.8345555982242147, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 46959 + }, + { + "epoch": 0.4696, + "grad_norm": 0.8224076394544051, + "learning_rate": 0.003, + "loss": 4.0478, + "step": 46960 + }, + { + "epoch": 0.46961, + "grad_norm": 0.7920224195791277, + "learning_rate": 0.003, + "loss": 4.033, + "step": 46961 + }, + { + "epoch": 0.46962, + "grad_norm": 0.8240826480099213, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 46962 + }, + { + "epoch": 0.46963, + "grad_norm": 0.8683027212184229, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 46963 + }, + { + "epoch": 0.46964, + "grad_norm": 0.8106888643801232, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 46964 + }, + { + "epoch": 0.46965, + "grad_norm": 0.8456504959140547, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 46965 + }, + { + "epoch": 0.46966, + "grad_norm": 0.9423877503332766, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 46966 + }, + { + "epoch": 0.46967, + "grad_norm": 0.827251517314409, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 46967 + }, + { + "epoch": 0.46968, + "grad_norm": 0.8952421597588347, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 46968 + }, + { + "epoch": 0.46969, + "grad_norm": 0.9762682831779076, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 46969 + }, + { + "epoch": 0.4697, + "grad_norm": 0.9813344821881768, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 46970 + }, + { + "epoch": 0.46971, + "grad_norm": 0.9536921411058705, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 46971 + }, + { + "epoch": 0.46972, + "grad_norm": 0.9640138210616795, + "learning_rate": 0.003, + "loss": 4.0444, + "step": 46972 + }, + { + "epoch": 0.46973, + "grad_norm": 0.9903466547387828, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 46973 + }, + { + "epoch": 0.46974, + "grad_norm": 0.9577956352563333, + "learning_rate": 0.003, + "loss": 4.0639, + "step": 46974 + }, + { + "epoch": 0.46975, + "grad_norm": 0.8890407771020029, + "learning_rate": 0.003, + "loss": 3.9724, + "step": 46975 + }, + { + "epoch": 0.46976, + "grad_norm": 0.7992671440487891, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 46976 + }, + { + "epoch": 0.46977, + "grad_norm": 0.6100192508575653, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 46977 + }, + { + "epoch": 0.46978, + "grad_norm": 0.5027229975415751, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 46978 + }, + { + "epoch": 0.46979, + "grad_norm": 0.5182205309661058, + "learning_rate": 0.003, + "loss": 3.9928, + "step": 46979 + }, + { + "epoch": 0.4698, + "grad_norm": 0.5778927470492485, + "learning_rate": 0.003, + "loss": 4.0051, + "step": 46980 + }, + { + "epoch": 0.46981, + "grad_norm": 0.6439948783882807, + "learning_rate": 0.003, + "loss": 4.013, + "step": 46981 + }, + { + "epoch": 0.46982, + "grad_norm": 0.7214365039974263, + "learning_rate": 0.003, + "loss": 4.0136, + "step": 46982 + }, + { + "epoch": 0.46983, + "grad_norm": 0.8952482600804129, + "learning_rate": 0.003, + "loss": 4.021, + "step": 46983 + }, + { + "epoch": 0.46984, + "grad_norm": 0.909512236560142, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 46984 + }, + { + "epoch": 0.46985, + "grad_norm": 0.7996022596541522, + "learning_rate": 0.003, + "loss": 4.042, + "step": 46985 + }, + { + "epoch": 0.46986, + "grad_norm": 0.8347346729454073, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 46986 + }, + { + "epoch": 0.46987, + "grad_norm": 0.9279254360766007, + "learning_rate": 0.003, + "loss": 4.0493, + "step": 46987 + }, + { + "epoch": 0.46988, + "grad_norm": 0.983989352620746, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 46988 + }, + { + "epoch": 0.46989, + "grad_norm": 0.9193270020458187, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 46989 + }, + { + "epoch": 0.4699, + "grad_norm": 0.8940170376068153, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 46990 + }, + { + "epoch": 0.46991, + "grad_norm": 1.0461307589649407, + "learning_rate": 0.003, + "loss": 4.0272, + "step": 46991 + }, + { + "epoch": 0.46992, + "grad_norm": 1.0144871639806048, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 46992 + }, + { + "epoch": 0.46993, + "grad_norm": 1.0390582757557787, + "learning_rate": 0.003, + "loss": 4.0745, + "step": 46993 + }, + { + "epoch": 0.46994, + "grad_norm": 1.2059405018239067, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 46994 + }, + { + "epoch": 0.46995, + "grad_norm": 0.975684572950022, + "learning_rate": 0.003, + "loss": 4.0227, + "step": 46995 + }, + { + "epoch": 0.46996, + "grad_norm": 0.9330520051061432, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 46996 + }, + { + "epoch": 0.46997, + "grad_norm": 0.9275655557618598, + "learning_rate": 0.003, + "loss": 4.029, + "step": 46997 + }, + { + "epoch": 0.46998, + "grad_norm": 0.9045368158631404, + "learning_rate": 0.003, + "loss": 4.023, + "step": 46998 + }, + { + "epoch": 0.46999, + "grad_norm": 0.8386223989436933, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 46999 + }, + { + "epoch": 0.47, + "grad_norm": 0.8019265469463357, + "learning_rate": 0.003, + "loss": 4.0345, + "step": 47000 + }, + { + "epoch": 0.47001, + "grad_norm": 0.854127032671623, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 47001 + }, + { + "epoch": 0.47002, + "grad_norm": 0.7594612991859474, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 47002 + }, + { + "epoch": 0.47003, + "grad_norm": 0.8493645519142533, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 47003 + }, + { + "epoch": 0.47004, + "grad_norm": 0.9000276548188506, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 47004 + }, + { + "epoch": 0.47005, + "grad_norm": 0.8620101791538632, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 47005 + }, + { + "epoch": 0.47006, + "grad_norm": 0.7612428431285514, + "learning_rate": 0.003, + "loss": 3.999, + "step": 47006 + }, + { + "epoch": 0.47007, + "grad_norm": 0.7415121532697402, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 47007 + }, + { + "epoch": 0.47008, + "grad_norm": 0.7905740390037325, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 47008 + }, + { + "epoch": 0.47009, + "grad_norm": 0.9343503375050004, + "learning_rate": 0.003, + "loss": 4.003, + "step": 47009 + }, + { + "epoch": 0.4701, + "grad_norm": 1.0402394658572893, + "learning_rate": 0.003, + "loss": 4.0505, + "step": 47010 + }, + { + "epoch": 0.47011, + "grad_norm": 0.9162770203314803, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 47011 + }, + { + "epoch": 0.47012, + "grad_norm": 0.8392982643695592, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 47012 + }, + { + "epoch": 0.47013, + "grad_norm": 0.8960706606188251, + "learning_rate": 0.003, + "loss": 4.0447, + "step": 47013 + }, + { + "epoch": 0.47014, + "grad_norm": 0.8201979105805126, + "learning_rate": 0.003, + "loss": 4.0664, + "step": 47014 + }, + { + "epoch": 0.47015, + "grad_norm": 0.7803553837729895, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 47015 + }, + { + "epoch": 0.47016, + "grad_norm": 0.7504915156549756, + "learning_rate": 0.003, + "loss": 4.0067, + "step": 47016 + }, + { + "epoch": 0.47017, + "grad_norm": 0.8004824204426638, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 47017 + }, + { + "epoch": 0.47018, + "grad_norm": 0.7292945789359311, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 47018 + }, + { + "epoch": 0.47019, + "grad_norm": 0.6700749711276659, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 47019 + }, + { + "epoch": 0.4702, + "grad_norm": 0.6310471504647296, + "learning_rate": 0.003, + "loss": 4.026, + "step": 47020 + }, + { + "epoch": 0.47021, + "grad_norm": 0.572877972939703, + "learning_rate": 0.003, + "loss": 4.0135, + "step": 47021 + }, + { + "epoch": 0.47022, + "grad_norm": 0.5939993974883155, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 47022 + }, + { + "epoch": 0.47023, + "grad_norm": 0.6346584258267735, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 47023 + }, + { + "epoch": 0.47024, + "grad_norm": 0.6809879834694494, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 47024 + }, + { + "epoch": 0.47025, + "grad_norm": 0.7247279435729167, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 47025 + }, + { + "epoch": 0.47026, + "grad_norm": 0.7767510599167167, + "learning_rate": 0.003, + "loss": 4.0609, + "step": 47026 + }, + { + "epoch": 0.47027, + "grad_norm": 0.7567976797462472, + "learning_rate": 0.003, + "loss": 4.0558, + "step": 47027 + }, + { + "epoch": 0.47028, + "grad_norm": 0.7821973403606338, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 47028 + }, + { + "epoch": 0.47029, + "grad_norm": 0.7797646743472355, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 47029 + }, + { + "epoch": 0.4703, + "grad_norm": 0.8378626089729658, + "learning_rate": 0.003, + "loss": 4.007, + "step": 47030 + }, + { + "epoch": 0.47031, + "grad_norm": 0.9289366048056292, + "learning_rate": 0.003, + "loss": 4.0394, + "step": 47031 + }, + { + "epoch": 0.47032, + "grad_norm": 1.1127908128821933, + "learning_rate": 0.003, + "loss": 4.0429, + "step": 47032 + }, + { + "epoch": 0.47033, + "grad_norm": 1.0007359072648048, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 47033 + }, + { + "epoch": 0.47034, + "grad_norm": 0.9803406529822422, + "learning_rate": 0.003, + "loss": 4.032, + "step": 47034 + }, + { + "epoch": 0.47035, + "grad_norm": 1.0123190156337543, + "learning_rate": 0.003, + "loss": 4.0561, + "step": 47035 + }, + { + "epoch": 0.47036, + "grad_norm": 1.0788318937140158, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 47036 + }, + { + "epoch": 0.47037, + "grad_norm": 0.8667610335276112, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 47037 + }, + { + "epoch": 0.47038, + "grad_norm": 0.7434721986614886, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 47038 + }, + { + "epoch": 0.47039, + "grad_norm": 0.720787986777502, + "learning_rate": 0.003, + "loss": 4.0556, + "step": 47039 + }, + { + "epoch": 0.4704, + "grad_norm": 0.8284299605860984, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 47040 + }, + { + "epoch": 0.47041, + "grad_norm": 0.8928406048698062, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 47041 + }, + { + "epoch": 0.47042, + "grad_norm": 1.0230479909072954, + "learning_rate": 0.003, + "loss": 4.0126, + "step": 47042 + }, + { + "epoch": 0.47043, + "grad_norm": 1.0554186939160473, + "learning_rate": 0.003, + "loss": 4.0638, + "step": 47043 + }, + { + "epoch": 0.47044, + "grad_norm": 0.8946815228459303, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 47044 + }, + { + "epoch": 0.47045, + "grad_norm": 0.8825394051915296, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 47045 + }, + { + "epoch": 0.47046, + "grad_norm": 0.9031120612708682, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 47046 + }, + { + "epoch": 0.47047, + "grad_norm": 0.8586622287408184, + "learning_rate": 0.003, + "loss": 4.068, + "step": 47047 + }, + { + "epoch": 0.47048, + "grad_norm": 0.808080423452866, + "learning_rate": 0.003, + "loss": 4.0044, + "step": 47048 + }, + { + "epoch": 0.47049, + "grad_norm": 0.7475593303064468, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 47049 + }, + { + "epoch": 0.4705, + "grad_norm": 0.7826837604043635, + "learning_rate": 0.003, + "loss": 3.9943, + "step": 47050 + }, + { + "epoch": 0.47051, + "grad_norm": 0.7650566280771286, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 47051 + }, + { + "epoch": 0.47052, + "grad_norm": 0.6897091868769101, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 47052 + }, + { + "epoch": 0.47053, + "grad_norm": 0.5903902854233126, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 47053 + }, + { + "epoch": 0.47054, + "grad_norm": 0.6274089339798614, + "learning_rate": 0.003, + "loss": 4.0127, + "step": 47054 + }, + { + "epoch": 0.47055, + "grad_norm": 0.6455122356569065, + "learning_rate": 0.003, + "loss": 4.0004, + "step": 47055 + }, + { + "epoch": 0.47056, + "grad_norm": 0.6672756755546003, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 47056 + }, + { + "epoch": 0.47057, + "grad_norm": 0.6286150818833846, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 47057 + }, + { + "epoch": 0.47058, + "grad_norm": 0.6764103590960479, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 47058 + }, + { + "epoch": 0.47059, + "grad_norm": 0.7046808459621081, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 47059 + }, + { + "epoch": 0.4706, + "grad_norm": 0.7143986740859511, + "learning_rate": 0.003, + "loss": 4.0117, + "step": 47060 + }, + { + "epoch": 0.47061, + "grad_norm": 0.8664575034766687, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 47061 + }, + { + "epoch": 0.47062, + "grad_norm": 1.235407910675618, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 47062 + }, + { + "epoch": 0.47063, + "grad_norm": 0.9339171406488936, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 47063 + }, + { + "epoch": 0.47064, + "grad_norm": 0.7854510863097468, + "learning_rate": 0.003, + "loss": 3.9872, + "step": 47064 + }, + { + "epoch": 0.47065, + "grad_norm": 0.730963070174756, + "learning_rate": 0.003, + "loss": 4.0196, + "step": 47065 + }, + { + "epoch": 0.47066, + "grad_norm": 0.6969910995753915, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 47066 + }, + { + "epoch": 0.47067, + "grad_norm": 0.797788729745334, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 47067 + }, + { + "epoch": 0.47068, + "grad_norm": 0.7890544820969141, + "learning_rate": 0.003, + "loss": 4.0087, + "step": 47068 + }, + { + "epoch": 0.47069, + "grad_norm": 0.7808393723340146, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 47069 + }, + { + "epoch": 0.4707, + "grad_norm": 0.79981675957184, + "learning_rate": 0.003, + "loss": 3.9888, + "step": 47070 + }, + { + "epoch": 0.47071, + "grad_norm": 0.8011184773498278, + "learning_rate": 0.003, + "loss": 4.005, + "step": 47071 + }, + { + "epoch": 0.47072, + "grad_norm": 0.798213291176872, + "learning_rate": 0.003, + "loss": 4.004, + "step": 47072 + }, + { + "epoch": 0.47073, + "grad_norm": 0.7098421669272819, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 47073 + }, + { + "epoch": 0.47074, + "grad_norm": 0.8375616469591036, + "learning_rate": 0.003, + "loss": 4.0005, + "step": 47074 + }, + { + "epoch": 0.47075, + "grad_norm": 1.011977768434237, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 47075 + }, + { + "epoch": 0.47076, + "grad_norm": 1.3340970470381681, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 47076 + }, + { + "epoch": 0.47077, + "grad_norm": 0.7980107782359794, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 47077 + }, + { + "epoch": 0.47078, + "grad_norm": 0.7896756206103768, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 47078 + }, + { + "epoch": 0.47079, + "grad_norm": 0.8031374904774731, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 47079 + }, + { + "epoch": 0.4708, + "grad_norm": 1.0254678818535878, + "learning_rate": 0.003, + "loss": 4.044, + "step": 47080 + }, + { + "epoch": 0.47081, + "grad_norm": 1.0736904387311164, + "learning_rate": 0.003, + "loss": 4.028, + "step": 47081 + }, + { + "epoch": 0.47082, + "grad_norm": 0.8251393679233006, + "learning_rate": 0.003, + "loss": 4.0667, + "step": 47082 + }, + { + "epoch": 0.47083, + "grad_norm": 1.0092488588507178, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 47083 + }, + { + "epoch": 0.47084, + "grad_norm": 1.0329075288671004, + "learning_rate": 0.003, + "loss": 4.054, + "step": 47084 + }, + { + "epoch": 0.47085, + "grad_norm": 1.1150131781793258, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 47085 + }, + { + "epoch": 0.47086, + "grad_norm": 0.9893580865592743, + "learning_rate": 0.003, + "loss": 4.0612, + "step": 47086 + }, + { + "epoch": 0.47087, + "grad_norm": 0.951900642279687, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 47087 + }, + { + "epoch": 0.47088, + "grad_norm": 0.9009459632958488, + "learning_rate": 0.003, + "loss": 4.0324, + "step": 47088 + }, + { + "epoch": 0.47089, + "grad_norm": 1.0145571665113542, + "learning_rate": 0.003, + "loss": 4.058, + "step": 47089 + }, + { + "epoch": 0.4709, + "grad_norm": 1.1531915669315553, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 47090 + }, + { + "epoch": 0.47091, + "grad_norm": 0.9264429827091282, + "learning_rate": 0.003, + "loss": 4.027, + "step": 47091 + }, + { + "epoch": 0.47092, + "grad_norm": 0.963731790255001, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 47092 + }, + { + "epoch": 0.47093, + "grad_norm": 1.0203331806136609, + "learning_rate": 0.003, + "loss": 4.0343, + "step": 47093 + }, + { + "epoch": 0.47094, + "grad_norm": 0.8029770356206309, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 47094 + }, + { + "epoch": 0.47095, + "grad_norm": 0.8029181926044141, + "learning_rate": 0.003, + "loss": 4.057, + "step": 47095 + }, + { + "epoch": 0.47096, + "grad_norm": 0.8752143009035247, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 47096 + }, + { + "epoch": 0.47097, + "grad_norm": 0.9758085753880475, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 47097 + }, + { + "epoch": 0.47098, + "grad_norm": 1.081555243802072, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 47098 + }, + { + "epoch": 0.47099, + "grad_norm": 1.0246496233830231, + "learning_rate": 0.003, + "loss": 4.0719, + "step": 47099 + }, + { + "epoch": 0.471, + "grad_norm": 0.9197438327638088, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 47100 + }, + { + "epoch": 0.47101, + "grad_norm": 0.9285802631771225, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 47101 + }, + { + "epoch": 0.47102, + "grad_norm": 1.0418022520439536, + "learning_rate": 0.003, + "loss": 4.0585, + "step": 47102 + }, + { + "epoch": 0.47103, + "grad_norm": 1.030702229878385, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 47103 + }, + { + "epoch": 0.47104, + "grad_norm": 1.0139864618241996, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 47104 + }, + { + "epoch": 0.47105, + "grad_norm": 0.9385035882604225, + "learning_rate": 0.003, + "loss": 4.0382, + "step": 47105 + }, + { + "epoch": 0.47106, + "grad_norm": 0.8012963128870152, + "learning_rate": 0.003, + "loss": 4.0687, + "step": 47106 + }, + { + "epoch": 0.47107, + "grad_norm": 0.717669856014704, + "learning_rate": 0.003, + "loss": 4.009, + "step": 47107 + }, + { + "epoch": 0.47108, + "grad_norm": 0.6488783988455763, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 47108 + }, + { + "epoch": 0.47109, + "grad_norm": 0.5695922810781375, + "learning_rate": 0.003, + "loss": 4.0423, + "step": 47109 + }, + { + "epoch": 0.4711, + "grad_norm": 0.5427246806573612, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 47110 + }, + { + "epoch": 0.47111, + "grad_norm": 0.5673547524696914, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 47111 + }, + { + "epoch": 0.47112, + "grad_norm": 0.599224438090222, + "learning_rate": 0.003, + "loss": 4.0149, + "step": 47112 + }, + { + "epoch": 0.47113, + "grad_norm": 0.5596427144113727, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 47113 + }, + { + "epoch": 0.47114, + "grad_norm": 0.4823912142445604, + "learning_rate": 0.003, + "loss": 4.0303, + "step": 47114 + }, + { + "epoch": 0.47115, + "grad_norm": 0.4421949300001391, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 47115 + }, + { + "epoch": 0.47116, + "grad_norm": 0.4955098532956111, + "learning_rate": 0.003, + "loss": 4.0079, + "step": 47116 + }, + { + "epoch": 0.47117, + "grad_norm": 0.6888309141568091, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 47117 + }, + { + "epoch": 0.47118, + "grad_norm": 1.0504128695455415, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 47118 + }, + { + "epoch": 0.47119, + "grad_norm": 1.2807831300243846, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 47119 + }, + { + "epoch": 0.4712, + "grad_norm": 0.5878494523314112, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 47120 + }, + { + "epoch": 0.47121, + "grad_norm": 0.8110160317529862, + "learning_rate": 0.003, + "loss": 4.014, + "step": 47121 + }, + { + "epoch": 0.47122, + "grad_norm": 0.9006239947457508, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 47122 + }, + { + "epoch": 0.47123, + "grad_norm": 0.8199928301994459, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 47123 + }, + { + "epoch": 0.47124, + "grad_norm": 0.6551993298729447, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 47124 + }, + { + "epoch": 0.47125, + "grad_norm": 0.5998509470318841, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 47125 + }, + { + "epoch": 0.47126, + "grad_norm": 0.6640633210930986, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 47126 + }, + { + "epoch": 0.47127, + "grad_norm": 0.7473218948428525, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 47127 + }, + { + "epoch": 0.47128, + "grad_norm": 0.737241455230935, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 47128 + }, + { + "epoch": 0.47129, + "grad_norm": 0.7247386305012795, + "learning_rate": 0.003, + "loss": 4.0024, + "step": 47129 + }, + { + "epoch": 0.4713, + "grad_norm": 0.79786467775268, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 47130 + }, + { + "epoch": 0.47131, + "grad_norm": 0.8388174685433603, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 47131 + }, + { + "epoch": 0.47132, + "grad_norm": 1.1296248683008883, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 47132 + }, + { + "epoch": 0.47133, + "grad_norm": 1.1888500955638237, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 47133 + }, + { + "epoch": 0.47134, + "grad_norm": 0.7518621458995587, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 47134 + }, + { + "epoch": 0.47135, + "grad_norm": 0.5632317087307448, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 47135 + }, + { + "epoch": 0.47136, + "grad_norm": 0.5551501910696656, + "learning_rate": 0.003, + "loss": 4.0108, + "step": 47136 + }, + { + "epoch": 0.47137, + "grad_norm": 0.5423854911166522, + "learning_rate": 0.003, + "loss": 4.0053, + "step": 47137 + }, + { + "epoch": 0.47138, + "grad_norm": 0.48268146826559016, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 47138 + }, + { + "epoch": 0.47139, + "grad_norm": 0.48303796984686714, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 47139 + }, + { + "epoch": 0.4714, + "grad_norm": 0.5021343740732643, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 47140 + }, + { + "epoch": 0.47141, + "grad_norm": 0.49394090397753204, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 47141 + }, + { + "epoch": 0.47142, + "grad_norm": 0.5436080166890396, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 47142 + }, + { + "epoch": 0.47143, + "grad_norm": 0.6490863142609257, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 47143 + }, + { + "epoch": 0.47144, + "grad_norm": 0.736637439748446, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 47144 + }, + { + "epoch": 0.47145, + "grad_norm": 0.7917890713796152, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 47145 + }, + { + "epoch": 0.47146, + "grad_norm": 0.8768919988674013, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 47146 + }, + { + "epoch": 0.47147, + "grad_norm": 0.9856547094192328, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 47147 + }, + { + "epoch": 0.47148, + "grad_norm": 1.062039834555521, + "learning_rate": 0.003, + "loss": 3.9979, + "step": 47148 + }, + { + "epoch": 0.47149, + "grad_norm": 1.183815488434749, + "learning_rate": 0.003, + "loss": 4.018, + "step": 47149 + }, + { + "epoch": 0.4715, + "grad_norm": 1.0503970236079683, + "learning_rate": 0.003, + "loss": 4.0562, + "step": 47150 + }, + { + "epoch": 0.47151, + "grad_norm": 0.9798646562001458, + "learning_rate": 0.003, + "loss": 4.046, + "step": 47151 + }, + { + "epoch": 0.47152, + "grad_norm": 0.9726720843144998, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 47152 + }, + { + "epoch": 0.47153, + "grad_norm": 0.8815308453734122, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 47153 + }, + { + "epoch": 0.47154, + "grad_norm": 0.70578904311454, + "learning_rate": 0.003, + "loss": 4.0112, + "step": 47154 + }, + { + "epoch": 0.47155, + "grad_norm": 0.6668640859038354, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 47155 + }, + { + "epoch": 0.47156, + "grad_norm": 0.6683341217487696, + "learning_rate": 0.003, + "loss": 4.0153, + "step": 47156 + }, + { + "epoch": 0.47157, + "grad_norm": 0.7742963615748174, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 47157 + }, + { + "epoch": 0.47158, + "grad_norm": 0.8761194974469138, + "learning_rate": 0.003, + "loss": 4.0061, + "step": 47158 + }, + { + "epoch": 0.47159, + "grad_norm": 1.0043677544131149, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 47159 + }, + { + "epoch": 0.4716, + "grad_norm": 1.278774158033752, + "learning_rate": 0.003, + "loss": 4.0733, + "step": 47160 + }, + { + "epoch": 0.47161, + "grad_norm": 0.7431863385526053, + "learning_rate": 0.003, + "loss": 4.0104, + "step": 47161 + }, + { + "epoch": 0.47162, + "grad_norm": 0.7026445597009295, + "learning_rate": 0.003, + "loss": 4.0179, + "step": 47162 + }, + { + "epoch": 0.47163, + "grad_norm": 0.7357391816571779, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 47163 + }, + { + "epoch": 0.47164, + "grad_norm": 0.7037087375972438, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 47164 + }, + { + "epoch": 0.47165, + "grad_norm": 0.7153858762092195, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 47165 + }, + { + "epoch": 0.47166, + "grad_norm": 0.8242816951594741, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 47166 + }, + { + "epoch": 0.47167, + "grad_norm": 0.8652569185772323, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 47167 + }, + { + "epoch": 0.47168, + "grad_norm": 0.8887902317394442, + "learning_rate": 0.003, + "loss": 4.0588, + "step": 47168 + }, + { + "epoch": 0.47169, + "grad_norm": 1.0222834334924875, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 47169 + }, + { + "epoch": 0.4717, + "grad_norm": 1.0170262473884446, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 47170 + }, + { + "epoch": 0.47171, + "grad_norm": 1.0106081550930366, + "learning_rate": 0.003, + "loss": 4.0547, + "step": 47171 + }, + { + "epoch": 0.47172, + "grad_norm": 0.8367728294542435, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 47172 + }, + { + "epoch": 0.47173, + "grad_norm": 0.9543004212291549, + "learning_rate": 0.003, + "loss": 4.05, + "step": 47173 + }, + { + "epoch": 0.47174, + "grad_norm": 1.0872284302246005, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 47174 + }, + { + "epoch": 0.47175, + "grad_norm": 0.9317784451627655, + "learning_rate": 0.003, + "loss": 4.0352, + "step": 47175 + }, + { + "epoch": 0.47176, + "grad_norm": 0.8431197887459637, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 47176 + }, + { + "epoch": 0.47177, + "grad_norm": 0.8548576599234761, + "learning_rate": 0.003, + "loss": 4.0045, + "step": 47177 + }, + { + "epoch": 0.47178, + "grad_norm": 1.029548006464559, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 47178 + }, + { + "epoch": 0.47179, + "grad_norm": 0.9778051404717291, + "learning_rate": 0.003, + "loss": 4.0549, + "step": 47179 + }, + { + "epoch": 0.4718, + "grad_norm": 0.9561208883708328, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 47180 + }, + { + "epoch": 0.47181, + "grad_norm": 0.9060894564531494, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 47181 + }, + { + "epoch": 0.47182, + "grad_norm": 0.7578655157919821, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 47182 + }, + { + "epoch": 0.47183, + "grad_norm": 0.6817841348690553, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 47183 + }, + { + "epoch": 0.47184, + "grad_norm": 0.7143411663930916, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 47184 + }, + { + "epoch": 0.47185, + "grad_norm": 0.8134241442010272, + "learning_rate": 0.003, + "loss": 4.0553, + "step": 47185 + }, + { + "epoch": 0.47186, + "grad_norm": 0.8871204141740229, + "learning_rate": 0.003, + "loss": 4.0419, + "step": 47186 + }, + { + "epoch": 0.47187, + "grad_norm": 0.8036587748576465, + "learning_rate": 0.003, + "loss": 4.0634, + "step": 47187 + }, + { + "epoch": 0.47188, + "grad_norm": 1.0046155054386001, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 47188 + }, + { + "epoch": 0.47189, + "grad_norm": 1.2287229712980103, + "learning_rate": 0.003, + "loss": 4.0794, + "step": 47189 + }, + { + "epoch": 0.4719, + "grad_norm": 0.7093009597977656, + "learning_rate": 0.003, + "loss": 4.0727, + "step": 47190 + }, + { + "epoch": 0.47191, + "grad_norm": 0.6812801640282274, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 47191 + }, + { + "epoch": 0.47192, + "grad_norm": 0.6894346846440752, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 47192 + }, + { + "epoch": 0.47193, + "grad_norm": 0.7174440395216384, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 47193 + }, + { + "epoch": 0.47194, + "grad_norm": 0.7655287898885125, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 47194 + }, + { + "epoch": 0.47195, + "grad_norm": 1.0182086848491347, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 47195 + }, + { + "epoch": 0.47196, + "grad_norm": 1.3175138493008451, + "learning_rate": 0.003, + "loss": 4.0702, + "step": 47196 + }, + { + "epoch": 0.47197, + "grad_norm": 0.7232264930948255, + "learning_rate": 0.003, + "loss": 4.0319, + "step": 47197 + }, + { + "epoch": 0.47198, + "grad_norm": 0.664748046488526, + "learning_rate": 0.003, + "loss": 4.0241, + "step": 47198 + }, + { + "epoch": 0.47199, + "grad_norm": 0.6432935055222851, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 47199 + }, + { + "epoch": 0.472, + "grad_norm": 0.6479834800642958, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 47200 + }, + { + "epoch": 0.47201, + "grad_norm": 0.6587187564910674, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 47201 + }, + { + "epoch": 0.47202, + "grad_norm": 0.7373617729580111, + "learning_rate": 0.003, + "loss": 4.035, + "step": 47202 + }, + { + "epoch": 0.47203, + "grad_norm": 0.7093534276245955, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 47203 + }, + { + "epoch": 0.47204, + "grad_norm": 0.7483890142223324, + "learning_rate": 0.003, + "loss": 4.028, + "step": 47204 + }, + { + "epoch": 0.47205, + "grad_norm": 0.9381533527388979, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 47205 + }, + { + "epoch": 0.47206, + "grad_norm": 1.0506477986010516, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 47206 + }, + { + "epoch": 0.47207, + "grad_norm": 0.976377429320071, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 47207 + }, + { + "epoch": 0.47208, + "grad_norm": 0.9702901784460246, + "learning_rate": 0.003, + "loss": 4.0533, + "step": 47208 + }, + { + "epoch": 0.47209, + "grad_norm": 1.0328299276893538, + "learning_rate": 0.003, + "loss": 4.0474, + "step": 47209 + }, + { + "epoch": 0.4721, + "grad_norm": 1.0702212094754142, + "learning_rate": 0.003, + "loss": 4.0718, + "step": 47210 + }, + { + "epoch": 0.47211, + "grad_norm": 1.2355276223315645, + "learning_rate": 0.003, + "loss": 4.0189, + "step": 47211 + }, + { + "epoch": 0.47212, + "grad_norm": 0.8641709313675342, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 47212 + }, + { + "epoch": 0.47213, + "grad_norm": 0.8941470249210457, + "learning_rate": 0.003, + "loss": 4.0275, + "step": 47213 + }, + { + "epoch": 0.47214, + "grad_norm": 1.0058851695980897, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 47214 + }, + { + "epoch": 0.47215, + "grad_norm": 1.131878038895641, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 47215 + }, + { + "epoch": 0.47216, + "grad_norm": 0.8880430474049283, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 47216 + }, + { + "epoch": 0.47217, + "grad_norm": 0.8438853387493687, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 47217 + }, + { + "epoch": 0.47218, + "grad_norm": 0.982942888833157, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 47218 + }, + { + "epoch": 0.47219, + "grad_norm": 0.9948316510058648, + "learning_rate": 0.003, + "loss": 4.0015, + "step": 47219 + }, + { + "epoch": 0.4722, + "grad_norm": 0.8735070258436132, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 47220 + }, + { + "epoch": 0.47221, + "grad_norm": 0.7094516507944203, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 47221 + }, + { + "epoch": 0.47222, + "grad_norm": 0.6406723128783892, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 47222 + }, + { + "epoch": 0.47223, + "grad_norm": 0.5749369688150755, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 47223 + }, + { + "epoch": 0.47224, + "grad_norm": 0.5544047457357626, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 47224 + }, + { + "epoch": 0.47225, + "grad_norm": 0.49647092095623957, + "learning_rate": 0.003, + "loss": 4.021, + "step": 47225 + }, + { + "epoch": 0.47226, + "grad_norm": 0.5748552109227354, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 47226 + }, + { + "epoch": 0.47227, + "grad_norm": 0.6047348303982033, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 47227 + }, + { + "epoch": 0.47228, + "grad_norm": 0.6449969571561883, + "learning_rate": 0.003, + "loss": 4.04, + "step": 47228 + }, + { + "epoch": 0.47229, + "grad_norm": 0.7051118455664436, + "learning_rate": 0.003, + "loss": 3.9891, + "step": 47229 + }, + { + "epoch": 0.4723, + "grad_norm": 0.8279274347487426, + "learning_rate": 0.003, + "loss": 4.0334, + "step": 47230 + }, + { + "epoch": 0.47231, + "grad_norm": 1.0134005510071218, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 47231 + }, + { + "epoch": 0.47232, + "grad_norm": 1.3002744583181998, + "learning_rate": 0.003, + "loss": 4.0348, + "step": 47232 + }, + { + "epoch": 0.47233, + "grad_norm": 0.6223435705770182, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 47233 + }, + { + "epoch": 0.47234, + "grad_norm": 0.7342838693159326, + "learning_rate": 0.003, + "loss": 4.0643, + "step": 47234 + }, + { + "epoch": 0.47235, + "grad_norm": 0.8141124446805134, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 47235 + }, + { + "epoch": 0.47236, + "grad_norm": 0.8620906434925539, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 47236 + }, + { + "epoch": 0.47237, + "grad_norm": 0.8880158149645527, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 47237 + }, + { + "epoch": 0.47238, + "grad_norm": 0.9417275144715768, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 47238 + }, + { + "epoch": 0.47239, + "grad_norm": 0.975884113309621, + "learning_rate": 0.003, + "loss": 4.0507, + "step": 47239 + }, + { + "epoch": 0.4724, + "grad_norm": 0.9869619790926335, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 47240 + }, + { + "epoch": 0.47241, + "grad_norm": 1.0052527623555476, + "learning_rate": 0.003, + "loss": 4.0436, + "step": 47241 + }, + { + "epoch": 0.47242, + "grad_norm": 1.0467492856876233, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 47242 + }, + { + "epoch": 0.47243, + "grad_norm": 0.9728715946927549, + "learning_rate": 0.003, + "loss": 4.0299, + "step": 47243 + }, + { + "epoch": 0.47244, + "grad_norm": 0.9102265201644288, + "learning_rate": 0.003, + "loss": 4.0502, + "step": 47244 + }, + { + "epoch": 0.47245, + "grad_norm": 0.9076319929535642, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 47245 + }, + { + "epoch": 0.47246, + "grad_norm": 0.8817834324067597, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 47246 + }, + { + "epoch": 0.47247, + "grad_norm": 0.9505031924090699, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 47247 + }, + { + "epoch": 0.47248, + "grad_norm": 1.0535993007113085, + "learning_rate": 0.003, + "loss": 4.0697, + "step": 47248 + }, + { + "epoch": 0.47249, + "grad_norm": 0.8611332935359864, + "learning_rate": 0.003, + "loss": 4.0155, + "step": 47249 + }, + { + "epoch": 0.4725, + "grad_norm": 0.974511202038698, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 47250 + }, + { + "epoch": 0.47251, + "grad_norm": 1.0535127974262668, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 47251 + }, + { + "epoch": 0.47252, + "grad_norm": 0.8743557633816438, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 47252 + }, + { + "epoch": 0.47253, + "grad_norm": 0.7687918513635154, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 47253 + }, + { + "epoch": 0.47254, + "grad_norm": 0.9218898963821052, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 47254 + }, + { + "epoch": 0.47255, + "grad_norm": 1.1393777954578732, + "learning_rate": 0.003, + "loss": 4.0659, + "step": 47255 + }, + { + "epoch": 0.47256, + "grad_norm": 0.9223142961108203, + "learning_rate": 0.003, + "loss": 4.0008, + "step": 47256 + }, + { + "epoch": 0.47257, + "grad_norm": 0.7653302199479823, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 47257 + }, + { + "epoch": 0.47258, + "grad_norm": 0.7750137867772536, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 47258 + }, + { + "epoch": 0.47259, + "grad_norm": 0.7938032338164077, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 47259 + }, + { + "epoch": 0.4726, + "grad_norm": 0.7561473787226676, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 47260 + }, + { + "epoch": 0.47261, + "grad_norm": 0.6388764582686443, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 47261 + }, + { + "epoch": 0.47262, + "grad_norm": 0.6084938434815654, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 47262 + }, + { + "epoch": 0.47263, + "grad_norm": 0.5858430570318635, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 47263 + }, + { + "epoch": 0.47264, + "grad_norm": 0.6562193525717518, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 47264 + }, + { + "epoch": 0.47265, + "grad_norm": 0.8377125207165029, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 47265 + }, + { + "epoch": 0.47266, + "grad_norm": 1.0494460661826768, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 47266 + }, + { + "epoch": 0.47267, + "grad_norm": 1.1396809671061041, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 47267 + }, + { + "epoch": 0.47268, + "grad_norm": 0.8919069520711514, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 47268 + }, + { + "epoch": 0.47269, + "grad_norm": 0.8672534357159963, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 47269 + }, + { + "epoch": 0.4727, + "grad_norm": 0.8581040033128458, + "learning_rate": 0.003, + "loss": 4.0024, + "step": 47270 + }, + { + "epoch": 0.47271, + "grad_norm": 0.8233684812044479, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 47271 + }, + { + "epoch": 0.47272, + "grad_norm": 0.8385926079744587, + "learning_rate": 0.003, + "loss": 4.0583, + "step": 47272 + }, + { + "epoch": 0.47273, + "grad_norm": 0.8431510284392417, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 47273 + }, + { + "epoch": 0.47274, + "grad_norm": 0.7672418872494342, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 47274 + }, + { + "epoch": 0.47275, + "grad_norm": 0.7321117707653297, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 47275 + }, + { + "epoch": 0.47276, + "grad_norm": 0.7045827474148656, + "learning_rate": 0.003, + "loss": 3.9775, + "step": 47276 + }, + { + "epoch": 0.47277, + "grad_norm": 0.6626219458538147, + "learning_rate": 0.003, + "loss": 4.041, + "step": 47277 + }, + { + "epoch": 0.47278, + "grad_norm": 0.8308211117342742, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 47278 + }, + { + "epoch": 0.47279, + "grad_norm": 1.0310815730688947, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 47279 + }, + { + "epoch": 0.4728, + "grad_norm": 1.0851635419197394, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 47280 + }, + { + "epoch": 0.47281, + "grad_norm": 0.7752330278270029, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 47281 + }, + { + "epoch": 0.47282, + "grad_norm": 0.6600220645292626, + "learning_rate": 0.003, + "loss": 4.024, + "step": 47282 + }, + { + "epoch": 0.47283, + "grad_norm": 0.6853489950699372, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 47283 + }, + { + "epoch": 0.47284, + "grad_norm": 0.7809574645548271, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 47284 + }, + { + "epoch": 0.47285, + "grad_norm": 0.9487067844296785, + "learning_rate": 0.003, + "loss": 4.0138, + "step": 47285 + }, + { + "epoch": 0.47286, + "grad_norm": 1.0253777540149478, + "learning_rate": 0.003, + "loss": 4.0022, + "step": 47286 + }, + { + "epoch": 0.47287, + "grad_norm": 0.8807871957018003, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 47287 + }, + { + "epoch": 0.47288, + "grad_norm": 0.738636168028379, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 47288 + }, + { + "epoch": 0.47289, + "grad_norm": 0.7217652881457545, + "learning_rate": 0.003, + "loss": 4.04, + "step": 47289 + }, + { + "epoch": 0.4729, + "grad_norm": 0.708868017981002, + "learning_rate": 0.003, + "loss": 4.0187, + "step": 47290 + }, + { + "epoch": 0.47291, + "grad_norm": 0.742618615084672, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 47291 + }, + { + "epoch": 0.47292, + "grad_norm": 0.6919137830739185, + "learning_rate": 0.003, + "loss": 4.0173, + "step": 47292 + }, + { + "epoch": 0.47293, + "grad_norm": 0.650172276141663, + "learning_rate": 0.003, + "loss": 3.9977, + "step": 47293 + }, + { + "epoch": 0.47294, + "grad_norm": 0.7524721560760427, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 47294 + }, + { + "epoch": 0.47295, + "grad_norm": 0.8720590931806712, + "learning_rate": 0.003, + "loss": 4.0039, + "step": 47295 + }, + { + "epoch": 0.47296, + "grad_norm": 0.9955783695321304, + "learning_rate": 0.003, + "loss": 4.0032, + "step": 47296 + }, + { + "epoch": 0.47297, + "grad_norm": 1.115850746587089, + "learning_rate": 0.003, + "loss": 4.0358, + "step": 47297 + }, + { + "epoch": 0.47298, + "grad_norm": 0.9592326010422477, + "learning_rate": 0.003, + "loss": 4.0508, + "step": 47298 + }, + { + "epoch": 0.47299, + "grad_norm": 1.0289288316031433, + "learning_rate": 0.003, + "loss": 4.0565, + "step": 47299 + }, + { + "epoch": 0.473, + "grad_norm": 0.9955049102639716, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 47300 + }, + { + "epoch": 0.47301, + "grad_norm": 0.9797328076567569, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 47301 + }, + { + "epoch": 0.47302, + "grad_norm": 0.99066827916265, + "learning_rate": 0.003, + "loss": 4.0802, + "step": 47302 + }, + { + "epoch": 0.47303, + "grad_norm": 0.9318275437420245, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 47303 + }, + { + "epoch": 0.47304, + "grad_norm": 1.0381933899709759, + "learning_rate": 0.003, + "loss": 4.0525, + "step": 47304 + }, + { + "epoch": 0.47305, + "grad_norm": 0.8950750702102651, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 47305 + }, + { + "epoch": 0.47306, + "grad_norm": 0.8222993168268212, + "learning_rate": 0.003, + "loss": 4.0091, + "step": 47306 + }, + { + "epoch": 0.47307, + "grad_norm": 0.8159694315848991, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 47307 + }, + { + "epoch": 0.47308, + "grad_norm": 0.9153331658713612, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 47308 + }, + { + "epoch": 0.47309, + "grad_norm": 1.085840634464272, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 47309 + }, + { + "epoch": 0.4731, + "grad_norm": 0.8057238855195459, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 47310 + }, + { + "epoch": 0.47311, + "grad_norm": 0.6768965498683787, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 47311 + }, + { + "epoch": 0.47312, + "grad_norm": 0.6562004342094889, + "learning_rate": 0.003, + "loss": 4.0145, + "step": 47312 + }, + { + "epoch": 0.47313, + "grad_norm": 0.6303326616281757, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 47313 + }, + { + "epoch": 0.47314, + "grad_norm": 0.5909603342002195, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 47314 + }, + { + "epoch": 0.47315, + "grad_norm": 0.6172540707453259, + "learning_rate": 0.003, + "loss": 4.0527, + "step": 47315 + }, + { + "epoch": 0.47316, + "grad_norm": 0.7167777574555044, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 47316 + }, + { + "epoch": 0.47317, + "grad_norm": 0.8736869178933756, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 47317 + }, + { + "epoch": 0.47318, + "grad_norm": 0.9102994155901876, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 47318 + }, + { + "epoch": 0.47319, + "grad_norm": 0.9494172218160205, + "learning_rate": 0.003, + "loss": 4.0571, + "step": 47319 + }, + { + "epoch": 0.4732, + "grad_norm": 1.0651574183238401, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 47320 + }, + { + "epoch": 0.47321, + "grad_norm": 0.7988860875583681, + "learning_rate": 0.003, + "loss": 3.9957, + "step": 47321 + }, + { + "epoch": 0.47322, + "grad_norm": 0.7836905054064913, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 47322 + }, + { + "epoch": 0.47323, + "grad_norm": 0.7300749856146049, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 47323 + }, + { + "epoch": 0.47324, + "grad_norm": 0.6998549159198433, + "learning_rate": 0.003, + "loss": 3.9861, + "step": 47324 + }, + { + "epoch": 0.47325, + "grad_norm": 0.6809144236755168, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 47325 + }, + { + "epoch": 0.47326, + "grad_norm": 0.6776765684576294, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 47326 + }, + { + "epoch": 0.47327, + "grad_norm": 0.6821112273490095, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 47327 + }, + { + "epoch": 0.47328, + "grad_norm": 0.7327768402299167, + "learning_rate": 0.003, + "loss": 4.0219, + "step": 47328 + }, + { + "epoch": 0.47329, + "grad_norm": 0.8480202189589746, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 47329 + }, + { + "epoch": 0.4733, + "grad_norm": 0.9578344801873018, + "learning_rate": 0.003, + "loss": 4.0434, + "step": 47330 + }, + { + "epoch": 0.47331, + "grad_norm": 1.116358109744587, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 47331 + }, + { + "epoch": 0.47332, + "grad_norm": 0.7487123518315121, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 47332 + }, + { + "epoch": 0.47333, + "grad_norm": 0.5587608792188105, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 47333 + }, + { + "epoch": 0.47334, + "grad_norm": 0.7854361656242309, + "learning_rate": 0.003, + "loss": 4.0132, + "step": 47334 + }, + { + "epoch": 0.47335, + "grad_norm": 0.9185084903404857, + "learning_rate": 0.003, + "loss": 4.0088, + "step": 47335 + }, + { + "epoch": 0.47336, + "grad_norm": 1.0319447224502323, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 47336 + }, + { + "epoch": 0.47337, + "grad_norm": 0.915501320268583, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 47337 + }, + { + "epoch": 0.47338, + "grad_norm": 0.8725805454077326, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 47338 + }, + { + "epoch": 0.47339, + "grad_norm": 1.0014773644397217, + "learning_rate": 0.003, + "loss": 4.0403, + "step": 47339 + }, + { + "epoch": 0.4734, + "grad_norm": 1.0923661455172746, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 47340 + }, + { + "epoch": 0.47341, + "grad_norm": 0.9783663670935802, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 47341 + }, + { + "epoch": 0.47342, + "grad_norm": 1.1691520437374032, + "learning_rate": 0.003, + "loss": 4.0516, + "step": 47342 + }, + { + "epoch": 0.47343, + "grad_norm": 1.0289350530806467, + "learning_rate": 0.003, + "loss": 4.0768, + "step": 47343 + }, + { + "epoch": 0.47344, + "grad_norm": 0.9469928027674268, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 47344 + }, + { + "epoch": 0.47345, + "grad_norm": 0.8555299609892123, + "learning_rate": 0.003, + "loss": 4.047, + "step": 47345 + }, + { + "epoch": 0.47346, + "grad_norm": 0.781482824703136, + "learning_rate": 0.003, + "loss": 4.0349, + "step": 47346 + }, + { + "epoch": 0.47347, + "grad_norm": 0.7688595134829703, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 47347 + }, + { + "epoch": 0.47348, + "grad_norm": 0.7115232705850402, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 47348 + }, + { + "epoch": 0.47349, + "grad_norm": 0.6582927594091602, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 47349 + }, + { + "epoch": 0.4735, + "grad_norm": 0.6757664917957404, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 47350 + }, + { + "epoch": 0.47351, + "grad_norm": 0.6626269203563512, + "learning_rate": 0.003, + "loss": 4.0475, + "step": 47351 + }, + { + "epoch": 0.47352, + "grad_norm": 0.7400397380765007, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 47352 + }, + { + "epoch": 0.47353, + "grad_norm": 0.6215900656668067, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 47353 + }, + { + "epoch": 0.47354, + "grad_norm": 0.595424454394542, + "learning_rate": 0.003, + "loss": 3.9912, + "step": 47354 + }, + { + "epoch": 0.47355, + "grad_norm": 0.6522331963862413, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 47355 + }, + { + "epoch": 0.47356, + "grad_norm": 0.826348938969325, + "learning_rate": 0.003, + "loss": 4.0841, + "step": 47356 + }, + { + "epoch": 0.47357, + "grad_norm": 1.0392102051357057, + "learning_rate": 0.003, + "loss": 3.9826, + "step": 47357 + }, + { + "epoch": 0.47358, + "grad_norm": 1.0193877064858354, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 47358 + }, + { + "epoch": 0.47359, + "grad_norm": 1.077080461868333, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 47359 + }, + { + "epoch": 0.4736, + "grad_norm": 1.0018826757178503, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 47360 + }, + { + "epoch": 0.47361, + "grad_norm": 0.859356944805961, + "learning_rate": 0.003, + "loss": 4.0077, + "step": 47361 + }, + { + "epoch": 0.47362, + "grad_norm": 0.8686902154431576, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 47362 + }, + { + "epoch": 0.47363, + "grad_norm": 0.945971291340774, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 47363 + }, + { + "epoch": 0.47364, + "grad_norm": 0.9803789392544118, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 47364 + }, + { + "epoch": 0.47365, + "grad_norm": 0.8802410633737254, + "learning_rate": 0.003, + "loss": 4.0432, + "step": 47365 + }, + { + "epoch": 0.47366, + "grad_norm": 0.9016225896696758, + "learning_rate": 0.003, + "loss": 4.0538, + "step": 47366 + }, + { + "epoch": 0.47367, + "grad_norm": 0.9219821603756622, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 47367 + }, + { + "epoch": 0.47368, + "grad_norm": 0.8523621039750451, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 47368 + }, + { + "epoch": 0.47369, + "grad_norm": 0.9881659415935139, + "learning_rate": 0.003, + "loss": 4.037, + "step": 47369 + }, + { + "epoch": 0.4737, + "grad_norm": 1.092206564529014, + "learning_rate": 0.003, + "loss": 4.0422, + "step": 47370 + }, + { + "epoch": 0.47371, + "grad_norm": 1.048609912170502, + "learning_rate": 0.003, + "loss": 4.0425, + "step": 47371 + }, + { + "epoch": 0.47372, + "grad_norm": 0.9616623777421794, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 47372 + }, + { + "epoch": 0.47373, + "grad_norm": 0.9307251899262888, + "learning_rate": 0.003, + "loss": 4.0245, + "step": 47373 + }, + { + "epoch": 0.47374, + "grad_norm": 0.8272463266478862, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 47374 + }, + { + "epoch": 0.47375, + "grad_norm": 0.83302809859009, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 47375 + }, + { + "epoch": 0.47376, + "grad_norm": 0.8987986466825629, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 47376 + }, + { + "epoch": 0.47377, + "grad_norm": 0.8572016272275985, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 47377 + }, + { + "epoch": 0.47378, + "grad_norm": 0.8282535663682714, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 47378 + }, + { + "epoch": 0.47379, + "grad_norm": 0.7740209629419746, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 47379 + }, + { + "epoch": 0.4738, + "grad_norm": 0.7834372462086606, + "learning_rate": 0.003, + "loss": 4.031, + "step": 47380 + }, + { + "epoch": 0.47381, + "grad_norm": 0.8753600233135483, + "learning_rate": 0.003, + "loss": 4.0439, + "step": 47381 + }, + { + "epoch": 0.47382, + "grad_norm": 1.0608899380535475, + "learning_rate": 0.003, + "loss": 4.0651, + "step": 47382 + }, + { + "epoch": 0.47383, + "grad_norm": 1.2413932154926401, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 47383 + }, + { + "epoch": 0.47384, + "grad_norm": 0.685069102980098, + "learning_rate": 0.003, + "loss": 4.0285, + "step": 47384 + }, + { + "epoch": 0.47385, + "grad_norm": 0.6509095823807903, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 47385 + }, + { + "epoch": 0.47386, + "grad_norm": 0.6567206006764176, + "learning_rate": 0.003, + "loss": 4.0267, + "step": 47386 + }, + { + "epoch": 0.47387, + "grad_norm": 0.7017858384223633, + "learning_rate": 0.003, + "loss": 4.0526, + "step": 47387 + }, + { + "epoch": 0.47388, + "grad_norm": 0.7795223818758318, + "learning_rate": 0.003, + "loss": 4.0539, + "step": 47388 + }, + { + "epoch": 0.47389, + "grad_norm": 0.7885976600278327, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 47389 + }, + { + "epoch": 0.4739, + "grad_norm": 0.8735989351098319, + "learning_rate": 0.003, + "loss": 4.0454, + "step": 47390 + }, + { + "epoch": 0.47391, + "grad_norm": 0.8570001625530216, + "learning_rate": 0.003, + "loss": 4.0042, + "step": 47391 + }, + { + "epoch": 0.47392, + "grad_norm": 0.8620713689987887, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 47392 + }, + { + "epoch": 0.47393, + "grad_norm": 0.8160087784757514, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 47393 + }, + { + "epoch": 0.47394, + "grad_norm": 0.7610904538393868, + "learning_rate": 0.003, + "loss": 4.0157, + "step": 47394 + }, + { + "epoch": 0.47395, + "grad_norm": 0.6134081374742693, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 47395 + }, + { + "epoch": 0.47396, + "grad_norm": 0.5717332458657741, + "learning_rate": 0.003, + "loss": 4.0081, + "step": 47396 + }, + { + "epoch": 0.47397, + "grad_norm": 0.5705348232161769, + "learning_rate": 0.003, + "loss": 4.0317, + "step": 47397 + }, + { + "epoch": 0.47398, + "grad_norm": 0.6031765360340591, + "learning_rate": 0.003, + "loss": 3.9977, + "step": 47398 + }, + { + "epoch": 0.47399, + "grad_norm": 0.5283724904006373, + "learning_rate": 0.003, + "loss": 4.0094, + "step": 47399 + }, + { + "epoch": 0.474, + "grad_norm": 0.5419442171859356, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 47400 + }, + { + "epoch": 0.47401, + "grad_norm": 0.5765055064799433, + "learning_rate": 0.003, + "loss": 4.015, + "step": 47401 + }, + { + "epoch": 0.47402, + "grad_norm": 0.6123501951864113, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 47402 + }, + { + "epoch": 0.47403, + "grad_norm": 0.754041831225555, + "learning_rate": 0.003, + "loss": 4.04, + "step": 47403 + }, + { + "epoch": 0.47404, + "grad_norm": 0.9284853913792364, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 47404 + }, + { + "epoch": 0.47405, + "grad_norm": 1.1167608630026442, + "learning_rate": 0.003, + "loss": 4.024, + "step": 47405 + }, + { + "epoch": 0.47406, + "grad_norm": 0.7832694434219292, + "learning_rate": 0.003, + "loss": 4.0262, + "step": 47406 + }, + { + "epoch": 0.47407, + "grad_norm": 0.6503943635839039, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 47407 + }, + { + "epoch": 0.47408, + "grad_norm": 0.6632261411703789, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 47408 + }, + { + "epoch": 0.47409, + "grad_norm": 0.6511466186779484, + "learning_rate": 0.003, + "loss": 4.0214, + "step": 47409 + }, + { + "epoch": 0.4741, + "grad_norm": 0.7487664400805467, + "learning_rate": 0.003, + "loss": 4.0072, + "step": 47410 + }, + { + "epoch": 0.47411, + "grad_norm": 0.8206706152448554, + "learning_rate": 0.003, + "loss": 4.0278, + "step": 47411 + }, + { + "epoch": 0.47412, + "grad_norm": 0.8317316241270534, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 47412 + }, + { + "epoch": 0.47413, + "grad_norm": 0.7899280208841586, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 47413 + }, + { + "epoch": 0.47414, + "grad_norm": 0.742100911198747, + "learning_rate": 0.003, + "loss": 3.9973, + "step": 47414 + }, + { + "epoch": 0.47415, + "grad_norm": 0.6778216814660342, + "learning_rate": 0.003, + "loss": 3.9814, + "step": 47415 + }, + { + "epoch": 0.47416, + "grad_norm": 0.7202673210147933, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 47416 + }, + { + "epoch": 0.47417, + "grad_norm": 1.0537975849141334, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 47417 + }, + { + "epoch": 0.47418, + "grad_norm": 1.4167806639479534, + "learning_rate": 0.003, + "loss": 4.0213, + "step": 47418 + }, + { + "epoch": 0.47419, + "grad_norm": 0.793954331347911, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 47419 + }, + { + "epoch": 0.4742, + "grad_norm": 0.722934625906173, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 47420 + }, + { + "epoch": 0.47421, + "grad_norm": 0.7206408485137508, + "learning_rate": 0.003, + "loss": 4.0197, + "step": 47421 + }, + { + "epoch": 0.47422, + "grad_norm": 0.8151373830214094, + "learning_rate": 0.003, + "loss": 4.01, + "step": 47422 + }, + { + "epoch": 0.47423, + "grad_norm": 0.8191459670603958, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 47423 + }, + { + "epoch": 0.47424, + "grad_norm": 0.8766936887053245, + "learning_rate": 0.003, + "loss": 3.9989, + "step": 47424 + }, + { + "epoch": 0.47425, + "grad_norm": 0.8168405999523051, + "learning_rate": 0.003, + "loss": 4.0095, + "step": 47425 + }, + { + "epoch": 0.47426, + "grad_norm": 0.8339860033719891, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 47426 + }, + { + "epoch": 0.47427, + "grad_norm": 0.9604513631885683, + "learning_rate": 0.003, + "loss": 4.0405, + "step": 47427 + }, + { + "epoch": 0.47428, + "grad_norm": 1.1274214556384532, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 47428 + }, + { + "epoch": 0.47429, + "grad_norm": 0.9938843318146844, + "learning_rate": 0.003, + "loss": 4.0606, + "step": 47429 + }, + { + "epoch": 0.4743, + "grad_norm": 0.9932218240787922, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 47430 + }, + { + "epoch": 0.47431, + "grad_norm": 1.025307548445563, + "learning_rate": 0.003, + "loss": 4.0592, + "step": 47431 + }, + { + "epoch": 0.47432, + "grad_norm": 1.0847837919474799, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 47432 + }, + { + "epoch": 0.47433, + "grad_norm": 0.9438863724839264, + "learning_rate": 0.003, + "loss": 4.0605, + "step": 47433 + }, + { + "epoch": 0.47434, + "grad_norm": 1.1441165082430305, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 47434 + }, + { + "epoch": 0.47435, + "grad_norm": 1.026298090674446, + "learning_rate": 0.003, + "loss": 4.0144, + "step": 47435 + }, + { + "epoch": 0.47436, + "grad_norm": 1.094564334115231, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 47436 + }, + { + "epoch": 0.47437, + "grad_norm": 1.0015827897363327, + "learning_rate": 0.003, + "loss": 4.0453, + "step": 47437 + }, + { + "epoch": 0.47438, + "grad_norm": 1.0679874693301958, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 47438 + }, + { + "epoch": 0.47439, + "grad_norm": 0.8642473310541415, + "learning_rate": 0.003, + "loss": 4.009, + "step": 47439 + }, + { + "epoch": 0.4744, + "grad_norm": 0.8052250108500079, + "learning_rate": 0.003, + "loss": 4.0257, + "step": 47440 + }, + { + "epoch": 0.47441, + "grad_norm": 0.9923626188358333, + "learning_rate": 0.003, + "loss": 4.0414, + "step": 47441 + }, + { + "epoch": 0.47442, + "grad_norm": 0.9977958062094385, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 47442 + }, + { + "epoch": 0.47443, + "grad_norm": 0.9554007560510567, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 47443 + }, + { + "epoch": 0.47444, + "grad_norm": 0.9424590423928642, + "learning_rate": 0.003, + "loss": 4.0642, + "step": 47444 + }, + { + "epoch": 0.47445, + "grad_norm": 0.9874564977188993, + "learning_rate": 0.003, + "loss": 4.0336, + "step": 47445 + }, + { + "epoch": 0.47446, + "grad_norm": 0.953576023007651, + "learning_rate": 0.003, + "loss": 4.0373, + "step": 47446 + }, + { + "epoch": 0.47447, + "grad_norm": 0.910520820247866, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 47447 + }, + { + "epoch": 0.47448, + "grad_norm": 1.023833108447027, + "learning_rate": 0.003, + "loss": 4.0739, + "step": 47448 + }, + { + "epoch": 0.47449, + "grad_norm": 0.881295414980877, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 47449 + }, + { + "epoch": 0.4745, + "grad_norm": 0.7475415324328158, + "learning_rate": 0.003, + "loss": 4.0396, + "step": 47450 + }, + { + "epoch": 0.47451, + "grad_norm": 0.7490733995178952, + "learning_rate": 0.003, + "loss": 4.0125, + "step": 47451 + }, + { + "epoch": 0.47452, + "grad_norm": 0.8625649469474778, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 47452 + }, + { + "epoch": 0.47453, + "grad_norm": 0.8032132513095692, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 47453 + }, + { + "epoch": 0.47454, + "grad_norm": 0.7037950687636105, + "learning_rate": 0.003, + "loss": 4.0523, + "step": 47454 + }, + { + "epoch": 0.47455, + "grad_norm": 0.7517498938338862, + "learning_rate": 0.003, + "loss": 4.0567, + "step": 47455 + }, + { + "epoch": 0.47456, + "grad_norm": 0.7125407565908438, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 47456 + }, + { + "epoch": 0.47457, + "grad_norm": 0.7297377247834073, + "learning_rate": 0.003, + "loss": 4.0362, + "step": 47457 + }, + { + "epoch": 0.47458, + "grad_norm": 0.8831679159145067, + "learning_rate": 0.003, + "loss": 4.0171, + "step": 47458 + }, + { + "epoch": 0.47459, + "grad_norm": 1.134715844288857, + "learning_rate": 0.003, + "loss": 4.0156, + "step": 47459 + }, + { + "epoch": 0.4746, + "grad_norm": 0.8074152252481829, + "learning_rate": 0.003, + "loss": 4.02, + "step": 47460 + }, + { + "epoch": 0.47461, + "grad_norm": 0.5954222005470062, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 47461 + }, + { + "epoch": 0.47462, + "grad_norm": 0.5967607164164894, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 47462 + }, + { + "epoch": 0.47463, + "grad_norm": 0.5592280232867486, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 47463 + }, + { + "epoch": 0.47464, + "grad_norm": 0.5672759782078174, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 47464 + }, + { + "epoch": 0.47465, + "grad_norm": 0.6414464788759173, + "learning_rate": 0.003, + "loss": 4.0395, + "step": 47465 + }, + { + "epoch": 0.47466, + "grad_norm": 0.6946582402173919, + "learning_rate": 0.003, + "loss": 4.0216, + "step": 47466 + }, + { + "epoch": 0.47467, + "grad_norm": 0.7442428991630932, + "learning_rate": 0.003, + "loss": 4.0225, + "step": 47467 + }, + { + "epoch": 0.47468, + "grad_norm": 0.775661482339965, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 47468 + }, + { + "epoch": 0.47469, + "grad_norm": 0.8352452788589969, + "learning_rate": 0.003, + "loss": 4.0, + "step": 47469 + }, + { + "epoch": 0.4747, + "grad_norm": 0.9742513257076323, + "learning_rate": 0.003, + "loss": 4.0364, + "step": 47470 + }, + { + "epoch": 0.47471, + "grad_norm": 1.1619592341275882, + "learning_rate": 0.003, + "loss": 4.0316, + "step": 47471 + }, + { + "epoch": 0.47472, + "grad_norm": 0.8031309298271193, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 47472 + }, + { + "epoch": 0.47473, + "grad_norm": 0.7325206762408147, + "learning_rate": 0.003, + "loss": 4.0341, + "step": 47473 + }, + { + "epoch": 0.47474, + "grad_norm": 0.7449271458712785, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 47474 + }, + { + "epoch": 0.47475, + "grad_norm": 0.8320571845011226, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 47475 + }, + { + "epoch": 0.47476, + "grad_norm": 0.8903985230506578, + "learning_rate": 0.003, + "loss": 4.0232, + "step": 47476 + }, + { + "epoch": 0.47477, + "grad_norm": 0.8626604330002412, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 47477 + }, + { + "epoch": 0.47478, + "grad_norm": 0.8384913823162584, + "learning_rate": 0.003, + "loss": 4.0074, + "step": 47478 + }, + { + "epoch": 0.47479, + "grad_norm": 0.8638680708410613, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 47479 + }, + { + "epoch": 0.4748, + "grad_norm": 0.8360457681658163, + "learning_rate": 0.003, + "loss": 4.0148, + "step": 47480 + }, + { + "epoch": 0.47481, + "grad_norm": 0.8385817522128405, + "learning_rate": 0.003, + "loss": 4.0011, + "step": 47481 + }, + { + "epoch": 0.47482, + "grad_norm": 0.6998705001961895, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 47482 + }, + { + "epoch": 0.47483, + "grad_norm": 0.8094776295345144, + "learning_rate": 0.003, + "loss": 3.999, + "step": 47483 + }, + { + "epoch": 0.47484, + "grad_norm": 0.9633572548107198, + "learning_rate": 0.003, + "loss": 4.0038, + "step": 47484 + }, + { + "epoch": 0.47485, + "grad_norm": 1.048274181367503, + "learning_rate": 0.003, + "loss": 4.0584, + "step": 47485 + }, + { + "epoch": 0.47486, + "grad_norm": 1.0645836229260792, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 47486 + }, + { + "epoch": 0.47487, + "grad_norm": 0.9768616242735658, + "learning_rate": 0.003, + "loss": 4.0061, + "step": 47487 + }, + { + "epoch": 0.47488, + "grad_norm": 0.9717116709539831, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 47488 + }, + { + "epoch": 0.47489, + "grad_norm": 0.9189387249330263, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 47489 + }, + { + "epoch": 0.4749, + "grad_norm": 0.9113754863235535, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 47490 + }, + { + "epoch": 0.47491, + "grad_norm": 0.9032425056368536, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 47491 + }, + { + "epoch": 0.47492, + "grad_norm": 0.9148730220505318, + "learning_rate": 0.003, + "loss": 4.0409, + "step": 47492 + }, + { + "epoch": 0.47493, + "grad_norm": 0.8539462842603944, + "learning_rate": 0.003, + "loss": 4.0514, + "step": 47493 + }, + { + "epoch": 0.47494, + "grad_norm": 0.853931469779715, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 47494 + }, + { + "epoch": 0.47495, + "grad_norm": 0.9138195683421922, + "learning_rate": 0.003, + "loss": 4.048, + "step": 47495 + }, + { + "epoch": 0.47496, + "grad_norm": 0.881687088801394, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 47496 + }, + { + "epoch": 0.47497, + "grad_norm": 0.6811427815079611, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 47497 + }, + { + "epoch": 0.47498, + "grad_norm": 0.611430802206228, + "learning_rate": 0.003, + "loss": 4.0452, + "step": 47498 + }, + { + "epoch": 0.47499, + "grad_norm": 0.6061762766032815, + "learning_rate": 0.003, + "loss": 4.003, + "step": 47499 + }, + { + "epoch": 0.475, + "grad_norm": 0.6199796742529895, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 47500 + }, + { + "epoch": 0.47501, + "grad_norm": 0.6297814686546989, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 47501 + }, + { + "epoch": 0.47502, + "grad_norm": 0.6808355439313556, + "learning_rate": 0.003, + "loss": 4.0438, + "step": 47502 + }, + { + "epoch": 0.47503, + "grad_norm": 0.6976700989479697, + "learning_rate": 0.003, + "loss": 4.0389, + "step": 47503 + }, + { + "epoch": 0.47504, + "grad_norm": 0.8299160875202131, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 47504 + }, + { + "epoch": 0.47505, + "grad_norm": 1.0595208757269945, + "learning_rate": 0.003, + "loss": 4.0479, + "step": 47505 + }, + { + "epoch": 0.47506, + "grad_norm": 1.1514536944694178, + "learning_rate": 0.003, + "loss": 4.0827, + "step": 47506 + }, + { + "epoch": 0.47507, + "grad_norm": 0.6863481906082565, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 47507 + }, + { + "epoch": 0.47508, + "grad_norm": 0.6976941848128039, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 47508 + }, + { + "epoch": 0.47509, + "grad_norm": 0.8403693978076893, + "learning_rate": 0.003, + "loss": 4.0206, + "step": 47509 + }, + { + "epoch": 0.4751, + "grad_norm": 0.8769596052537979, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 47510 + }, + { + "epoch": 0.47511, + "grad_norm": 0.9576636026766997, + "learning_rate": 0.003, + "loss": 4.0286, + "step": 47511 + }, + { + "epoch": 0.47512, + "grad_norm": 0.8941644091451623, + "learning_rate": 0.003, + "loss": 4.0107, + "step": 47512 + }, + { + "epoch": 0.47513, + "grad_norm": 0.7619938216605133, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 47513 + }, + { + "epoch": 0.47514, + "grad_norm": 0.7138506150186605, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 47514 + }, + { + "epoch": 0.47515, + "grad_norm": 0.7733144572498661, + "learning_rate": 0.003, + "loss": 4.017, + "step": 47515 + }, + { + "epoch": 0.47516, + "grad_norm": 0.9545423260201221, + "learning_rate": 0.003, + "loss": 4.0152, + "step": 47516 + }, + { + "epoch": 0.47517, + "grad_norm": 1.1305862635775648, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 47517 + }, + { + "epoch": 0.47518, + "grad_norm": 0.9774238440237615, + "learning_rate": 0.003, + "loss": 4.0563, + "step": 47518 + }, + { + "epoch": 0.47519, + "grad_norm": 0.8888139485855612, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 47519 + }, + { + "epoch": 0.4752, + "grad_norm": 0.8902670648215439, + "learning_rate": 0.003, + "loss": 4.0401, + "step": 47520 + }, + { + "epoch": 0.47521, + "grad_norm": 1.016569231223203, + "learning_rate": 0.003, + "loss": 4.0099, + "step": 47521 + }, + { + "epoch": 0.47522, + "grad_norm": 1.2551973793725895, + "learning_rate": 0.003, + "loss": 4.037, + "step": 47522 + }, + { + "epoch": 0.47523, + "grad_norm": 0.6708821899157166, + "learning_rate": 0.003, + "loss": 4.051, + "step": 47523 + }, + { + "epoch": 0.47524, + "grad_norm": 0.6890399314198608, + "learning_rate": 0.003, + "loss": 4.001, + "step": 47524 + }, + { + "epoch": 0.47525, + "grad_norm": 0.8733077247224074, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 47525 + }, + { + "epoch": 0.47526, + "grad_norm": 1.0034264766903274, + "learning_rate": 0.003, + "loss": 4.0207, + "step": 47526 + }, + { + "epoch": 0.47527, + "grad_norm": 0.9984370788395662, + "learning_rate": 0.003, + "loss": 4.0499, + "step": 47527 + }, + { + "epoch": 0.47528, + "grad_norm": 0.7542827354147795, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 47528 + }, + { + "epoch": 0.47529, + "grad_norm": 0.7355023432437223, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 47529 + }, + { + "epoch": 0.4753, + "grad_norm": 0.8161896610317478, + "learning_rate": 0.003, + "loss": 4.0512, + "step": 47530 + }, + { + "epoch": 0.47531, + "grad_norm": 0.9559531555796885, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 47531 + }, + { + "epoch": 0.47532, + "grad_norm": 1.0407167002432747, + "learning_rate": 0.003, + "loss": 3.9971, + "step": 47532 + }, + { + "epoch": 0.47533, + "grad_norm": 1.0413537372108381, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 47533 + }, + { + "epoch": 0.47534, + "grad_norm": 0.9720815723467959, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 47534 + }, + { + "epoch": 0.47535, + "grad_norm": 0.8291082608556773, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 47535 + }, + { + "epoch": 0.47536, + "grad_norm": 0.8247250013496531, + "learning_rate": 0.003, + "loss": 4.056, + "step": 47536 + }, + { + "epoch": 0.47537, + "grad_norm": 0.9040780394856177, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 47537 + }, + { + "epoch": 0.47538, + "grad_norm": 1.030513110225093, + "learning_rate": 0.003, + "loss": 4.0121, + "step": 47538 + }, + { + "epoch": 0.47539, + "grad_norm": 1.1169514193900465, + "learning_rate": 0.003, + "loss": 4.0548, + "step": 47539 + }, + { + "epoch": 0.4754, + "grad_norm": 0.939858261212193, + "learning_rate": 0.003, + "loss": 4.0487, + "step": 47540 + }, + { + "epoch": 0.47541, + "grad_norm": 1.1151138843267543, + "learning_rate": 0.003, + "loss": 4.0765, + "step": 47541 + }, + { + "epoch": 0.47542, + "grad_norm": 0.9111318793821608, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 47542 + }, + { + "epoch": 0.47543, + "grad_norm": 0.703000850492412, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 47543 + }, + { + "epoch": 0.47544, + "grad_norm": 0.6352626487183202, + "learning_rate": 0.003, + "loss": 4.0406, + "step": 47544 + }, + { + "epoch": 0.47545, + "grad_norm": 0.6637989445478372, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 47545 + }, + { + "epoch": 0.47546, + "grad_norm": 0.7546600337196552, + "learning_rate": 0.003, + "loss": 4.0326, + "step": 47546 + }, + { + "epoch": 0.47547, + "grad_norm": 0.8806884030879463, + "learning_rate": 0.003, + "loss": 4.02, + "step": 47547 + }, + { + "epoch": 0.47548, + "grad_norm": 0.9641700082134267, + "learning_rate": 0.003, + "loss": 4.0383, + "step": 47548 + }, + { + "epoch": 0.47549, + "grad_norm": 1.0591662373159254, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 47549 + }, + { + "epoch": 0.4755, + "grad_norm": 0.9775315681207852, + "learning_rate": 0.003, + "loss": 4.0773, + "step": 47550 + }, + { + "epoch": 0.47551, + "grad_norm": 0.8967870236417337, + "learning_rate": 0.003, + "loss": 4.0408, + "step": 47551 + }, + { + "epoch": 0.47552, + "grad_norm": 0.7977910930076761, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 47552 + }, + { + "epoch": 0.47553, + "grad_norm": 0.7243556263916225, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 47553 + }, + { + "epoch": 0.47554, + "grad_norm": 0.7149072418144832, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 47554 + }, + { + "epoch": 0.47555, + "grad_norm": 0.6788574914447846, + "learning_rate": 0.003, + "loss": 4.0384, + "step": 47555 + }, + { + "epoch": 0.47556, + "grad_norm": 0.7555594609264338, + "learning_rate": 0.003, + "loss": 4.0569, + "step": 47556 + }, + { + "epoch": 0.47557, + "grad_norm": 0.7334441975868764, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 47557 + }, + { + "epoch": 0.47558, + "grad_norm": 0.8188096345431805, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 47558 + }, + { + "epoch": 0.47559, + "grad_norm": 0.8403204918757706, + "learning_rate": 0.003, + "loss": 4.034, + "step": 47559 + }, + { + "epoch": 0.4756, + "grad_norm": 0.7822055803203246, + "learning_rate": 0.003, + "loss": 4.0511, + "step": 47560 + }, + { + "epoch": 0.47561, + "grad_norm": 0.7297288011434149, + "learning_rate": 0.003, + "loss": 3.9843, + "step": 47561 + }, + { + "epoch": 0.47562, + "grad_norm": 0.6648199192718085, + "learning_rate": 0.003, + "loss": 4.0451, + "step": 47562 + }, + { + "epoch": 0.47563, + "grad_norm": 0.5461331734873469, + "learning_rate": 0.003, + "loss": 4.0259, + "step": 47563 + }, + { + "epoch": 0.47564, + "grad_norm": 0.5732115653116798, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 47564 + }, + { + "epoch": 0.47565, + "grad_norm": 0.6068798202069063, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 47565 + }, + { + "epoch": 0.47566, + "grad_norm": 0.7883577594057999, + "learning_rate": 0.003, + "loss": 4.007, + "step": 47566 + }, + { + "epoch": 0.47567, + "grad_norm": 1.1190211239978405, + "learning_rate": 0.003, + "loss": 4.0426, + "step": 47567 + }, + { + "epoch": 0.47568, + "grad_norm": 1.090539839847193, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 47568 + }, + { + "epoch": 0.47569, + "grad_norm": 0.8120821697218863, + "learning_rate": 0.003, + "loss": 3.9932, + "step": 47569 + }, + { + "epoch": 0.4757, + "grad_norm": 0.7202843247028032, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 47570 + }, + { + "epoch": 0.47571, + "grad_norm": 0.787493751721023, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 47571 + }, + { + "epoch": 0.47572, + "grad_norm": 0.8154559406334877, + "learning_rate": 0.003, + "loss": 4.0254, + "step": 47572 + }, + { + "epoch": 0.47573, + "grad_norm": 0.8256027439544361, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 47573 + }, + { + "epoch": 0.47574, + "grad_norm": 0.7789839560337557, + "learning_rate": 0.003, + "loss": 4.0009, + "step": 47574 + }, + { + "epoch": 0.47575, + "grad_norm": 0.8304980596740212, + "learning_rate": 0.003, + "loss": 3.9968, + "step": 47575 + }, + { + "epoch": 0.47576, + "grad_norm": 0.9747812169833906, + "learning_rate": 0.003, + "loss": 4.019, + "step": 47576 + }, + { + "epoch": 0.47577, + "grad_norm": 1.034891718363229, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 47577 + }, + { + "epoch": 0.47578, + "grad_norm": 0.9821236880716878, + "learning_rate": 0.003, + "loss": 4.0223, + "step": 47578 + }, + { + "epoch": 0.47579, + "grad_norm": 1.0118860762654442, + "learning_rate": 0.003, + "loss": 4.044, + "step": 47579 + }, + { + "epoch": 0.4758, + "grad_norm": 0.9204393605203838, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 47580 + }, + { + "epoch": 0.47581, + "grad_norm": 0.9175695026899997, + "learning_rate": 0.003, + "loss": 4.0637, + "step": 47581 + }, + { + "epoch": 0.47582, + "grad_norm": 0.9604818832631192, + "learning_rate": 0.003, + "loss": 4.0412, + "step": 47582 + }, + { + "epoch": 0.47583, + "grad_norm": 1.023863656493665, + "learning_rate": 0.003, + "loss": 4.0297, + "step": 47583 + }, + { + "epoch": 0.47584, + "grad_norm": 1.02201653404024, + "learning_rate": 0.003, + "loss": 4.0492, + "step": 47584 + }, + { + "epoch": 0.47585, + "grad_norm": 0.8884899717317686, + "learning_rate": 0.003, + "loss": 4.0513, + "step": 47585 + }, + { + "epoch": 0.47586, + "grad_norm": 0.8792497976631902, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 47586 + }, + { + "epoch": 0.47587, + "grad_norm": 0.826774973246381, + "learning_rate": 0.003, + "loss": 4.0141, + "step": 47587 + }, + { + "epoch": 0.47588, + "grad_norm": 0.8550628937619118, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 47588 + }, + { + "epoch": 0.47589, + "grad_norm": 0.841913070464551, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 47589 + }, + { + "epoch": 0.4759, + "grad_norm": 0.7939443991753441, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 47590 + }, + { + "epoch": 0.47591, + "grad_norm": 0.8232469965382759, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 47591 + }, + { + "epoch": 0.47592, + "grad_norm": 0.7365095674498748, + "learning_rate": 0.003, + "loss": 4.0056, + "step": 47592 + }, + { + "epoch": 0.47593, + "grad_norm": 0.6230442534293464, + "learning_rate": 0.003, + "loss": 3.9959, + "step": 47593 + }, + { + "epoch": 0.47594, + "grad_norm": 0.5933536870876539, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 47594 + }, + { + "epoch": 0.47595, + "grad_norm": 0.5558306180209382, + "learning_rate": 0.003, + "loss": 4.0381, + "step": 47595 + }, + { + "epoch": 0.47596, + "grad_norm": 0.5273789397808628, + "learning_rate": 0.003, + "loss": 4.0048, + "step": 47596 + }, + { + "epoch": 0.47597, + "grad_norm": 0.5329577234010504, + "learning_rate": 0.003, + "loss": 4.0122, + "step": 47597 + }, + { + "epoch": 0.47598, + "grad_norm": 0.523398537472556, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 47598 + }, + { + "epoch": 0.47599, + "grad_norm": 0.60974232376848, + "learning_rate": 0.003, + "loss": 4.0054, + "step": 47599 + }, + { + "epoch": 0.476, + "grad_norm": 0.7010297207321048, + "learning_rate": 0.003, + "loss": 4.0459, + "step": 47600 + }, + { + "epoch": 0.47601, + "grad_norm": 0.7462992884971767, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 47601 + }, + { + "epoch": 0.47602, + "grad_norm": 0.7729847381757315, + "learning_rate": 0.003, + "loss": 4.011, + "step": 47602 + }, + { + "epoch": 0.47603, + "grad_norm": 0.9101810046833523, + "learning_rate": 0.003, + "loss": 3.9975, + "step": 47603 + }, + { + "epoch": 0.47604, + "grad_norm": 1.0922809327603213, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 47604 + }, + { + "epoch": 0.47605, + "grad_norm": 0.9556431222414963, + "learning_rate": 0.003, + "loss": 4.0501, + "step": 47605 + }, + { + "epoch": 0.47606, + "grad_norm": 0.9002987303248436, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 47606 + }, + { + "epoch": 0.47607, + "grad_norm": 0.813106801018474, + "learning_rate": 0.003, + "loss": 4.0194, + "step": 47607 + }, + { + "epoch": 0.47608, + "grad_norm": 0.7848542152169278, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 47608 + }, + { + "epoch": 0.47609, + "grad_norm": 0.7228751372264803, + "learning_rate": 0.003, + "loss": 4.0268, + "step": 47609 + }, + { + "epoch": 0.4761, + "grad_norm": 0.6415604688712434, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 47610 + }, + { + "epoch": 0.47611, + "grad_norm": 0.6573748769598844, + "learning_rate": 0.003, + "loss": 4.0242, + "step": 47611 + }, + { + "epoch": 0.47612, + "grad_norm": 0.8080108929633145, + "learning_rate": 0.003, + "loss": 4.0342, + "step": 47612 + }, + { + "epoch": 0.47613, + "grad_norm": 1.0066355474385977, + "learning_rate": 0.003, + "loss": 4.0378, + "step": 47613 + }, + { + "epoch": 0.47614, + "grad_norm": 1.2242346903526151, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 47614 + }, + { + "epoch": 0.47615, + "grad_norm": 0.8275590298978482, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 47615 + }, + { + "epoch": 0.47616, + "grad_norm": 0.9972301968584978, + "learning_rate": 0.003, + "loss": 4.0674, + "step": 47616 + }, + { + "epoch": 0.47617, + "grad_norm": 1.2546474655218267, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 47617 + }, + { + "epoch": 0.47618, + "grad_norm": 0.9186833214814879, + "learning_rate": 0.003, + "loss": 4.0243, + "step": 47618 + }, + { + "epoch": 0.47619, + "grad_norm": 1.0322150261933805, + "learning_rate": 0.003, + "loss": 4.042, + "step": 47619 + }, + { + "epoch": 0.4762, + "grad_norm": 1.1268534994013288, + "learning_rate": 0.003, + "loss": 4.0411, + "step": 47620 + }, + { + "epoch": 0.47621, + "grad_norm": 0.81970008287355, + "learning_rate": 0.003, + "loss": 4.0437, + "step": 47621 + }, + { + "epoch": 0.47622, + "grad_norm": 0.797082612255426, + "learning_rate": 0.003, + "loss": 4.0367, + "step": 47622 + }, + { + "epoch": 0.47623, + "grad_norm": 0.8072576542261874, + "learning_rate": 0.003, + "loss": 4.0291, + "step": 47623 + }, + { + "epoch": 0.47624, + "grad_norm": 0.8284693625342192, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 47624 + }, + { + "epoch": 0.47625, + "grad_norm": 0.9229152804011024, + "learning_rate": 0.003, + "loss": 4.0509, + "step": 47625 + }, + { + "epoch": 0.47626, + "grad_norm": 1.119335605680228, + "learning_rate": 0.003, + "loss": 4.0517, + "step": 47626 + }, + { + "epoch": 0.47627, + "grad_norm": 0.9782215283311749, + "learning_rate": 0.003, + "loss": 4.0416, + "step": 47627 + }, + { + "epoch": 0.47628, + "grad_norm": 1.0239369728946672, + "learning_rate": 0.003, + "loss": 4.0566, + "step": 47628 + }, + { + "epoch": 0.47629, + "grad_norm": 0.925244254650923, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 47629 + }, + { + "epoch": 0.4763, + "grad_norm": 0.8363091655810684, + "learning_rate": 0.003, + "loss": 4.048, + "step": 47630 + }, + { + "epoch": 0.47631, + "grad_norm": 0.6471518830048935, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 47631 + }, + { + "epoch": 0.47632, + "grad_norm": 0.6666217757930388, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 47632 + }, + { + "epoch": 0.47633, + "grad_norm": 0.7321395905449485, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 47633 + }, + { + "epoch": 0.47634, + "grad_norm": 0.7384665259231166, + "learning_rate": 0.003, + "loss": 4.0337, + "step": 47634 + }, + { + "epoch": 0.47635, + "grad_norm": 0.7219232426454965, + "learning_rate": 0.003, + "loss": 4.018, + "step": 47635 + }, + { + "epoch": 0.47636, + "grad_norm": 0.729622459387532, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 47636 + }, + { + "epoch": 0.47637, + "grad_norm": 0.780865471800314, + "learning_rate": 0.003, + "loss": 4.0368, + "step": 47637 + }, + { + "epoch": 0.47638, + "grad_norm": 0.8482064587090854, + "learning_rate": 0.003, + "loss": 4.0287, + "step": 47638 + }, + { + "epoch": 0.47639, + "grad_norm": 0.8615328556877496, + "learning_rate": 0.003, + "loss": 4.0551, + "step": 47639 + }, + { + "epoch": 0.4764, + "grad_norm": 0.8905876689796756, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 47640 + }, + { + "epoch": 0.47641, + "grad_norm": 0.9001717290100453, + "learning_rate": 0.003, + "loss": 4.0062, + "step": 47641 + }, + { + "epoch": 0.47642, + "grad_norm": 1.1060745277619672, + "learning_rate": 0.003, + "loss": 4.0417, + "step": 47642 + }, + { + "epoch": 0.47643, + "grad_norm": 0.896956540556394, + "learning_rate": 0.003, + "loss": 4.0247, + "step": 47643 + }, + { + "epoch": 0.47644, + "grad_norm": 0.8958003029741517, + "learning_rate": 0.003, + "loss": 4.0258, + "step": 47644 + }, + { + "epoch": 0.47645, + "grad_norm": 0.8565511741583958, + "learning_rate": 0.003, + "loss": 4.031, + "step": 47645 + }, + { + "epoch": 0.47646, + "grad_norm": 0.8155724504898623, + "learning_rate": 0.003, + "loss": 4.0105, + "step": 47646 + }, + { + "epoch": 0.47647, + "grad_norm": 0.7107314134889471, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 47647 + }, + { + "epoch": 0.47648, + "grad_norm": 0.5898792442005305, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 47648 + }, + { + "epoch": 0.47649, + "grad_norm": 0.5946131254597927, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 47649 + }, + { + "epoch": 0.4765, + "grad_norm": 0.5792072983838489, + "learning_rate": 0.003, + "loss": 4.016, + "step": 47650 + }, + { + "epoch": 0.47651, + "grad_norm": 0.564869910662415, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 47651 + }, + { + "epoch": 0.47652, + "grad_norm": 0.6174162500511973, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 47652 + }, + { + "epoch": 0.47653, + "grad_norm": 0.7363625985359497, + "learning_rate": 0.003, + "loss": 4.0418, + "step": 47653 + }, + { + "epoch": 0.47654, + "grad_norm": 0.8370572509406229, + "learning_rate": 0.003, + "loss": 4.043, + "step": 47654 + }, + { + "epoch": 0.47655, + "grad_norm": 1.0006831079011025, + "learning_rate": 0.003, + "loss": 3.9902, + "step": 47655 + }, + { + "epoch": 0.47656, + "grad_norm": 1.1301226084840001, + "learning_rate": 0.003, + "loss": 4.007, + "step": 47656 + }, + { + "epoch": 0.47657, + "grad_norm": 0.9726065196312808, + "learning_rate": 0.003, + "loss": 4.0491, + "step": 47657 + }, + { + "epoch": 0.47658, + "grad_norm": 0.8785861283551794, + "learning_rate": 0.003, + "loss": 4.046, + "step": 47658 + }, + { + "epoch": 0.47659, + "grad_norm": 0.775925386567446, + "learning_rate": 0.003, + "loss": 4.0215, + "step": 47659 + }, + { + "epoch": 0.4766, + "grad_norm": 0.8168984102578334, + "learning_rate": 0.003, + "loss": 4.0102, + "step": 47660 + }, + { + "epoch": 0.47661, + "grad_norm": 0.8930817848287571, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 47661 + }, + { + "epoch": 0.47662, + "grad_norm": 0.8208009032725302, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 47662 + }, + { + "epoch": 0.47663, + "grad_norm": 0.7771225228913843, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 47663 + }, + { + "epoch": 0.47664, + "grad_norm": 0.7227578354538274, + "learning_rate": 0.003, + "loss": 3.9865, + "step": 47664 + }, + { + "epoch": 0.47665, + "grad_norm": 0.7495691411294093, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 47665 + }, + { + "epoch": 0.47666, + "grad_norm": 0.7648394300138409, + "learning_rate": 0.003, + "loss": 4.0355, + "step": 47666 + }, + { + "epoch": 0.47667, + "grad_norm": 0.7953633190192027, + "learning_rate": 0.003, + "loss": 4.0596, + "step": 47667 + }, + { + "epoch": 0.47668, + "grad_norm": 0.8247840790414418, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 47668 + }, + { + "epoch": 0.47669, + "grad_norm": 0.8548389563642611, + "learning_rate": 0.003, + "loss": 4.0218, + "step": 47669 + }, + { + "epoch": 0.4767, + "grad_norm": 0.9281549309165589, + "learning_rate": 0.003, + "loss": 4.0379, + "step": 47670 + }, + { + "epoch": 0.47671, + "grad_norm": 1.0139038142879098, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 47671 + }, + { + "epoch": 0.47672, + "grad_norm": 1.0081825931521977, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 47672 + }, + { + "epoch": 0.47673, + "grad_norm": 1.2280956234369924, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 47673 + }, + { + "epoch": 0.47674, + "grad_norm": 0.9505972020373886, + "learning_rate": 0.003, + "loss": 4.0582, + "step": 47674 + }, + { + "epoch": 0.47675, + "grad_norm": 0.90099605498499, + "learning_rate": 0.003, + "loss": 4.0485, + "step": 47675 + }, + { + "epoch": 0.47676, + "grad_norm": 0.9674130364003583, + "learning_rate": 0.003, + "loss": 4.0532, + "step": 47676 + }, + { + "epoch": 0.47677, + "grad_norm": 0.9686014050289332, + "learning_rate": 0.003, + "loss": 4.0552, + "step": 47677 + }, + { + "epoch": 0.47678, + "grad_norm": 0.8548355288615523, + "learning_rate": 0.003, + "loss": 4.0536, + "step": 47678 + }, + { + "epoch": 0.47679, + "grad_norm": 0.7140776501819007, + "learning_rate": 0.003, + "loss": 4.0169, + "step": 47679 + }, + { + "epoch": 0.4768, + "grad_norm": 0.747736045413728, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 47680 + }, + { + "epoch": 0.47681, + "grad_norm": 0.816712468576141, + "learning_rate": 0.003, + "loss": 4.04, + "step": 47681 + }, + { + "epoch": 0.47682, + "grad_norm": 1.0170822872993321, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 47682 + }, + { + "epoch": 0.47683, + "grad_norm": 1.2239001575448005, + "learning_rate": 0.003, + "loss": 4.057, + "step": 47683 + }, + { + "epoch": 0.47684, + "grad_norm": 0.8874467772472558, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 47684 + }, + { + "epoch": 0.47685, + "grad_norm": 0.8171934057333661, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 47685 + }, + { + "epoch": 0.47686, + "grad_norm": 0.8698052352814176, + "learning_rate": 0.003, + "loss": 4.065, + "step": 47686 + }, + { + "epoch": 0.47687, + "grad_norm": 1.0273930492965366, + "learning_rate": 0.003, + "loss": 4.0482, + "step": 47687 + }, + { + "epoch": 0.47688, + "grad_norm": 1.0711679685627236, + "learning_rate": 0.003, + "loss": 4.0531, + "step": 47688 + }, + { + "epoch": 0.47689, + "grad_norm": 0.9030995709953549, + "learning_rate": 0.003, + "loss": 4.0082, + "step": 47689 + }, + { + "epoch": 0.4769, + "grad_norm": 0.9253361668705187, + "learning_rate": 0.003, + "loss": 4.0691, + "step": 47690 + }, + { + "epoch": 0.47691, + "grad_norm": 0.9453239330264422, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 47691 + }, + { + "epoch": 0.47692, + "grad_norm": 1.0202712763780646, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 47692 + }, + { + "epoch": 0.47693, + "grad_norm": 1.0342384664495703, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 47693 + }, + { + "epoch": 0.47694, + "grad_norm": 0.929288679714747, + "learning_rate": 0.003, + "loss": 4.0786, + "step": 47694 + }, + { + "epoch": 0.47695, + "grad_norm": 0.8738439719732429, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 47695 + }, + { + "epoch": 0.47696, + "grad_norm": 0.8209788056648498, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 47696 + }, + { + "epoch": 0.47697, + "grad_norm": 0.7942356844238143, + "learning_rate": 0.003, + "loss": 4.0595, + "step": 47697 + }, + { + "epoch": 0.47698, + "grad_norm": 0.7892564699200975, + "learning_rate": 0.003, + "loss": 4.0397, + "step": 47698 + }, + { + "epoch": 0.47699, + "grad_norm": 0.7683909441825465, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 47699 + }, + { + "epoch": 0.477, + "grad_norm": 0.8371908763344773, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 47700 + }, + { + "epoch": 0.47701, + "grad_norm": 0.8234003252968018, + "learning_rate": 0.003, + "loss": 4.0333, + "step": 47701 + }, + { + "epoch": 0.47702, + "grad_norm": 0.8192205908856834, + "learning_rate": 0.003, + "loss": 4.0209, + "step": 47702 + }, + { + "epoch": 0.47703, + "grad_norm": 0.8073399417159287, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 47703 + }, + { + "epoch": 0.47704, + "grad_norm": 0.8482736846775742, + "learning_rate": 0.003, + "loss": 4.0282, + "step": 47704 + }, + { + "epoch": 0.47705, + "grad_norm": 0.9787749527840875, + "learning_rate": 0.003, + "loss": 4.0327, + "step": 47705 + }, + { + "epoch": 0.47706, + "grad_norm": 1.0100431543668988, + "learning_rate": 0.003, + "loss": 4.021, + "step": 47706 + }, + { + "epoch": 0.47707, + "grad_norm": 0.892807317892028, + "learning_rate": 0.003, + "loss": 4.037, + "step": 47707 + }, + { + "epoch": 0.47708, + "grad_norm": 0.9071317003800945, + "learning_rate": 0.003, + "loss": 4.0372, + "step": 47708 + }, + { + "epoch": 0.47709, + "grad_norm": 0.8326986019693331, + "learning_rate": 0.003, + "loss": 4.0611, + "step": 47709 + }, + { + "epoch": 0.4771, + "grad_norm": 0.8189871179135116, + "learning_rate": 0.003, + "loss": 4.0321, + "step": 47710 + }, + { + "epoch": 0.47711, + "grad_norm": 0.8291075895193161, + "learning_rate": 0.003, + "loss": 4.0498, + "step": 47711 + }, + { + "epoch": 0.47712, + "grad_norm": 0.7755951680866521, + "learning_rate": 0.003, + "loss": 4.0151, + "step": 47712 + }, + { + "epoch": 0.47713, + "grad_norm": 0.6910585868562399, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 47713 + }, + { + "epoch": 0.47714, + "grad_norm": 0.6623590699944903, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 47714 + }, + { + "epoch": 0.47715, + "grad_norm": 0.6525160694907679, + "learning_rate": 0.003, + "loss": 4.0374, + "step": 47715 + }, + { + "epoch": 0.47716, + "grad_norm": 0.7580605990354823, + "learning_rate": 0.003, + "loss": 4.0271, + "step": 47716 + }, + { + "epoch": 0.47717, + "grad_norm": 0.7864488270813857, + "learning_rate": 0.003, + "loss": 3.9928, + "step": 47717 + }, + { + "epoch": 0.47718, + "grad_norm": 0.9203838133947485, + "learning_rate": 0.003, + "loss": 4.0012, + "step": 47718 + }, + { + "epoch": 0.47719, + "grad_norm": 1.1450380493880437, + "learning_rate": 0.003, + "loss": 4.0308, + "step": 47719 + }, + { + "epoch": 0.4772, + "grad_norm": 0.9131323771566408, + "learning_rate": 0.003, + "loss": 4.0142, + "step": 47720 + }, + { + "epoch": 0.47721, + "grad_norm": 0.7939779611141573, + "learning_rate": 0.003, + "loss": 4.0231, + "step": 47721 + }, + { + "epoch": 0.47722, + "grad_norm": 0.7270648513361444, + "learning_rate": 0.003, + "loss": 4.002, + "step": 47722 + }, + { + "epoch": 0.47723, + "grad_norm": 0.7771054224878176, + "learning_rate": 0.003, + "loss": 4.0246, + "step": 47723 + }, + { + "epoch": 0.47724, + "grad_norm": 0.823197489391335, + "learning_rate": 0.003, + "loss": 4.0446, + "step": 47724 + }, + { + "epoch": 0.47725, + "grad_norm": 1.0512484572743803, + "learning_rate": 0.003, + "loss": 4.019, + "step": 47725 + }, + { + "epoch": 0.47726, + "grad_norm": 0.864145134481049, + "learning_rate": 0.003, + "loss": 4.0519, + "step": 47726 + }, + { + "epoch": 0.47727, + "grad_norm": 0.6655514857410619, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 47727 + }, + { + "epoch": 0.47728, + "grad_norm": 0.7688684421968073, + "learning_rate": 0.003, + "loss": 4.0235, + "step": 47728 + }, + { + "epoch": 0.47729, + "grad_norm": 0.9057672794348468, + "learning_rate": 0.003, + "loss": 4.0281, + "step": 47729 + }, + { + "epoch": 0.4773, + "grad_norm": 0.8908314682720808, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 47730 + }, + { + "epoch": 0.47731, + "grad_norm": 0.8901963781796234, + "learning_rate": 0.003, + "loss": 4.0276, + "step": 47731 + }, + { + "epoch": 0.47732, + "grad_norm": 0.8831791403878996, + "learning_rate": 0.003, + "loss": 4.0288, + "step": 47732 + }, + { + "epoch": 0.47733, + "grad_norm": 0.8312824564350844, + "learning_rate": 0.003, + "loss": 4.035, + "step": 47733 + }, + { + "epoch": 0.47734, + "grad_norm": 0.7886462628884287, + "learning_rate": 0.003, + "loss": 4.0707, + "step": 47734 + }, + { + "epoch": 0.47735, + "grad_norm": 0.8443580513442028, + "learning_rate": 0.003, + "loss": 4.0339, + "step": 47735 + }, + { + "epoch": 0.47736, + "grad_norm": 0.9034144633432558, + "learning_rate": 0.003, + "loss": 4.0096, + "step": 47736 + }, + { + "epoch": 0.47737, + "grad_norm": 1.043749912033846, + "learning_rate": 0.003, + "loss": 4.0577, + "step": 47737 + }, + { + "epoch": 0.47738, + "grad_norm": 1.0382539630216903, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 47738 + }, + { + "epoch": 0.47739, + "grad_norm": 0.9516684918633396, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 47739 + }, + { + "epoch": 0.4774, + "grad_norm": 0.8490665802991326, + "learning_rate": 0.003, + "loss": 4.0544, + "step": 47740 + }, + { + "epoch": 0.47741, + "grad_norm": 0.8376481125876462, + "learning_rate": 0.003, + "loss": 4.049, + "step": 47741 + }, + { + "epoch": 0.47742, + "grad_norm": 0.8269808981762711, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 47742 + }, + { + "epoch": 0.47743, + "grad_norm": 0.93289219868791, + "learning_rate": 0.003, + "loss": 4.0347, + "step": 47743 + }, + { + "epoch": 0.47744, + "grad_norm": 1.022493133093228, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 47744 + }, + { + "epoch": 0.47745, + "grad_norm": 1.042020532555963, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 47745 + }, + { + "epoch": 0.47746, + "grad_norm": 0.7680720164085884, + "learning_rate": 0.003, + "loss": 4.0312, + "step": 47746 + }, + { + "epoch": 0.47747, + "grad_norm": 0.685565008871464, + "learning_rate": 0.003, + "loss": 4.0089, + "step": 47747 + }, + { + "epoch": 0.47748, + "grad_norm": 0.6154814944845184, + "learning_rate": 0.003, + "loss": 4.0016, + "step": 47748 + }, + { + "epoch": 0.47749, + "grad_norm": 0.5467835689969801, + "learning_rate": 0.003, + "loss": 4.033, + "step": 47749 + }, + { + "epoch": 0.4775, + "grad_norm": 0.5619861270340842, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 47750 + }, + { + "epoch": 0.47751, + "grad_norm": 0.553208761563468, + "learning_rate": 0.003, + "loss": 4.045, + "step": 47751 + }, + { + "epoch": 0.47752, + "grad_norm": 0.5828306963798403, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 47752 + }, + { + "epoch": 0.47753, + "grad_norm": 0.6210393341801229, + "learning_rate": 0.003, + "loss": 4.005, + "step": 47753 + }, + { + "epoch": 0.47754, + "grad_norm": 0.7795009041780427, + "learning_rate": 0.003, + "loss": 4.0353, + "step": 47754 + }, + { + "epoch": 0.47755, + "grad_norm": 0.9477561167021658, + "learning_rate": 0.003, + "loss": 4.0192, + "step": 47755 + }, + { + "epoch": 0.47756, + "grad_norm": 1.1487235059546375, + "learning_rate": 0.003, + "loss": 4.0404, + "step": 47756 + }, + { + "epoch": 0.47757, + "grad_norm": 0.8925314862972017, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 47757 + }, + { + "epoch": 0.47758, + "grad_norm": 0.8655956870595615, + "learning_rate": 0.003, + "loss": 4.0201, + "step": 47758 + }, + { + "epoch": 0.47759, + "grad_norm": 0.8528438986310456, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 47759 + }, + { + "epoch": 0.4776, + "grad_norm": 0.9264616813786447, + "learning_rate": 0.003, + "loss": 4.0265, + "step": 47760 + }, + { + "epoch": 0.47761, + "grad_norm": 0.930630914935072, + "learning_rate": 0.003, + "loss": 4.0673, + "step": 47761 + }, + { + "epoch": 0.47762, + "grad_norm": 0.881423474807415, + "learning_rate": 0.003, + "loss": 4.0212, + "step": 47762 + }, + { + "epoch": 0.47763, + "grad_norm": 0.861563154962649, + "learning_rate": 0.003, + "loss": 4.0365, + "step": 47763 + }, + { + "epoch": 0.47764, + "grad_norm": 0.8863648227382548, + "learning_rate": 0.003, + "loss": 4.0055, + "step": 47764 + }, + { + "epoch": 0.47765, + "grad_norm": 1.0386081751705556, + "learning_rate": 0.003, + "loss": 4.072, + "step": 47765 + }, + { + "epoch": 0.47766, + "grad_norm": 1.1722138332735235, + "learning_rate": 0.003, + "loss": 4.0237, + "step": 47766 + }, + { + "epoch": 0.47767, + "grad_norm": 0.7727620103256565, + "learning_rate": 0.003, + "loss": 4.024, + "step": 47767 + }, + { + "epoch": 0.47768, + "grad_norm": 0.7944151689631941, + "learning_rate": 0.003, + "loss": 4.0293, + "step": 47768 + }, + { + "epoch": 0.47769, + "grad_norm": 0.9336916541712109, + "learning_rate": 0.003, + "loss": 4.0224, + "step": 47769 + }, + { + "epoch": 0.4777, + "grad_norm": 1.0892623574515115, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 47770 + }, + { + "epoch": 0.47771, + "grad_norm": 0.9773545801734361, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 47771 + }, + { + "epoch": 0.47772, + "grad_norm": 0.9283269464885936, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 47772 + }, + { + "epoch": 0.47773, + "grad_norm": 0.9466791398220683, + "learning_rate": 0.003, + "loss": 4.0465, + "step": 47773 + }, + { + "epoch": 0.47774, + "grad_norm": 0.8717350915369964, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 47774 + }, + { + "epoch": 0.47775, + "grad_norm": 0.929729910615477, + "learning_rate": 0.003, + "loss": 4.0377, + "step": 47775 + }, + { + "epoch": 0.47776, + "grad_norm": 0.9664535970385606, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 47776 + }, + { + "epoch": 0.47777, + "grad_norm": 0.8688005914292652, + "learning_rate": 0.003, + "loss": 4.0191, + "step": 47777 + }, + { + "epoch": 0.47778, + "grad_norm": 0.6530557707725342, + "learning_rate": 0.003, + "loss": 4.0181, + "step": 47778 + }, + { + "epoch": 0.47779, + "grad_norm": 0.7101209387541696, + "learning_rate": 0.003, + "loss": 4.0346, + "step": 47779 + }, + { + "epoch": 0.4778, + "grad_norm": 0.8400852438440579, + "learning_rate": 0.003, + "loss": 4.0298, + "step": 47780 + }, + { + "epoch": 0.47781, + "grad_norm": 1.036455766218724, + "learning_rate": 0.003, + "loss": 4.0524, + "step": 47781 + }, + { + "epoch": 0.47782, + "grad_norm": 0.9689457970877015, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 47782 + }, + { + "epoch": 0.47783, + "grad_norm": 0.7943556316266203, + "learning_rate": 0.003, + "loss": 4.0427, + "step": 47783 + }, + { + "epoch": 0.47784, + "grad_norm": 0.6874523848365067, + "learning_rate": 0.003, + "loss": 4.0304, + "step": 47784 + }, + { + "epoch": 0.47785, + "grad_norm": 0.652841473511447, + "learning_rate": 0.003, + "loss": 4.0315, + "step": 47785 + }, + { + "epoch": 0.47786, + "grad_norm": 0.6932901110919263, + "learning_rate": 0.003, + "loss": 3.9978, + "step": 47786 + }, + { + "epoch": 0.47787, + "grad_norm": 0.7190075643471489, + "learning_rate": 0.003, + "loss": 4.047, + "step": 47787 + }, + { + "epoch": 0.47788, + "grad_norm": 0.6391292107957316, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 47788 + }, + { + "epoch": 0.47789, + "grad_norm": 0.6241930783939177, + "learning_rate": 0.003, + "loss": 4.0101, + "step": 47789 + }, + { + "epoch": 0.4779, + "grad_norm": 0.5514484103733314, + "learning_rate": 0.003, + "loss": 4.0494, + "step": 47790 + }, + { + "epoch": 0.47791, + "grad_norm": 0.5309092325077018, + "learning_rate": 0.003, + "loss": 4.0199, + "step": 47791 + }, + { + "epoch": 0.47792, + "grad_norm": 0.5595655222447189, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 47792 + }, + { + "epoch": 0.47793, + "grad_norm": 0.5398588023924418, + "learning_rate": 0.003, + "loss": 4.0063, + "step": 47793 + }, + { + "epoch": 0.47794, + "grad_norm": 0.5668337519620845, + "learning_rate": 0.003, + "loss": 4.029, + "step": 47794 + }, + { + "epoch": 0.47795, + "grad_norm": 0.6516799231335698, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 47795 + }, + { + "epoch": 0.47796, + "grad_norm": 0.7555420011942031, + "learning_rate": 0.003, + "loss": 3.9976, + "step": 47796 + }, + { + "epoch": 0.47797, + "grad_norm": 0.893365328652948, + "learning_rate": 0.003, + "loss": 4.0248, + "step": 47797 + }, + { + "epoch": 0.47798, + "grad_norm": 1.1991450777224657, + "learning_rate": 0.003, + "loss": 4.038, + "step": 47798 + }, + { + "epoch": 0.47799, + "grad_norm": 0.9497108111800409, + "learning_rate": 0.003, + "loss": 4.0462, + "step": 47799 + }, + { + "epoch": 0.478, + "grad_norm": 0.9908101854385032, + "learning_rate": 0.003, + "loss": 4.0277, + "step": 47800 + }, + { + "epoch": 0.47801, + "grad_norm": 1.2548053258830503, + "learning_rate": 0.003, + "loss": 4.0168, + "step": 47801 + }, + { + "epoch": 0.47802, + "grad_norm": 0.8024089720060079, + "learning_rate": 0.003, + "loss": 4.0481, + "step": 47802 + }, + { + "epoch": 0.47803, + "grad_norm": 0.7272564879969802, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 47803 + }, + { + "epoch": 0.47804, + "grad_norm": 0.7064458745198293, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 47804 + }, + { + "epoch": 0.47805, + "grad_norm": 0.779077052986723, + "learning_rate": 0.003, + "loss": 4.0234, + "step": 47805 + }, + { + "epoch": 0.47806, + "grad_norm": 0.8399846489795875, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 47806 + }, + { + "epoch": 0.47807, + "grad_norm": 1.1026476758093564, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 47807 + }, + { + "epoch": 0.47808, + "grad_norm": 0.8872417966101015, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 47808 + }, + { + "epoch": 0.47809, + "grad_norm": 0.8776965758112133, + "learning_rate": 0.003, + "loss": 4.0306, + "step": 47809 + }, + { + "epoch": 0.4781, + "grad_norm": 1.10774482807065, + "learning_rate": 0.003, + "loss": 4.062, + "step": 47810 + }, + { + "epoch": 0.47811, + "grad_norm": 1.0072713739893888, + "learning_rate": 0.003, + "loss": 4.0386, + "step": 47811 + }, + { + "epoch": 0.47812, + "grad_norm": 1.0525293428167717, + "learning_rate": 0.003, + "loss": 4.0273, + "step": 47812 + }, + { + "epoch": 0.47813, + "grad_norm": 1.001106274088428, + "learning_rate": 0.003, + "loss": 4.0415, + "step": 47813 + }, + { + "epoch": 0.47814, + "grad_norm": 0.9437335981598103, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 47814 + }, + { + "epoch": 0.47815, + "grad_norm": 0.8744782941836338, + "learning_rate": 0.003, + "loss": 4.0305, + "step": 47815 + }, + { + "epoch": 0.47816, + "grad_norm": 0.942633256186016, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 47816 + }, + { + "epoch": 0.47817, + "grad_norm": 1.061849494345333, + "learning_rate": 0.003, + "loss": 4.0486, + "step": 47817 + }, + { + "epoch": 0.47818, + "grad_norm": 0.9865168375282416, + "learning_rate": 0.003, + "loss": 4.0716, + "step": 47818 + }, + { + "epoch": 0.47819, + "grad_norm": 0.9942034590499694, + "learning_rate": 0.003, + "loss": 4.075, + "step": 47819 + }, + { + "epoch": 0.4782, + "grad_norm": 1.060886233083507, + "learning_rate": 0.003, + "loss": 4.0388, + "step": 47820 + }, + { + "epoch": 0.47821, + "grad_norm": 0.9825780511624795, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 47821 + }, + { + "epoch": 0.47822, + "grad_norm": 0.8835845011356633, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 47822 + }, + { + "epoch": 0.47823, + "grad_norm": 0.8847503566213869, + "learning_rate": 0.003, + "loss": 4.0792, + "step": 47823 + }, + { + "epoch": 0.47824, + "grad_norm": 0.9401249149937013, + "learning_rate": 0.003, + "loss": 4.0467, + "step": 47824 + }, + { + "epoch": 0.47825, + "grad_norm": 0.9438390074694927, + "learning_rate": 0.003, + "loss": 4.0387, + "step": 47825 + }, + { + "epoch": 0.47826, + "grad_norm": 1.0128677680578244, + "learning_rate": 0.003, + "loss": 4.0619, + "step": 47826 + }, + { + "epoch": 0.47827, + "grad_norm": 0.9621410418611928, + "learning_rate": 0.003, + "loss": 4.0657, + "step": 47827 + }, + { + "epoch": 0.47828, + "grad_norm": 0.9745467585635258, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 47828 + }, + { + "epoch": 0.47829, + "grad_norm": 0.9062873131388343, + "learning_rate": 0.003, + "loss": 4.0413, + "step": 47829 + }, + { + "epoch": 0.4783, + "grad_norm": 1.0224148388255747, + "learning_rate": 0.003, + "loss": 4.0376, + "step": 47830 + }, + { + "epoch": 0.47831, + "grad_norm": 1.1238505287273532, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 47831 + }, + { + "epoch": 0.47832, + "grad_norm": 0.7926605382192834, + "learning_rate": 0.003, + "loss": 3.9952, + "step": 47832 + }, + { + "epoch": 0.47833, + "grad_norm": 0.6658719899517878, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 47833 + }, + { + "epoch": 0.47834, + "grad_norm": 0.6149128034313331, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 47834 + }, + { + "epoch": 0.47835, + "grad_norm": 0.5641642861330569, + "learning_rate": 0.003, + "loss": 4.0323, + "step": 47835 + }, + { + "epoch": 0.47836, + "grad_norm": 0.6090625056760954, + "learning_rate": 0.003, + "loss": 4.0329, + "step": 47836 + }, + { + "epoch": 0.47837, + "grad_norm": 0.6636777900931233, + "learning_rate": 0.003, + "loss": 4.0133, + "step": 47837 + }, + { + "epoch": 0.47838, + "grad_norm": 0.689828360418003, + "learning_rate": 0.003, + "loss": 4.0033, + "step": 47838 + }, + { + "epoch": 0.47839, + "grad_norm": 0.6458464617835158, + "learning_rate": 0.003, + "loss": 4.0545, + "step": 47839 + }, + { + "epoch": 0.4784, + "grad_norm": 0.6235535337025512, + "learning_rate": 0.003, + "loss": 4.0457, + "step": 47840 + }, + { + "epoch": 0.47841, + "grad_norm": 0.5638113943619326, + "learning_rate": 0.003, + "loss": 3.9987, + "step": 47841 + }, + { + "epoch": 0.47842, + "grad_norm": 0.5905803535039974, + "learning_rate": 0.003, + "loss": 4.0069, + "step": 47842 + }, + { + "epoch": 0.47843, + "grad_norm": 0.48459250589644515, + "learning_rate": 0.003, + "loss": 4.0542, + "step": 47843 + }, + { + "epoch": 0.47844, + "grad_norm": 0.39902688262841923, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 47844 + }, + { + "epoch": 0.47845, + "grad_norm": 0.39492454050290293, + "learning_rate": 0.003, + "loss": 4.0331, + "step": 47845 + }, + { + "epoch": 0.47846, + "grad_norm": 0.3906127686473391, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 47846 + }, + { + "epoch": 0.47847, + "grad_norm": 0.33490388196415716, + "learning_rate": 0.003, + "loss": 4.0229, + "step": 47847 + }, + { + "epoch": 0.47848, + "grad_norm": 0.39084376998858594, + "learning_rate": 0.003, + "loss": 3.9826, + "step": 47848 + }, + { + "epoch": 0.47849, + "grad_norm": 0.4904376595279477, + "learning_rate": 0.003, + "loss": 4.0097, + "step": 47849 + }, + { + "epoch": 0.4785, + "grad_norm": 0.7286450561072402, + "learning_rate": 0.003, + "loss": 4.0037, + "step": 47850 + }, + { + "epoch": 0.47851, + "grad_norm": 1.0971372268631676, + "learning_rate": 0.003, + "loss": 4.0139, + "step": 47851 + }, + { + "epoch": 0.47852, + "grad_norm": 1.3242836648492595, + "learning_rate": 0.003, + "loss": 4.0103, + "step": 47852 + }, + { + "epoch": 0.47853, + "grad_norm": 0.6593930228621407, + "learning_rate": 0.003, + "loss": 4.0129, + "step": 47853 + }, + { + "epoch": 0.47854, + "grad_norm": 0.9759976137189783, + "learning_rate": 0.003, + "loss": 4.0391, + "step": 47854 + }, + { + "epoch": 0.47855, + "grad_norm": 1.2279102483397648, + "learning_rate": 0.003, + "loss": 4.0092, + "step": 47855 + }, + { + "epoch": 0.47856, + "grad_norm": 0.6513094284190404, + "learning_rate": 0.003, + "loss": 3.9949, + "step": 47856 + }, + { + "epoch": 0.47857, + "grad_norm": 0.783862325471082, + "learning_rate": 0.003, + "loss": 4.023, + "step": 47857 + }, + { + "epoch": 0.47858, + "grad_norm": 0.8579820479280847, + "learning_rate": 0.003, + "loss": 4.0301, + "step": 47858 + }, + { + "epoch": 0.47859, + "grad_norm": 0.8552706035956072, + "learning_rate": 0.003, + "loss": 3.9724, + "step": 47859 + }, + { + "epoch": 0.4786, + "grad_norm": 0.9131605105807742, + "learning_rate": 0.003, + "loss": 4.0428, + "step": 47860 + }, + { + "epoch": 0.47861, + "grad_norm": 0.844678374823979, + "learning_rate": 0.003, + "loss": 4.0421, + "step": 47861 + }, + { + "epoch": 0.47862, + "grad_norm": 0.8617960844894962, + "learning_rate": 0.003, + "loss": 4.0445, + "step": 47862 + }, + { + "epoch": 0.47863, + "grad_norm": 0.941612505383843, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 47863 + }, + { + "epoch": 0.47864, + "grad_norm": 0.9639559064754847, + "learning_rate": 0.003, + "loss": 4.0128, + "step": 47864 + }, + { + "epoch": 0.47865, + "grad_norm": 0.9699631614532889, + "learning_rate": 0.003, + "loss": 4.0466, + "step": 47865 + }, + { + "epoch": 0.47866, + "grad_norm": 0.9743872005962144, + "learning_rate": 0.003, + "loss": 4.0322, + "step": 47866 + }, + { + "epoch": 0.47867, + "grad_norm": 0.926444779028823, + "learning_rate": 0.003, + "loss": 4.0357, + "step": 47867 + }, + { + "epoch": 0.47868, + "grad_norm": 0.8478865513453202, + "learning_rate": 0.003, + "loss": 4.0147, + "step": 47868 + }, + { + "epoch": 0.47869, + "grad_norm": 0.7649179539922206, + "learning_rate": 0.003, + "loss": 4.0292, + "step": 47869 + }, + { + "epoch": 0.4787, + "grad_norm": 0.7665408651772448, + "learning_rate": 0.003, + "loss": 4.0495, + "step": 47870 + }, + { + "epoch": 0.47871, + "grad_norm": 1.070642152029606, + "learning_rate": 0.003, + "loss": 4.0184, + "step": 47871 + }, + { + "epoch": 0.47872, + "grad_norm": 1.4198893226326375, + "learning_rate": 0.003, + "loss": 4.0441, + "step": 47872 + }, + { + "epoch": 0.47873, + "grad_norm": 0.6092417590665852, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 47873 + }, + { + "epoch": 0.47874, + "grad_norm": 0.7249939135777458, + "learning_rate": 0.003, + "loss": 4.0356, + "step": 47874 + }, + { + "epoch": 0.47875, + "grad_norm": 0.8627184658559173, + "learning_rate": 0.003, + "loss": 4.0255, + "step": 47875 + }, + { + "epoch": 0.47876, + "grad_norm": 1.0805429548443115, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 47876 + }, + { + "epoch": 0.47877, + "grad_norm": 1.0845615566943803, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 47877 + }, + { + "epoch": 0.47878, + "grad_norm": 0.6308798990645162, + "learning_rate": 0.003, + "loss": 4.0354, + "step": 47878 + }, + { + "epoch": 0.47879, + "grad_norm": 0.5822841133468076, + "learning_rate": 0.003, + "loss": 3.9954, + "step": 47879 + }, + { + "epoch": 0.4788, + "grad_norm": 0.59053316385297, + "learning_rate": 0.003, + "loss": 4.0295, + "step": 47880 + }, + { + "epoch": 0.47881, + "grad_norm": 0.561023790600162, + "learning_rate": 0.003, + "loss": 4.0385, + "step": 47881 + }, + { + "epoch": 0.47882, + "grad_norm": 0.5837138033259733, + "learning_rate": 0.003, + "loss": 4.0392, + "step": 47882 + }, + { + "epoch": 0.47883, + "grad_norm": 0.6897691723371869, + "learning_rate": 0.003, + "loss": 3.9843, + "step": 47883 + }, + { + "epoch": 0.47884, + "grad_norm": 0.7515512963075885, + "learning_rate": 0.003, + "loss": 4.0158, + "step": 47884 + }, + { + "epoch": 0.47885, + "grad_norm": 0.7386453469435191, + "learning_rate": 0.003, + "loss": 3.9954, + "step": 47885 + }, + { + "epoch": 0.47886, + "grad_norm": 0.7772739513544076, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 47886 + }, + { + "epoch": 0.47887, + "grad_norm": 0.8312288592321461, + "learning_rate": 0.003, + "loss": 4.0167, + "step": 47887 + }, + { + "epoch": 0.47888, + "grad_norm": 0.8941200958110377, + "learning_rate": 0.003, + "loss": 4.0263, + "step": 47888 + }, + { + "epoch": 0.47889, + "grad_norm": 1.1577327026755109, + "learning_rate": 0.003, + "loss": 4.0468, + "step": 47889 + }, + { + "epoch": 0.4789, + "grad_norm": 0.8828034189504068, + "learning_rate": 0.003, + "loss": 4.0174, + "step": 47890 + }, + { + "epoch": 0.47891, + "grad_norm": 0.7145913965932318, + "learning_rate": 0.003, + "loss": 3.9986, + "step": 47891 + }, + { + "epoch": 0.47892, + "grad_norm": 0.7409155464127424, + "learning_rate": 0.003, + "loss": 4.0195, + "step": 47892 + }, + { + "epoch": 0.47893, + "grad_norm": 0.8767241991567041, + "learning_rate": 0.003, + "loss": 4.0537, + "step": 47893 + }, + { + "epoch": 0.47894, + "grad_norm": 0.9574070968474538, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 47894 + }, + { + "epoch": 0.47895, + "grad_norm": 0.870366604313218, + "learning_rate": 0.003, + "loss": 4.0294, + "step": 47895 + }, + { + "epoch": 0.47896, + "grad_norm": 0.8463674318446441, + "learning_rate": 0.003, + "loss": 4.03, + "step": 47896 + }, + { + "epoch": 0.47897, + "grad_norm": 0.8870511427955626, + "learning_rate": 0.003, + "loss": 4.0177, + "step": 47897 + }, + { + "epoch": 0.47898, + "grad_norm": 0.9304569511932266, + "learning_rate": 0.003, + "loss": 4.0166, + "step": 47898 + }, + { + "epoch": 0.47899, + "grad_norm": 0.8474452093975874, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 47899 + }, + { + "epoch": 0.479, + "grad_norm": 0.8615029407484182, + "learning_rate": 0.003, + "loss": 4.0302, + "step": 47900 + }, + { + "epoch": 0.47901, + "grad_norm": 0.9357137665469563, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 47901 + }, + { + "epoch": 0.47902, + "grad_norm": 0.9527120898924382, + "learning_rate": 0.003, + "loss": 4.014, + "step": 47902 + }, + { + "epoch": 0.47903, + "grad_norm": 1.0332806507520222, + "learning_rate": 0.003, + "loss": 4.0641, + "step": 47903 + }, + { + "epoch": 0.47904, + "grad_norm": 1.0678501394118303, + "learning_rate": 0.003, + "loss": 4.0366, + "step": 47904 + }, + { + "epoch": 0.47905, + "grad_norm": 0.8928226483319223, + "learning_rate": 0.003, + "loss": 4.019, + "step": 47905 + }, + { + "epoch": 0.47906, + "grad_norm": 0.7135862360397894, + "learning_rate": 0.003, + "loss": 4.0328, + "step": 47906 + }, + { + "epoch": 0.47907, + "grad_norm": 0.7638446951549809, + "learning_rate": 0.003, + "loss": 4.024, + "step": 47907 + }, + { + "epoch": 0.47908, + "grad_norm": 0.8023134016904745, + "learning_rate": 0.003, + "loss": 4.0573, + "step": 47908 + }, + { + "epoch": 0.47909, + "grad_norm": 0.8690055416202024, + "learning_rate": 0.003, + "loss": 4.0431, + "step": 47909 + }, + { + "epoch": 0.4791, + "grad_norm": 0.8886741187263025, + "learning_rate": 0.003, + "loss": 4.0109, + "step": 47910 + }, + { + "epoch": 0.47911, + "grad_norm": 0.9659270669218242, + "learning_rate": 0.003, + "loss": 4.0098, + "step": 47911 + }, + { + "epoch": 0.47912, + "grad_norm": 1.138203974616603, + "learning_rate": 0.003, + "loss": 4.005, + "step": 47912 + }, + { + "epoch": 0.47913, + "grad_norm": 0.9756956260873573, + "learning_rate": 0.003, + "loss": 4.044, + "step": 47913 + }, + { + "epoch": 0.47914, + "grad_norm": 0.9700672498726479, + "learning_rate": 0.003, + "loss": 4.0226, + "step": 47914 + }, + { + "epoch": 0.47915, + "grad_norm": 0.9719881734382427, + "learning_rate": 0.003, + "loss": 4.0443, + "step": 47915 + }, + { + "epoch": 0.47916, + "grad_norm": 0.9290290024359084, + "learning_rate": 0.003, + "loss": 4.0369, + "step": 47916 + }, + { + "epoch": 0.47917, + "grad_norm": 0.8199027166294517, + "learning_rate": 0.003, + "loss": 4.0398, + "step": 47917 + }, + { + "epoch": 0.47918, + "grad_norm": 0.8058259334386425, + "learning_rate": 0.003, + "loss": 4.0645, + "step": 47918 + }, + { + "epoch": 0.47919, + "grad_norm": 0.9455776624474582, + "learning_rate": 0.003, + "loss": 3.9942, + "step": 47919 + }, + { + "epoch": 0.4792, + "grad_norm": 1.062662973664479, + "learning_rate": 0.003, + "loss": 4.028, + "step": 47920 + }, + { + "epoch": 0.47921, + "grad_norm": 1.0932190061006697, + "learning_rate": 0.003, + "loss": 4.0296, + "step": 47921 + }, + { + "epoch": 0.47922, + "grad_norm": 1.0180891451396474, + "learning_rate": 0.003, + "loss": 4.0504, + "step": 47922 + }, + { + "epoch": 0.47923, + "grad_norm": 0.9102300651844245, + "learning_rate": 0.003, + "loss": 4.049, + "step": 47923 + }, + { + "epoch": 0.47924, + "grad_norm": 0.7798868640190149, + "learning_rate": 0.003, + "loss": 4.0202, + "step": 47924 + }, + { + "epoch": 0.47925, + "grad_norm": 0.8466924179385419, + "learning_rate": 0.003, + "loss": 4.041, + "step": 47925 + }, + { + "epoch": 0.47926, + "grad_norm": 0.8014349154299129, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 47926 + }, + { + "epoch": 0.47927, + "grad_norm": 0.949508838202136, + "learning_rate": 0.003, + "loss": 4.0476, + "step": 47927 + }, + { + "epoch": 0.47928, + "grad_norm": 1.026119619708393, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 47928 + }, + { + "epoch": 0.47929, + "grad_norm": 0.9399780628719582, + "learning_rate": 0.003, + "loss": 4.0236, + "step": 47929 + }, + { + "epoch": 0.4793, + "grad_norm": 0.7383820672512792, + "learning_rate": 0.003, + "loss": 4.0393, + "step": 47930 + }, + { + "epoch": 0.47931, + "grad_norm": 0.5378121553169838, + "learning_rate": 0.003, + "loss": 4.0162, + "step": 47931 + }, + { + "epoch": 0.47932, + "grad_norm": 0.5948465609777394, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 47932 + }, + { + "epoch": 0.47933, + "grad_norm": 0.6965968019610396, + "learning_rate": 0.003, + "loss": 4.0047, + "step": 47933 + }, + { + "epoch": 0.47934, + "grad_norm": 0.7160454631181921, + "learning_rate": 0.003, + "loss": 4.0284, + "step": 47934 + }, + { + "epoch": 0.47935, + "grad_norm": 0.8176109478588213, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 47935 + }, + { + "epoch": 0.47936, + "grad_norm": 0.8815275553861506, + "learning_rate": 0.003, + "loss": 4.0176, + "step": 47936 + }, + { + "epoch": 0.47937, + "grad_norm": 0.8320095952916652, + "learning_rate": 0.003, + "loss": 4.0217, + "step": 47937 + }, + { + "epoch": 0.47938, + "grad_norm": 0.7153336598077438, + "learning_rate": 0.003, + "loss": 4.0424, + "step": 47938 + }, + { + "epoch": 0.47939, + "grad_norm": 0.7378349465731824, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 47939 + }, + { + "epoch": 0.4794, + "grad_norm": 0.7416355520789409, + "learning_rate": 0.003, + "loss": 4.0274, + "step": 47940 + }, + { + "epoch": 0.47941, + "grad_norm": 0.795913700335058, + "learning_rate": 0.003, + "loss": 4.0332, + "step": 47941 + }, + { + "epoch": 0.47942, + "grad_norm": 0.7723566301083334, + "learning_rate": 0.003, + "loss": 4.0115, + "step": 47942 + }, + { + "epoch": 0.47943, + "grad_norm": 0.7722539965078348, + "learning_rate": 0.003, + "loss": 4.0586, + "step": 47943 + }, + { + "epoch": 0.47944, + "grad_norm": 0.8030738270597371, + "learning_rate": 0.003, + "loss": 4.0116, + "step": 47944 + }, + { + "epoch": 0.47945, + "grad_norm": 0.8511293834831359, + "learning_rate": 0.003, + "loss": 4.0338, + "step": 47945 + }, + { + "epoch": 0.47946, + "grad_norm": 0.8496389375596758, + "learning_rate": 0.003, + "loss": 4.0335, + "step": 47946 + }, + { + "epoch": 0.47947, + "grad_norm": 0.836659135465997, + "learning_rate": 0.003, + "loss": 4.0159, + "step": 47947 + }, + { + "epoch": 0.47948, + "grad_norm": 0.7964684793598288, + "learning_rate": 0.003, + "loss": 4.0309, + "step": 47948 + }, + { + "epoch": 0.47949, + "grad_norm": 0.895764798680028, + "learning_rate": 0.003, + "loss": 4.0185, + "step": 47949 + }, + { + "epoch": 0.4795, + "grad_norm": 1.0492153201793404, + "learning_rate": 0.003, + "loss": 4.0471, + "step": 47950 + }, + { + "epoch": 0.47951, + "grad_norm": 0.9747037362599673, + "learning_rate": 0.003, + "loss": 4.0472, + "step": 47951 + }, + { + "epoch": 0.47952, + "grad_norm": 0.9361370115797258, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 47952 + }, + { + "epoch": 0.47953, + "grad_norm": 0.8969212247487213, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 47953 + }, + { + "epoch": 0.47954, + "grad_norm": 0.7117295874414217, + "learning_rate": 0.003, + "loss": 4.0661, + "step": 47954 + }, + { + "epoch": 0.47955, + "grad_norm": 0.6475492413663173, + "learning_rate": 0.003, + "loss": 4.0307, + "step": 47955 + }, + { + "epoch": 0.47956, + "grad_norm": 0.5966444169931194, + "learning_rate": 0.003, + "loss": 4.0318, + "step": 47956 + }, + { + "epoch": 0.47957, + "grad_norm": 0.6668519204247151, + "learning_rate": 0.003, + "loss": 4.0249, + "step": 47957 + }, + { + "epoch": 0.47958, + "grad_norm": 0.8337306665500819, + "learning_rate": 0.003, + "loss": 4.0279, + "step": 47958 + }, + { + "epoch": 0.47959, + "grad_norm": 0.9111835290167892, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 47959 + }, + { + "epoch": 0.4796, + "grad_norm": 0.7918698861142475, + "learning_rate": 0.003, + "loss": 4.0, + "step": 47960 + }, + { + "epoch": 0.47961, + "grad_norm": 0.7232800008053349, + "learning_rate": 0.003, + "loss": 4.0193, + "step": 47961 + }, + { + "epoch": 0.47962, + "grad_norm": 0.672032395605447, + "learning_rate": 0.003, + "loss": 3.9934, + "step": 47962 + }, + { + "epoch": 0.47963, + "grad_norm": 0.6397930069633762, + "learning_rate": 0.003, + "loss": 4.0449, + "step": 47963 + }, + { + "epoch": 0.47964, + "grad_norm": 0.7471330480506219, + "learning_rate": 0.003, + "loss": 4.0228, + "step": 47964 + }, + { + "epoch": 0.47965, + "grad_norm": 0.8637926352820127, + "learning_rate": 0.003, + "loss": 4.0203, + "step": 47965 + }, + { + "epoch": 0.47966, + "grad_norm": 1.0402868712928788, + "learning_rate": 0.003, + "loss": 4.0435, + "step": 47966 + }, + { + "epoch": 0.47967, + "grad_norm": 1.075981346999981, + "learning_rate": 0.003, + "loss": 3.9875, + "step": 47967 + }, + { + "epoch": 0.47968, + "grad_norm": 0.9452094046504274, + "learning_rate": 0.003, + "loss": 4.0175, + "step": 47968 + }, + { + "epoch": 0.47969, + "grad_norm": 0.8147295464017508, + "learning_rate": 0.003, + "loss": 4.031, + "step": 47969 + }, + { + "epoch": 0.4797, + "grad_norm": 0.7432122634104672, + "learning_rate": 0.003, + "loss": 4.044, + "step": 47970 + }, + { + "epoch": 0.47971, + "grad_norm": 0.6981492135220484, + "learning_rate": 0.003, + "loss": 4.0172, + "step": 47971 + }, + { + "epoch": 0.47972, + "grad_norm": 0.6709347809627773, + "learning_rate": 0.003, + "loss": 4.034, + "step": 47972 + }, + { + "epoch": 0.47973, + "grad_norm": 0.6397270224996527, + "learning_rate": 0.003, + "loss": 3.9889, + "step": 47973 + }, + { + "epoch": 0.47974, + "grad_norm": 0.710731502082902, + "learning_rate": 0.003, + "loss": 4.0458, + "step": 47974 + }, + { + "epoch": 0.47975, + "grad_norm": 0.6383865367484592, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 47975 + }, + { + "epoch": 0.47976, + "grad_norm": 0.6566706019241856, + "learning_rate": 0.003, + "loss": 4.0518, + "step": 47976 + }, + { + "epoch": 0.47977, + "grad_norm": 0.7468186962192893, + "learning_rate": 0.003, + "loss": 4.0351, + "step": 47977 + }, + { + "epoch": 0.47978, + "grad_norm": 0.9008677145907894, + "learning_rate": 0.003, + "loss": 4.0084, + "step": 47978 + }, + { + "epoch": 0.47979, + "grad_norm": 0.8445503132922096, + "learning_rate": 0.003, + "loss": 4.0325, + "step": 47979 + }, + { + "epoch": 0.4798, + "grad_norm": 0.8728134200724392, + "learning_rate": 0.003, + "loss": 4.0266, + "step": 47980 + }, + { + "epoch": 0.47981, + "grad_norm": 0.8803952461051788, + "learning_rate": 0.003, + "loss": 4.03, + "step": 47981 + }, + { + "epoch": 0.47982, + "grad_norm": 1.1631605357464612, + "learning_rate": 0.003, + "loss": 4.0244, + "step": 47982 + }, + { + "epoch": 0.47983, + "grad_norm": 0.9838149406560917, + "learning_rate": 0.003, + "loss": 4.0402, + "step": 47983 + }, + { + "epoch": 0.47984, + "grad_norm": 1.1168271891502715, + "learning_rate": 0.003, + "loss": 4.0311, + "step": 47984 + }, + { + "epoch": 0.47985, + "grad_norm": 0.8939016807743798, + "learning_rate": 0.003, + "loss": 4.0574, + "step": 47985 + }, + { + "epoch": 0.47986, + "grad_norm": 0.8541405984135797, + "learning_rate": 0.003, + "loss": 4.0204, + "step": 47986 + }, + { + "epoch": 0.47987, + "grad_norm": 0.8181077741958114, + "learning_rate": 0.003, + "loss": 4.0093, + "step": 47987 + }, + { + "epoch": 0.47988, + "grad_norm": 0.9134060850722475, + "learning_rate": 0.003, + "loss": 4.0165, + "step": 47988 + }, + { + "epoch": 0.47989, + "grad_norm": 1.0246362054317033, + "learning_rate": 0.003, + "loss": 4.0119, + "step": 47989 + }, + { + "epoch": 0.4799, + "grad_norm": 0.9267057478128473, + "learning_rate": 0.003, + "loss": 4.0375, + "step": 47990 + }, + { + "epoch": 0.47991, + "grad_norm": 1.0113151701285072, + "learning_rate": 0.003, + "loss": 4.0208, + "step": 47991 + }, + { + "epoch": 0.47992, + "grad_norm": 0.9102546626680256, + "learning_rate": 0.003, + "loss": 4.0775, + "step": 47992 + }, + { + "epoch": 0.47993, + "grad_norm": 0.9053053635897125, + "learning_rate": 0.003, + "loss": 4.048, + "step": 47993 + }, + { + "epoch": 0.47994, + "grad_norm": 0.9169771635623979, + "learning_rate": 0.003, + "loss": 4.0488, + "step": 47994 + }, + { + "epoch": 0.47995, + "grad_norm": 0.8693642632732881, + "learning_rate": 0.003, + "loss": 4.0581, + "step": 47995 + }, + { + "epoch": 0.47996, + "grad_norm": 0.9718401038106728, + "learning_rate": 0.003, + "loss": 4.0555, + "step": 47996 + }, + { + "epoch": 0.47997, + "grad_norm": 1.0483478668541324, + "learning_rate": 0.003, + "loss": 4.0448, + "step": 47997 + }, + { + "epoch": 0.47998, + "grad_norm": 0.8504460778650569, + "learning_rate": 0.003, + "loss": 4.0269, + "step": 47998 + }, + { + "epoch": 0.47999, + "grad_norm": 0.8770260657887297, + "learning_rate": 0.003, + "loss": 4.021, + "step": 47999 + }, + { + "epoch": 0.48, + "grad_norm": 0.9399290347100531, + "learning_rate": 0.003, + "loss": 4.072, + "step": 48000 + } + ], + "logging_steps": 1, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.902894857060352e+18, + "train_batch_size": 1024, + "trial_name": null, + "trial_params": null +}