|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5, |
|
"eval_steps": 500, |
|
"global_step": 838, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.0434608134535555, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7234, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.9514763639201144, |
|
"learning_rate": 4e-05, |
|
"loss": 0.5916, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.8903336235741379, |
|
"learning_rate": 6e-05, |
|
"loss": 0.5827, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.764603046688215, |
|
"learning_rate": 8e-05, |
|
"loss": 0.5555, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.8956967835024867, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6073, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.378526599131456, |
|
"learning_rate": 0.00012, |
|
"loss": 0.6204, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.1442512069955262, |
|
"learning_rate": 0.00014, |
|
"loss": 0.5452, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.9100913748328603, |
|
"learning_rate": 0.00016, |
|
"loss": 0.5858, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.6859557442210851, |
|
"learning_rate": 0.00018, |
|
"loss": 0.5182, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.8288606055568941, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6083, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.0417686973414615, |
|
"learning_rate": 0.0001999999558168346, |
|
"loss": 0.65, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.8756017006672853, |
|
"learning_rate": 0.00019999982326737747, |
|
"loss": 0.5837, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.7284541589078422, |
|
"learning_rate": 0.0001999996023517457, |
|
"loss": 0.5738, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.8602799120413903, |
|
"learning_rate": 0.0001999992930701345, |
|
"loss": 0.595, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.7938840633060059, |
|
"learning_rate": 0.00019999889542281728, |
|
"loss": 0.5907, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.7022632853348306, |
|
"learning_rate": 0.00019999840941014525, |
|
"loss": 0.5513, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.6531781263691616, |
|
"learning_rate": 0.00019999783503254803, |
|
"loss": 0.5475, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5808528104992969, |
|
"learning_rate": 0.0001999971722905331, |
|
"loss": 0.519, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5452643486331965, |
|
"learning_rate": 0.00019999642118468614, |
|
"loss": 0.5421, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5893567274117093, |
|
"learning_rate": 0.00019999558171567082, |
|
"loss": 0.6016, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5039905113559068, |
|
"learning_rate": 0.000199994653884229, |
|
"loss": 0.6096, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5236847425188783, |
|
"learning_rate": 0.00019999363769118055, |
|
"loss": 0.5845, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.4403777461558745, |
|
"learning_rate": 0.00019999253313742344, |
|
"loss": 0.5657, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.411935443568472, |
|
"learning_rate": 0.00019999134022393375, |
|
"loss": 0.5619, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3669921362459581, |
|
"learning_rate": 0.0001999900589517656, |
|
"loss": 0.6115, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3613613842516578, |
|
"learning_rate": 0.0001999886893220512, |
|
"loss": 0.5286, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.378560233146017, |
|
"learning_rate": 0.0001999872313360008, |
|
"loss": 0.5887, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3562687135057843, |
|
"learning_rate": 0.00019998568499490283, |
|
"loss": 0.5598, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.34581851507208355, |
|
"learning_rate": 0.00019998405030012371, |
|
"loss": 0.5772, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3812400800902662, |
|
"learning_rate": 0.00019998232725310796, |
|
"loss": 0.6154, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.2876811822631032, |
|
"learning_rate": 0.00019998051585537818, |
|
"loss": 0.4949, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3291934945038139, |
|
"learning_rate": 0.00019997861610853503, |
|
"loss": 0.5388, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.35220229516562385, |
|
"learning_rate": 0.00019997662801425725, |
|
"loss": 0.5801, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3908550087374589, |
|
"learning_rate": 0.00019997455157430165, |
|
"loss": 0.5783, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.38564043955621646, |
|
"learning_rate": 0.00019997238679050308, |
|
"loss": 0.5628, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3248792879576579, |
|
"learning_rate": 0.00019997013366477453, |
|
"loss": 0.5896, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.33243708146621687, |
|
"learning_rate": 0.00019996779219910696, |
|
"loss": 0.5618, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3589452470436555, |
|
"learning_rate": 0.00019996536239556942, |
|
"loss": 0.5387, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3635189263065437, |
|
"learning_rate": 0.0001999628442563091, |
|
"loss": 0.629, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.36761442942017947, |
|
"learning_rate": 0.00019996023778355113, |
|
"loss": 0.6133, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.30331424568033827, |
|
"learning_rate": 0.00019995754297959882, |
|
"loss": 0.5377, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3157701200247212, |
|
"learning_rate": 0.0001999547598468334, |
|
"loss": 0.6249, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3112819252913729, |
|
"learning_rate": 0.00019995188838771425, |
|
"loss": 0.5424, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.33384944369487113, |
|
"learning_rate": 0.0001999489286047788, |
|
"loss": 0.6014, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3330564524921197, |
|
"learning_rate": 0.00019994588050064243, |
|
"loss": 0.5469, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.2889450580843479, |
|
"learning_rate": 0.00019994274407799872, |
|
"loss": 0.512, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3273617072745067, |
|
"learning_rate": 0.00019993951933961913, |
|
"loss": 0.5456, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3228418979730564, |
|
"learning_rate": 0.00019993620628835332, |
|
"loss": 0.5716, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3439278828333003, |
|
"learning_rate": 0.0001999328049271289, |
|
"loss": 0.5177, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3186271172935729, |
|
"learning_rate": 0.0001999293152589515, |
|
"loss": 0.5502, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.38357922086488366, |
|
"learning_rate": 0.0001999257372869048, |
|
"loss": 0.6178, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.36013497860303273, |
|
"learning_rate": 0.00019992207101415053, |
|
"loss": 0.6278, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3146595251755829, |
|
"learning_rate": 0.00019991831644392848, |
|
"loss": 0.5348, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3986948949803995, |
|
"learning_rate": 0.00019991447357955639, |
|
"loss": 0.6331, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3194176204715625, |
|
"learning_rate": 0.00019991054242443008, |
|
"loss": 0.5817, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.29564051537116465, |
|
"learning_rate": 0.00019990652298202335, |
|
"loss": 0.545, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.2908246398716361, |
|
"learning_rate": 0.00019990241525588804, |
|
"loss": 0.5294, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3480952622658696, |
|
"learning_rate": 0.000199898219249654, |
|
"loss": 0.6282, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.4278607132919695, |
|
"learning_rate": 0.00019989393496702907, |
|
"loss": 0.7008, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3088760407735635, |
|
"learning_rate": 0.00019988956241179912, |
|
"loss": 0.5747, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3549589308890128, |
|
"learning_rate": 0.00019988510158782804, |
|
"loss": 0.615, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.28349959678885256, |
|
"learning_rate": 0.00019988055249905767, |
|
"loss": 0.577, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.35001480138074803, |
|
"learning_rate": 0.00019987591514950787, |
|
"loss": 0.5551, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.32895401860950285, |
|
"learning_rate": 0.00019987118954327654, |
|
"loss": 0.5617, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.347007326906862, |
|
"learning_rate": 0.00019986637568453945, |
|
"loss": 0.5935, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.30223641676037666, |
|
"learning_rate": 0.00019986147357755048, |
|
"loss": 0.5355, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.302279046184156, |
|
"learning_rate": 0.00019985648322664145, |
|
"loss": 0.5571, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3181910281320864, |
|
"learning_rate": 0.00019985140463622215, |
|
"loss": 0.5198, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.32334719229096776, |
|
"learning_rate": 0.0001998462378107803, |
|
"loss": 0.5063, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.31038499461943353, |
|
"learning_rate": 0.0001998409827548817, |
|
"loss": 0.5805, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3032049786542595, |
|
"learning_rate": 0.00019983563947316996, |
|
"loss": 0.564, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3345643555445713, |
|
"learning_rate": 0.00019983020797036683, |
|
"loss": 0.5442, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.32583720357675877, |
|
"learning_rate": 0.00019982468825127187, |
|
"loss": 0.5674, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.30278224625704836, |
|
"learning_rate": 0.0001998190803207627, |
|
"loss": 0.569, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.29996902392483177, |
|
"learning_rate": 0.0001998133841837948, |
|
"loss": 0.6142, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2947151895973628, |
|
"learning_rate": 0.00019980759984540168, |
|
"loss": 0.5084, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.33309005605837944, |
|
"learning_rate": 0.0001998017273106947, |
|
"loss": 0.5807, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.31281343399912853, |
|
"learning_rate": 0.00019979576658486325, |
|
"loss": 0.6299, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.30980839190781245, |
|
"learning_rate": 0.00019978971767317457, |
|
"loss": 0.5521, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2988115356408324, |
|
"learning_rate": 0.00019978358058097388, |
|
"loss": 0.5645, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2864799366004751, |
|
"learning_rate": 0.0001997773553136843, |
|
"loss": 0.5604, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.28284272149262185, |
|
"learning_rate": 0.00019977104187680688, |
|
"loss": 0.5964, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2807639116477172, |
|
"learning_rate": 0.00019976464027592053, |
|
"loss": 0.5441, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3169919212395633, |
|
"learning_rate": 0.00019975815051668217, |
|
"loss": 0.5672, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2827524094344841, |
|
"learning_rate": 0.0001997515726048265, |
|
"loss": 0.5631, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.29538492598974014, |
|
"learning_rate": 0.00019974490654616625, |
|
"loss": 0.609, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.29397994414783907, |
|
"learning_rate": 0.0001997381523465919, |
|
"loss": 0.5723, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2824005824065347, |
|
"learning_rate": 0.00019973131001207195, |
|
"loss": 0.5209, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2847018611508931, |
|
"learning_rate": 0.00019972437954865265, |
|
"loss": 0.5617, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2908040007926844, |
|
"learning_rate": 0.00019971736096245825, |
|
"loss": 0.5624, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.27754759063410545, |
|
"learning_rate": 0.00019971025425969083, |
|
"loss": 0.5353, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2885964599083646, |
|
"learning_rate": 0.0001997030594466303, |
|
"loss": 0.5181, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.29372725474160666, |
|
"learning_rate": 0.00019969577652963444, |
|
"loss": 0.5757, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.32149915053639194, |
|
"learning_rate": 0.0001996884055151389, |
|
"loss": 0.552, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2816717380191918, |
|
"learning_rate": 0.00019968094640965717, |
|
"loss": 0.4968, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2719020140724135, |
|
"learning_rate": 0.00019967339921978062, |
|
"loss": 0.5503, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.28166729851780475, |
|
"learning_rate": 0.00019966576395217837, |
|
"loss": 0.5546, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.27817598279558775, |
|
"learning_rate": 0.0001996580406135975, |
|
"loss": 0.6145, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3471492732103861, |
|
"learning_rate": 0.00019965022921086275, |
|
"loss": 0.6464, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2838977359279957, |
|
"learning_rate": 0.00019964232975087687, |
|
"loss": 0.5576, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2989119777268752, |
|
"learning_rate": 0.00019963434224062025, |
|
"loss": 0.5747, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.26079447242968457, |
|
"learning_rate": 0.0001996262666871512, |
|
"loss": 0.5144, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2904578974664885, |
|
"learning_rate": 0.00019961810309760577, |
|
"loss": 0.5623, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2682051539483259, |
|
"learning_rate": 0.00019960985147919778, |
|
"loss": 0.5722, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3004061592870477, |
|
"learning_rate": 0.00019960151183921897, |
|
"loss": 0.5526, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.27675115608209533, |
|
"learning_rate": 0.00019959308418503877, |
|
"loss": 0.5859, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.26526760651496173, |
|
"learning_rate": 0.00019958456852410433, |
|
"loss": 0.5395, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.29513224606753785, |
|
"learning_rate": 0.0001995759648639406, |
|
"loss": 0.59, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.27848396362609984, |
|
"learning_rate": 0.00019956727321215044, |
|
"loss": 0.6076, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.29804931563512865, |
|
"learning_rate": 0.00019955849357641424, |
|
"loss": 0.5555, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2756038079003531, |
|
"learning_rate": 0.00019954962596449024, |
|
"loss": 0.5542, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.28433942136507906, |
|
"learning_rate": 0.0001995406703842145, |
|
"loss": 0.5527, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.646736414676863, |
|
"learning_rate": 0.0001995316268435007, |
|
"loss": 0.7024, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.25030459489112267, |
|
"learning_rate": 0.00019952249535034025, |
|
"loss": 0.4928, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.30977646038996587, |
|
"learning_rate": 0.00019951327591280236, |
|
"loss": 0.5883, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2854791945432696, |
|
"learning_rate": 0.0001995039685390339, |
|
"loss": 0.6318, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3264119003161445, |
|
"learning_rate": 0.00019949457323725946, |
|
"loss": 0.5654, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.266512900378873, |
|
"learning_rate": 0.0001994850900157813, |
|
"loss": 0.5457, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.30259231663288877, |
|
"learning_rate": 0.0001994755188829794, |
|
"loss": 0.5828, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.33798187117798184, |
|
"learning_rate": 0.00019946585984731142, |
|
"loss": 0.5669, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2769767197030659, |
|
"learning_rate": 0.00019945611291731274, |
|
"loss": 0.5619, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2657775119925365, |
|
"learning_rate": 0.00019944627810159632, |
|
"loss": 0.59, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2918370192930011, |
|
"learning_rate": 0.00019943635540885279, |
|
"loss": 0.5816, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.33417238245851544, |
|
"learning_rate": 0.00019942634484785052, |
|
"loss": 0.5921, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2482914745160091, |
|
"learning_rate": 0.00019941624642743548, |
|
"loss": 0.5113, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.28272991564412037, |
|
"learning_rate": 0.0001994060601565313, |
|
"loss": 0.5543, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.35389083143384653, |
|
"learning_rate": 0.00019939578604413912, |
|
"loss": 0.5921, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3426787529883331, |
|
"learning_rate": 0.00019938542409933787, |
|
"loss": 0.6073, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.25621145606980844, |
|
"learning_rate": 0.000199374974331284, |
|
"loss": 0.5639, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.2996022840977615, |
|
"learning_rate": 0.00019936443674921158, |
|
"loss": 0.5874, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3283726508140091, |
|
"learning_rate": 0.0001993538113624323, |
|
"loss": 0.6295, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.2878523809947236, |
|
"learning_rate": 0.00019934309818033544, |
|
"loss": 0.5565, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.2764010556696928, |
|
"learning_rate": 0.0001993322972123878, |
|
"loss": 0.554, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.28827215996506156, |
|
"learning_rate": 0.0001993214084681338, |
|
"loss": 0.5788, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.2945105689397871, |
|
"learning_rate": 0.00019931043195719548, |
|
"loss": 0.5197, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.25439488455073433, |
|
"learning_rate": 0.00019929936768927232, |
|
"loss": 0.509, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.2746742447613063, |
|
"learning_rate": 0.00019928821567414144, |
|
"loss": 0.5479, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3033960913538536, |
|
"learning_rate": 0.00019927697592165747, |
|
"loss": 0.5859, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.28486244663010424, |
|
"learning_rate": 0.00019926564844175256, |
|
"loss": 0.5951, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3208016816012168, |
|
"learning_rate": 0.00019925423324443638, |
|
"loss": 0.5823, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3005362808247367, |
|
"learning_rate": 0.00019924273033979613, |
|
"loss": 0.5652, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.26910338931514155, |
|
"learning_rate": 0.0001992311397379965, |
|
"loss": 0.5463, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.28718562659091934, |
|
"learning_rate": 0.00019921946144927966, |
|
"loss": 0.5245, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2616953117781411, |
|
"learning_rate": 0.0001992076954839653, |
|
"loss": 0.5358, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3107257337368987, |
|
"learning_rate": 0.00019919584185245062, |
|
"loss": 0.5536, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.28427245738065265, |
|
"learning_rate": 0.00019918390056521018, |
|
"loss": 0.6126, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.25398310452304734, |
|
"learning_rate": 0.00019917187163279605, |
|
"loss": 0.5068, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2556256469730818, |
|
"learning_rate": 0.00019915975506583778, |
|
"loss": 0.5416, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2611702329742577, |
|
"learning_rate": 0.00019914755087504236, |
|
"loss": 0.5276, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2789045987166575, |
|
"learning_rate": 0.00019913525907119418, |
|
"loss": 0.5591, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.26837658503581957, |
|
"learning_rate": 0.000199122879665155, |
|
"loss": 0.6581, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.34601396912277804, |
|
"learning_rate": 0.0001991104126678641, |
|
"loss": 0.5394, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.25684957257052443, |
|
"learning_rate": 0.00019909785809033806, |
|
"loss": 0.5392, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2906797315813485, |
|
"learning_rate": 0.00019908521594367098, |
|
"loss": 0.5185, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2852843546202924, |
|
"learning_rate": 0.0001990724862390342, |
|
"loss": 0.5436, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2875355300862882, |
|
"learning_rate": 0.0001990596689876765, |
|
"loss": 0.6009, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.32052910212305513, |
|
"learning_rate": 0.00019904676420092404, |
|
"loss": 0.5831, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.266884162852661, |
|
"learning_rate": 0.00019903377189018024, |
|
"loss": 0.5459, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2957365744895018, |
|
"learning_rate": 0.000199020692066926, |
|
"loss": 0.5211, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.24951992931808137, |
|
"learning_rate": 0.00019900752474271945, |
|
"loss": 0.497, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.30509964150122953, |
|
"learning_rate": 0.0001989942699291961, |
|
"loss": 0.5812, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2790293776337124, |
|
"learning_rate": 0.0001989809276380687, |
|
"loss": 0.5856, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.24940387850774506, |
|
"learning_rate": 0.00019896749788112737, |
|
"loss": 0.5281, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2664890107453781, |
|
"learning_rate": 0.0001989539806702395, |
|
"loss": 0.524, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2896608423493073, |
|
"learning_rate": 0.0001989403760173497, |
|
"loss": 0.5171, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2544937836162412, |
|
"learning_rate": 0.00019892668393447997, |
|
"loss": 0.5546, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2626626371027326, |
|
"learning_rate": 0.00019891290443372944, |
|
"loss": 0.5498, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.281410490858952, |
|
"learning_rate": 0.0001988990375272746, |
|
"loss": 0.5377, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.3376943176164128, |
|
"learning_rate": 0.0001988850832273691, |
|
"loss": 0.5469, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2507691377758427, |
|
"learning_rate": 0.0001988710415463439, |
|
"loss": 0.549, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.27178996752570117, |
|
"learning_rate": 0.00019885691249660702, |
|
"loss": 0.5636, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.3359421766962587, |
|
"learning_rate": 0.00019884269609064386, |
|
"loss": 0.5957, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2638709645045905, |
|
"learning_rate": 0.0001988283923410169, |
|
"loss": 0.5793, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.25585919726912226, |
|
"learning_rate": 0.00019881400126036582, |
|
"loss": 0.5817, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2905067973645414, |
|
"learning_rate": 0.00019879952286140754, |
|
"loss": 0.5585, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.24197399766587002, |
|
"learning_rate": 0.0001987849571569361, |
|
"loss": 0.507, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.28898034252731664, |
|
"learning_rate": 0.0001987703041598226, |
|
"loss": 0.5981, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.26516349701479863, |
|
"learning_rate": 0.00019875556388301543, |
|
"loss": 0.56, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.27235027968517367, |
|
"learning_rate": 0.00019874073633953997, |
|
"loss": 0.5872, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2692241567318253, |
|
"learning_rate": 0.00019872582154249884, |
|
"loss": 0.5397, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2560507155942398, |
|
"learning_rate": 0.00019871081950507163, |
|
"loss": 0.5431, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.26691224099103567, |
|
"learning_rate": 0.00019869573024051517, |
|
"loss": 0.5608, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2961375506924155, |
|
"learning_rate": 0.00019868055376216323, |
|
"loss": 0.5784, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.26055755072015874, |
|
"learning_rate": 0.00019866529008342673, |
|
"loss": 0.5369, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2525359310079611, |
|
"learning_rate": 0.00019864993921779361, |
|
"loss": 0.5438, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.249327141855566, |
|
"learning_rate": 0.0001986345011788289, |
|
"loss": 0.5668, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2983950007732028, |
|
"learning_rate": 0.00019861897598017457, |
|
"loss": 0.5271, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.25610455444964525, |
|
"learning_rate": 0.00019860336363554973, |
|
"loss": 0.6012, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.24760444410184018, |
|
"learning_rate": 0.0001985876641587504, |
|
"loss": 0.5066, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2614264060863463, |
|
"learning_rate": 0.00019857187756364958, |
|
"loss": 0.5792, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.27219045408444215, |
|
"learning_rate": 0.00019855600386419744, |
|
"loss": 0.543, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.24606131498871828, |
|
"learning_rate": 0.00019854004307442088, |
|
"loss": 0.5676, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.28394763236035964, |
|
"learning_rate": 0.0001985239952084239, |
|
"loss": 0.6032, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.28350170917034406, |
|
"learning_rate": 0.0001985078602803874, |
|
"loss": 0.6264, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.24011552907338696, |
|
"learning_rate": 0.00019849163830456922, |
|
"loss": 0.4793, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.2561209086280576, |
|
"learning_rate": 0.00019847532929530415, |
|
"loss": 0.6198, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.23712886628255178, |
|
"learning_rate": 0.00019845893326700384, |
|
"loss": 0.4989, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.26720592489417233, |
|
"learning_rate": 0.00019844245023415685, |
|
"loss": 0.4934, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.2753251397417421, |
|
"learning_rate": 0.0001984258802113287, |
|
"loss": 0.5544, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.2557869713293877, |
|
"learning_rate": 0.0001984092232131616, |
|
"loss": 0.5643, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.2669651919314609, |
|
"learning_rate": 0.0001983924792543748, |
|
"loss": 0.5879, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.25579187132615644, |
|
"learning_rate": 0.00019837564834976432, |
|
"loss": 0.5742, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.2550207949421237, |
|
"learning_rate": 0.000198358730514203, |
|
"loss": 0.574, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.23565090455665763, |
|
"learning_rate": 0.0001983417257626405, |
|
"loss": 0.5299, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.236980034600526, |
|
"learning_rate": 0.00019832463411010331, |
|
"loss": 0.5199, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.2434029841189093, |
|
"learning_rate": 0.0001983074555716947, |
|
"loss": 0.5477, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.24771511082227154, |
|
"learning_rate": 0.00019829019016259468, |
|
"loss": 0.5697, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.23705880771864213, |
|
"learning_rate": 0.00019827283789806011, |
|
"loss": 0.521, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.24167295291353477, |
|
"learning_rate": 0.0001982553987934245, |
|
"loss": 0.558, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.2535406245356529, |
|
"learning_rate": 0.0001982378728640982, |
|
"loss": 0.5693, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.24865334136075806, |
|
"learning_rate": 0.00019822026012556818, |
|
"loss": 0.5499, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.2544751551481819, |
|
"learning_rate": 0.0001982025605933982, |
|
"loss": 0.5449, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.2599391794330939, |
|
"learning_rate": 0.0001981847742832287, |
|
"loss": 0.6222, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.23171934920449544, |
|
"learning_rate": 0.00019816690121077674, |
|
"loss": 0.5448, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.24380268930715565, |
|
"learning_rate": 0.00019814894139183614, |
|
"loss": 0.5773, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.2518755786374484, |
|
"learning_rate": 0.00019813089484227732, |
|
"loss": 0.5479, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.23133984467720642, |
|
"learning_rate": 0.00019811276157804733, |
|
"loss": 0.471, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.250968947574734, |
|
"learning_rate": 0.00019809454161516993, |
|
"loss": 0.5738, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.25976015596485974, |
|
"learning_rate": 0.00019807623496974537, |
|
"loss": 0.5592, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.24400199531999783, |
|
"learning_rate": 0.0001980578416579506, |
|
"loss": 0.5266, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.24001572180370875, |
|
"learning_rate": 0.00019803936169603912, |
|
"loss": 0.5843, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.22867208326764507, |
|
"learning_rate": 0.00019802079510034096, |
|
"loss": 0.518, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.2381724022911579, |
|
"learning_rate": 0.00019800214188726276, |
|
"loss": 0.5175, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.2700455397530704, |
|
"learning_rate": 0.00019798340207328766, |
|
"loss": 0.5804, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.24320219539003604, |
|
"learning_rate": 0.00019796457567497537, |
|
"loss": 0.5304, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.2370472610839002, |
|
"learning_rate": 0.0001979456627089621, |
|
"loss": 0.5671, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.26756212643991917, |
|
"learning_rate": 0.0001979266631919605, |
|
"loss": 0.5528, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.24929490389202372, |
|
"learning_rate": 0.00019790757714075979, |
|
"loss": 0.5407, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.23090595152280974, |
|
"learning_rate": 0.00019788840457222556, |
|
"loss": 0.5258, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.2979564627406142, |
|
"learning_rate": 0.0001978691455033, |
|
"loss": 0.5367, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.24228845587479894, |
|
"learning_rate": 0.0001978497999510015, |
|
"loss": 0.5344, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.25363482164729867, |
|
"learning_rate": 0.00019783036793242516, |
|
"loss": 0.5669, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.23622712417060854, |
|
"learning_rate": 0.00019781084946474226, |
|
"loss": 0.5797, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.21594682302559634, |
|
"learning_rate": 0.00019779124456520056, |
|
"loss": 0.5011, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.24211833950801223, |
|
"learning_rate": 0.0001977715532511242, |
|
"loss": 0.5164, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.2693820157715391, |
|
"learning_rate": 0.0001977517755399137, |
|
"loss": 0.5806, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.24734981937576542, |
|
"learning_rate": 0.00019773191144904586, |
|
"loss": 0.5233, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.2741663737268136, |
|
"learning_rate": 0.00019771196099607386, |
|
"loss": 0.5402, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.2521489075033339, |
|
"learning_rate": 0.00019769192419862716, |
|
"loss": 0.5862, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.2633255671236209, |
|
"learning_rate": 0.0001976718010744116, |
|
"loss": 0.548, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.2691399721238696, |
|
"learning_rate": 0.00019765159164120916, |
|
"loss": 0.5648, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.25501545746966797, |
|
"learning_rate": 0.00019763129591687827, |
|
"loss": 0.5602, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.3049976908839563, |
|
"learning_rate": 0.00019761091391935347, |
|
"loss": 0.5508, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.24467331400916031, |
|
"learning_rate": 0.00019759044566664558, |
|
"loss": 0.5229, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.27011009612786374, |
|
"learning_rate": 0.00019756989117684164, |
|
"loss": 0.5448, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.24427143044387528, |
|
"learning_rate": 0.00019754925046810493, |
|
"loss": 0.5435, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.22753961311031143, |
|
"learning_rate": 0.00019752852355867486, |
|
"loss": 0.5369, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.23865046003559778, |
|
"learning_rate": 0.00019750771046686704, |
|
"loss": 0.5354, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.2736283930569903, |
|
"learning_rate": 0.00019748681121107325, |
|
"loss": 0.5588, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.24727127426749082, |
|
"learning_rate": 0.00019746582580976136, |
|
"loss": 0.5753, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.2828829340227291, |
|
"learning_rate": 0.00019744475428147546, |
|
"loss": 0.6793, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.21818184663896711, |
|
"learning_rate": 0.00019742359664483563, |
|
"loss": 0.5248, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.2320708306833192, |
|
"learning_rate": 0.00019740235291853812, |
|
"loss": 0.5461, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.25703347930088793, |
|
"learning_rate": 0.00019738102312135523, |
|
"loss": 0.5713, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.24399588128874033, |
|
"learning_rate": 0.0001973596072721353, |
|
"loss": 0.5178, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.2229881452119291, |
|
"learning_rate": 0.00019733810538980281, |
|
"loss": 0.5144, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.23889465035265364, |
|
"learning_rate": 0.0001973165174933581, |
|
"loss": 0.5727, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.25790569964877214, |
|
"learning_rate": 0.0001972948436018776, |
|
"loss": 0.5659, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.22511338701135042, |
|
"learning_rate": 0.00019727308373451377, |
|
"loss": 0.5292, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.23111498863739158, |
|
"learning_rate": 0.000197251237910495, |
|
"loss": 0.5267, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.23740021982137896, |
|
"learning_rate": 0.00019722930614912563, |
|
"loss": 0.5499, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.24020258332985106, |
|
"learning_rate": 0.00019720728846978598, |
|
"loss": 0.5604, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.23947573439011133, |
|
"learning_rate": 0.00019718518489193225, |
|
"loss": 0.5638, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.23526187217481284, |
|
"learning_rate": 0.00019716299543509654, |
|
"loss": 0.5436, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2505831003191156, |
|
"learning_rate": 0.00019714072011888686, |
|
"loss": 0.5039, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.21592487431784965, |
|
"learning_rate": 0.00019711835896298713, |
|
"loss": 0.484, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.26528309878122613, |
|
"learning_rate": 0.00019709591198715707, |
|
"loss": 0.539, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.21635789850987178, |
|
"learning_rate": 0.00019707337921123221, |
|
"loss": 0.5553, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.23671151623321054, |
|
"learning_rate": 0.00019705076065512398, |
|
"loss": 0.4968, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.25400871793456326, |
|
"learning_rate": 0.00019702805633881957, |
|
"loss": 0.5982, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2622810971314154, |
|
"learning_rate": 0.0001970052662823819, |
|
"loss": 0.5879, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.22931814830456296, |
|
"learning_rate": 0.00019698239050594977, |
|
"loss": 0.5611, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.213695866193263, |
|
"learning_rate": 0.0001969594290297376, |
|
"loss": 0.5386, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2431252328609808, |
|
"learning_rate": 0.00019693638187403563, |
|
"loss": 0.6039, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.23108667253454973, |
|
"learning_rate": 0.00019691324905920984, |
|
"loss": 0.5579, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.22718831415064272, |
|
"learning_rate": 0.0001968900306057018, |
|
"loss": 0.5196, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.23632362967796033, |
|
"learning_rate": 0.0001968667265340288, |
|
"loss": 0.5336, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.2421878973201691, |
|
"learning_rate": 0.00019684333686478383, |
|
"loss": 0.5928, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.225775487602821, |
|
"learning_rate": 0.00019681986161863542, |
|
"loss": 0.552, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.23037375759338816, |
|
"learning_rate": 0.00019679630081632782, |
|
"loss": 0.4983, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.24684136612832333, |
|
"learning_rate": 0.00019677265447868086, |
|
"loss": 0.5655, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.2412756534364674, |
|
"learning_rate": 0.0001967489226265899, |
|
"loss": 0.5063, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.22005765622474396, |
|
"learning_rate": 0.00019672510528102597, |
|
"loss": 0.5188, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.25071531725514384, |
|
"learning_rate": 0.0001967012024630355, |
|
"loss": 0.5938, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.22139592405512468, |
|
"learning_rate": 0.00019667721419374065, |
|
"loss": 0.4917, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.23067244251762076, |
|
"learning_rate": 0.00019665314049433888, |
|
"loss": 0.5584, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.23829875535152545, |
|
"learning_rate": 0.00019662898138610323, |
|
"loss": 0.5264, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.2641034663020514, |
|
"learning_rate": 0.00019660473689038228, |
|
"loss": 0.5805, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.22321690487140503, |
|
"learning_rate": 0.00019658040702859997, |
|
"loss": 0.5529, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.2502632163555589, |
|
"learning_rate": 0.00019655599182225565, |
|
"loss": 0.5347, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.2392608020883604, |
|
"learning_rate": 0.00019653149129292426, |
|
"loss": 0.5263, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.2539237490519494, |
|
"learning_rate": 0.00019650690546225592, |
|
"loss": 0.5156, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.21964099103511592, |
|
"learning_rate": 0.00019648223435197627, |
|
"loss": 0.5101, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.24992985700157416, |
|
"learning_rate": 0.00019645747798388628, |
|
"loss": 0.5621, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2559439615345381, |
|
"learning_rate": 0.0001964326363798622, |
|
"loss": 0.5753, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2504368010690795, |
|
"learning_rate": 0.00019640770956185567, |
|
"loss": 0.5558, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.21022187251218089, |
|
"learning_rate": 0.0001963826975518936, |
|
"loss": 0.5322, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2422143856532352, |
|
"learning_rate": 0.00019635760037207817, |
|
"loss": 0.538, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.23174554470584352, |
|
"learning_rate": 0.00019633241804458687, |
|
"loss": 0.5545, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.267070426953347, |
|
"learning_rate": 0.00019630715059167238, |
|
"loss": 0.5632, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.22256523603127878, |
|
"learning_rate": 0.0001962817980356626, |
|
"loss": 0.545, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.24403300497950306, |
|
"learning_rate": 0.0001962563603989607, |
|
"loss": 0.5448, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2193103621019292, |
|
"learning_rate": 0.00019623083770404492, |
|
"loss": 0.5064, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.23299992325798072, |
|
"learning_rate": 0.0001962052299734688, |
|
"loss": 0.5192, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2371054150083945, |
|
"learning_rate": 0.00019617953722986096, |
|
"loss": 0.5157, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2436064901189273, |
|
"learning_rate": 0.00019615375949592504, |
|
"loss": 0.5672, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.25098365347678436, |
|
"learning_rate": 0.00019612789679443997, |
|
"loss": 0.5548, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2319425382974216, |
|
"learning_rate": 0.00019610194914825962, |
|
"loss": 0.5293, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.24156576209403272, |
|
"learning_rate": 0.000196075916580313, |
|
"loss": 0.5672, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2337383575844323, |
|
"learning_rate": 0.0001960497991136041, |
|
"loss": 0.5509, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.23799692988502053, |
|
"learning_rate": 0.00019602359677121199, |
|
"loss": 0.5604, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2296728275122706, |
|
"learning_rate": 0.0001959973095762907, |
|
"loss": 0.5371, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.22381626870035518, |
|
"learning_rate": 0.00019597093755206936, |
|
"loss": 0.5465, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.23335681761234933, |
|
"learning_rate": 0.00019594448072185182, |
|
"loss": 0.5386, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.22582265649304345, |
|
"learning_rate": 0.00019591793910901707, |
|
"loss": 0.543, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2439330072441743, |
|
"learning_rate": 0.00019589131273701894, |
|
"loss": 0.5177, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2138593422237162, |
|
"learning_rate": 0.00019586460162938622, |
|
"loss": 0.5157, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.24003613679646058, |
|
"learning_rate": 0.00019583780580972253, |
|
"loss": 0.5611, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2552582800734971, |
|
"learning_rate": 0.00019581092530170633, |
|
"loss": 0.5922, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.21898423827197905, |
|
"learning_rate": 0.00019578396012909092, |
|
"loss": 0.5272, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.22013525107478477, |
|
"learning_rate": 0.00019575691031570446, |
|
"loss": 0.5184, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.21113180640163418, |
|
"learning_rate": 0.00019572977588544986, |
|
"loss": 0.5134, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.22335860079943387, |
|
"learning_rate": 0.00019570255686230485, |
|
"loss": 0.5227, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.23006684721287293, |
|
"learning_rate": 0.00019567525327032187, |
|
"loss": 0.5885, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.21933564641390155, |
|
"learning_rate": 0.0001956478651336281, |
|
"loss": 0.5598, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.21770749652400337, |
|
"learning_rate": 0.00019562039247642546, |
|
"loss": 0.5082, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.22800344133653658, |
|
"learning_rate": 0.00019559283532299043, |
|
"loss": 0.5539, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2385574924163353, |
|
"learning_rate": 0.00019556519369767438, |
|
"loss": 0.5497, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.23099538598172079, |
|
"learning_rate": 0.0001955374676249031, |
|
"loss": 0.5138, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.21517217725478144, |
|
"learning_rate": 0.0001955096571291772, |
|
"loss": 0.5051, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.21535771106277588, |
|
"learning_rate": 0.0001954817622350717, |
|
"loss": 0.524, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.20361747402971658, |
|
"learning_rate": 0.00019545378296723635, |
|
"loss": 0.4989, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.24644921068325687, |
|
"learning_rate": 0.0001954257193503954, |
|
"loss": 0.5927, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.24765268362172385, |
|
"learning_rate": 0.0001953975714093476, |
|
"loss": 0.5451, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.20846277824915477, |
|
"learning_rate": 0.00019536933916896633, |
|
"loss": 0.5259, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2457371199220107, |
|
"learning_rate": 0.00019534102265419932, |
|
"loss": 0.5784, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.23029745387228598, |
|
"learning_rate": 0.00019531262189006882, |
|
"loss": 0.5918, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2387820151516941, |
|
"learning_rate": 0.0001952841369016716, |
|
"loss": 0.5576, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.226451643448924, |
|
"learning_rate": 0.00019525556771417875, |
|
"loss": 0.5241, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.22086691724075064, |
|
"learning_rate": 0.00019522691435283585, |
|
"loss": 0.5392, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2259720671796772, |
|
"learning_rate": 0.00019519817684296285, |
|
"loss": 0.516, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2244741513315317, |
|
"learning_rate": 0.00019516935520995393, |
|
"loss": 0.569, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.23890602213836412, |
|
"learning_rate": 0.0001951404494792778, |
|
"loss": 0.524, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.22136745892767679, |
|
"learning_rate": 0.00019511145967647737, |
|
"loss": 0.5472, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.22275740066078306, |
|
"learning_rate": 0.00019508238582716984, |
|
"loss": 0.5553, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.21225155652808625, |
|
"learning_rate": 0.00019505322795704676, |
|
"loss": 0.5302, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.22704101844750724, |
|
"learning_rate": 0.0001950239860918738, |
|
"loss": 0.5485, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.2135110250199134, |
|
"learning_rate": 0.00019499466025749097, |
|
"loss": 0.5343, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.22772242632973722, |
|
"learning_rate": 0.00019496525047981242, |
|
"loss": 0.5159, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.4444297049160113, |
|
"learning_rate": 0.00019493575678482649, |
|
"loss": 0.5121, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.226632712040011, |
|
"learning_rate": 0.0001949061791985957, |
|
"loss": 0.5304, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.22132303156586286, |
|
"learning_rate": 0.00019487651774725663, |
|
"loss": 0.4817, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.23206581340772667, |
|
"learning_rate": 0.00019484677245702004, |
|
"loss": 0.5258, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.2374903834541946, |
|
"learning_rate": 0.0001948169433541708, |
|
"loss": 0.5318, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.22896458770920267, |
|
"learning_rate": 0.00019478703046506773, |
|
"loss": 0.4806, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.21040301706147688, |
|
"learning_rate": 0.00019475703381614375, |
|
"loss": 0.5144, |
|
"step": 356 |
|
}, |
|
{ |
"epoch": 0.21,
"grad_norm": 0.21179618454444762,
"learning_rate": 0.00019472695343390585,
"loss": 0.524,
"step": 357
},
{
"epoch": 0.21,
"grad_norm": 0.20436614333218908,
"learning_rate": 0.00019469678934493488,
"loss": 0.501,
"step": 358
},
{
"epoch": 0.21,
"grad_norm": 0.2478256130980173,
"learning_rate": 0.0001946665415758858,
"loss": 0.5386,
"step": 359
},
{
"epoch": 0.21,
"grad_norm": 0.226116084636948,
"learning_rate": 0.00019463621015348748,
"loss": 0.5101,
"step": 360
},
{
"epoch": 0.22,
"grad_norm": 0.21838947264457534,
"learning_rate": 0.00019460579510454263,
"loss": 0.5296,
"step": 361
},
{
"epoch": 0.22,
"grad_norm": 0.2152879498375444,
"learning_rate": 0.00019457529645592792,
"loss": 0.512,
"step": 362
},
{
"epoch": 0.22,
"grad_norm": 0.22514971802642378,
"learning_rate": 0.00019454471423459389,
"loss": 0.5593,
"step": 363
},
{
"epoch": 0.22,
"grad_norm": 0.23402386101532432,
"learning_rate": 0.00019451404846756494,
"loss": 0.555,
"step": 364
},
{
"epoch": 0.22,
"grad_norm": 0.2244514913016572,
"learning_rate": 0.00019448329918193927,
"loss": 0.5689,
"step": 365
},
{
"epoch": 0.22,
"grad_norm": 0.22260707231596893,
"learning_rate": 0.00019445246640488893,
"loss": 0.6062,
"step": 366
},
{
"epoch": 0.22,
"grad_norm": 0.21791090145253736,
"learning_rate": 0.00019442155016365965,
"loss": 0.531,
"step": 367
},
{
"epoch": 0.22,
"grad_norm": 0.21895889257567258,
"learning_rate": 0.00019439055048557101,
"loss": 0.5538,
"step": 368
},
{
"epoch": 0.22,
"grad_norm": 0.21306696799872818,
"learning_rate": 0.00019435946739801633,
"loss": 0.5673,
"step": 369
},
{
"epoch": 0.22,
"grad_norm": 0.2294607768810707,
"learning_rate": 0.00019432830092846253,
"loss": 0.5855,
"step": 370
},
{
"epoch": 0.22,
"grad_norm": 0.22758409665267085,
"learning_rate": 0.0001942970511044503,
"loss": 0.5783,
"step": 371
},
{
"epoch": 0.22,
"grad_norm": 0.21334100614200935,
"learning_rate": 0.00019426571795359398,
"loss": 0.5056,
"step": 372
},
{
"epoch": 0.22,
"grad_norm": 0.24187624093875965,
"learning_rate": 0.0001942343015035815,
"loss": 0.543,
"step": 373
},
{
"epoch": 0.22,
"grad_norm": 0.2275714845035408,
"learning_rate": 0.00019420280178217443,
"loss": 0.5329,
"step": 374
},
{
"epoch": 0.22,
"grad_norm": 0.23237641477505608,
"learning_rate": 0.00019417121881720793,
"loss": 0.5134,
"step": 375
},
{
"epoch": 0.22,
"grad_norm": 0.25196886008416386,
"learning_rate": 0.0001941395526365907,
"loss": 0.6023,
"step": 376
},
{
"epoch": 0.22,
"grad_norm": 0.22418514390796682,
"learning_rate": 0.00019410780326830498,
"loss": 0.5529,
"step": 377
},
{
"epoch": 0.23,
"grad_norm": 0.21438856736265666,
"learning_rate": 0.0001940759707404065,
"loss": 0.5134,
"step": 378
},
{
"epoch": 0.23,
"grad_norm": 0.2331754234870151,
"learning_rate": 0.00019404405508102455,
"loss": 0.5406,
"step": 379
},
{
"epoch": 0.23,
"grad_norm": 0.24908239322819828,
"learning_rate": 0.00019401205631836178,
"loss": 0.5377,
"step": 380
},
{
"epoch": 0.23,
"grad_norm": 0.21332745391417657,
"learning_rate": 0.00019397997448069435,
"loss": 0.5025,
"step": 381
},
{
"epoch": 0.23,
"grad_norm": 0.20749658696001225,
"learning_rate": 0.00019394780959637177,
"loss": 0.5257,
"step": 382
},
{
"epoch": 0.23,
"grad_norm": 0.2237716482529178,
"learning_rate": 0.000193915561693817,
"loss": 0.5,
"step": 383
},
{
"epoch": 0.23,
"grad_norm": 0.25234282015654147,
"learning_rate": 0.00019388323080152633,
"loss": 0.5753,
"step": 384
},
{
"epoch": 0.23,
"grad_norm": 0.2252939326339829,
"learning_rate": 0.00019385081694806936,
"loss": 0.5662,
"step": 385
},
{
"epoch": 0.23,
"grad_norm": 0.21979629294660186,
"learning_rate": 0.00019381832016208904,
"loss": 0.5141,
"step": 386
},
{
"epoch": 0.23,
"grad_norm": 0.24762535901866153,
"learning_rate": 0.0001937857404723016,
"loss": 0.6193,
"step": 387
},
{
"epoch": 0.23,
"grad_norm": 0.25032044234085526,
"learning_rate": 0.00019375307790749647,
"loss": 0.5024,
"step": 388
},
{
"epoch": 0.23,
"grad_norm": 0.22892425302508923,
"learning_rate": 0.0001937203324965364,
"loss": 0.5401,
"step": 389
},
{
"epoch": 0.23,
"grad_norm": 0.2461599771002527,
"learning_rate": 0.0001936875042683573,
"loss": 0.5301,
"step": 390
},
{
"epoch": 0.23,
"grad_norm": 0.22363255721865732,
"learning_rate": 0.00019365459325196825,
"loss": 0.5538,
"step": 391
},
{
"epoch": 0.23,
"grad_norm": 0.22482667580972365,
"learning_rate": 0.00019362159947645152,
"loss": 0.4928,
"step": 392
},
{
"epoch": 0.23,
"grad_norm": 0.22869596173751142,
"learning_rate": 0.00019358852297096253,
"loss": 0.5546,
"step": 393
},
{
"epoch": 0.24,
"grad_norm": 0.2274546469780496,
"learning_rate": 0.00019355536376472972,
"loss": 0.5763,
"step": 394
},
{
"epoch": 0.24,
"grad_norm": 0.21284874650406885,
"learning_rate": 0.0001935221218870547,
"loss": 0.5778,
"step": 395
},
{
"epoch": 0.24,
"grad_norm": 0.23158847478661296,
"learning_rate": 0.0001934887973673121,
"loss": 0.5654,
"step": 396
},
{
"epoch": 0.24,
"grad_norm": 0.24510006704514478,
"learning_rate": 0.0001934553902349496,
"loss": 0.5053,
"step": 397
},
{
"epoch": 0.24,
"grad_norm": 0.20330878586204656,
"learning_rate": 0.00019342190051948777,
"loss": 0.5171,
"step": 398
},
{
"epoch": 0.24,
"grad_norm": 0.2131804710318274,
"learning_rate": 0.0001933883282505203,
"loss": 0.5286,
"step": 399
},
{
"epoch": 0.24,
"grad_norm": 0.23297933515492006,
"learning_rate": 0.00019335467345771377,
"loss": 0.5593,
"step": 400
},
{
"epoch": 0.24,
"grad_norm": 0.24611434220143105,
"learning_rate": 0.0001933209361708077,
"loss": 0.604,
"step": 401
},
{
"epoch": 0.24,
"grad_norm": 0.23281321736249425,
"learning_rate": 0.00019328711641961445,
"loss": 0.5579,
"step": 402
},
{
"epoch": 0.24,
"grad_norm": 0.21399822113981087,
"learning_rate": 0.00019325321423401933,
"loss": 0.5661,
"step": 403
},
{
"epoch": 0.24,
"grad_norm": 0.22113107520723113,
"learning_rate": 0.00019321922964398046,
"loss": 0.5789,
"step": 404
},
{
"epoch": 0.24,
"grad_norm": 0.23262880002546846,
"learning_rate": 0.00019318516267952874,
"loss": 0.5447,
"step": 405
},
{
"epoch": 0.24,
"grad_norm": 0.24962941770082592,
"learning_rate": 0.00019315101337076792,
"loss": 0.5512,
"step": 406
},
{
"epoch": 0.24,
"grad_norm": 0.22210049422713798,
"learning_rate": 0.0001931167817478745,
"loss": 0.5427,
"step": 407
},
{
"epoch": 0.24,
"grad_norm": 0.22647809883332484,
"learning_rate": 0.0001930824678410977,
"loss": 0.4888,
"step": 408
},
{
"epoch": 0.24,
"grad_norm": 0.23660763255678552,
"learning_rate": 0.00019304807168075944,
"loss": 0.5755,
"step": 409
},
{
"epoch": 0.24,
"grad_norm": 0.2354103448271752,
"learning_rate": 0.00019301359329725436,
"loss": 0.5265,
"step": 410
},
{
"epoch": 0.25,
"grad_norm": 0.24322261128085423,
"learning_rate": 0.00019297903272104977,
"loss": 0.5291,
"step": 411
},
{
"epoch": 0.25,
"grad_norm": 0.20525199182278092,
"learning_rate": 0.00019294438998268554,
"loss": 0.4996,
"step": 412
},
{
"epoch": 0.25,
"grad_norm": 0.24678535182755174,
"learning_rate": 0.00019290966511277422,
"loss": 0.567,
"step": 413
},
{
"epoch": 0.25,
"grad_norm": 0.22165331172413838,
"learning_rate": 0.00019287485814200087,
"loss": 0.5348,
"step": 414
},
{
"epoch": 0.25,
"grad_norm": 0.24541020782476444,
"learning_rate": 0.00019283996910112318,
"loss": 0.5432,
"step": 415
},
{
"epoch": 0.25,
"grad_norm": 0.2255959168063083,
"learning_rate": 0.00019280499802097126,
"loss": 0.5891,
"step": 416
},
{
"epoch": 0.25,
"grad_norm": 0.21159018099714821,
"learning_rate": 0.0001927699449324478,
"loss": 0.5003,
"step": 417
},
{
"epoch": 0.25,
"grad_norm": 0.21379995902020923,
"learning_rate": 0.00019273480986652794,
"loss": 0.5314,
"step": 418
},
{
"epoch": 0.25,
"grad_norm": 0.2853169518220406,
"learning_rate": 0.0001926995928542592,
"loss": 0.6108,
"step": 419
},
{
"epoch": 0.25,
"grad_norm": 0.22738285867292138,
"learning_rate": 0.00019266429392676164,
"loss": 0.5217,
"step": 420
},
{
"epoch": 0.25,
"grad_norm": 0.23835369502554374,
"learning_rate": 0.00019262891311522755,
"loss": 0.5318,
"step": 421
},
{
"epoch": 0.25,
"grad_norm": 0.20671557324330114,
"learning_rate": 0.0001925934504509217,
"loss": 0.5234,
"step": 422
},
{
"epoch": 0.25,
"grad_norm": 0.205212164360302,
"learning_rate": 0.00019255790596518112,
"loss": 0.5023,
"step": 423
},
{
"epoch": 0.25,
"grad_norm": 0.21664090577036152,
"learning_rate": 0.00019252227968941522,
"loss": 0.5452,
"step": 424
},
{
"epoch": 0.25,
"grad_norm": 0.22146041084684798,
"learning_rate": 0.00019248657165510556,
"loss": 0.5474,
"step": 425
},
{
"epoch": 0.25,
"grad_norm": 0.2338997589574809,
"learning_rate": 0.00019245078189380604,
"loss": 0.5516,
"step": 426
},
{
"epoch": 0.25,
"grad_norm": 0.2313978280927526,
"learning_rate": 0.0001924149104371428,
"loss": 0.5831,
"step": 427
},
{
"epoch": 0.26,
"grad_norm": 0.2098577112814155,
"learning_rate": 0.00019237895731681408,
"loss": 0.5452,
"step": 428
},
{
"epoch": 0.26,
"grad_norm": 0.26497439164374026,
"learning_rate": 0.0001923429225645904,
"loss": 0.5666,
"step": 429
},
{
"epoch": 0.26,
"grad_norm": 0.21859970576834997,
"learning_rate": 0.00019230680621231425,
"loss": 0.5069,
"step": 430
},
{
"epoch": 0.26,
"grad_norm": 0.20509380886351694,
"learning_rate": 0.0001922706082919004,
"loss": 0.4573,
"step": 431
},
{
"epoch": 0.26,
"grad_norm": 0.2182328366507935,
"learning_rate": 0.0001922343288353356,
"loss": 0.6133,
"step": 432
},
{
"epoch": 0.26,
"grad_norm": 0.2822350271273954,
"learning_rate": 0.00019219796787467867,
"loss": 0.5709,
"step": 433
},
{
"epoch": 0.26,
"grad_norm": 0.24487543268473794,
"learning_rate": 0.00019216152544206049,
"loss": 0.546,
"step": 434
},
{
"epoch": 0.26,
"grad_norm": 0.24221176090281485,
"learning_rate": 0.00019212500156968383,
"loss": 0.5507,
"step": 435
},
{
"epoch": 0.26,
"grad_norm": 0.22053929296251015,
"learning_rate": 0.00019208839628982358,
"loss": 0.5473,
"step": 436
},
{
"epoch": 0.26,
"grad_norm": 0.22975415570737245,
"learning_rate": 0.00019205170963482643,
"loss": 0.5181,
"step": 437
},
{
"epoch": 0.26,
"grad_norm": 0.22969105575505203,
"learning_rate": 0.00019201494163711104,
"loss": 0.5463,
"step": 438
},
{
"epoch": 0.26,
"grad_norm": 0.23764087103158363,
"learning_rate": 0.00019197809232916795,
"loss": 0.55,
"step": 439
},
{
"epoch": 0.26,
"grad_norm": 0.21997498488474826,
"learning_rate": 0.00019194116174355954,
"loss": 0.5421,
"step": 440
},
{
"epoch": 0.26,
"grad_norm": 0.22225824990596896,
"learning_rate": 0.00019190414991291998,
"loss": 0.5439,
"step": 441
},
{
"epoch": 0.26,
"grad_norm": 0.243391488050543,
"learning_rate": 0.00019186705686995533,
"loss": 0.6289,
"step": 442
},
{
"epoch": 0.26,
"grad_norm": 0.222494273038652,
"learning_rate": 0.0001918298826474433,
"loss": 0.5088,
"step": 443
},
{
"epoch": 0.26,
"grad_norm": 0.22114450997419682,
"learning_rate": 0.0001917926272782334,
"loss": 0.5624,
"step": 444
},
{
"epoch": 0.27,
"grad_norm": 0.21964760504534894,
"learning_rate": 0.00019175529079524687,
"loss": 0.5289,
"step": 445
},
{
"epoch": 0.27,
"grad_norm": 0.3042847973140014,
"learning_rate": 0.00019171787323147654,
"loss": 0.5328,
"step": 446
},
{
"epoch": 0.27,
"grad_norm": 0.22425571202210934,
"learning_rate": 0.00019168037461998695,
"loss": 0.5699,
"step": 447
},
{
"epoch": 0.27,
"grad_norm": 0.23406959191320909,
"learning_rate": 0.00019164279499391427,
"loss": 0.5147,
"step": 448
},
{
"epoch": 0.27,
"grad_norm": 0.3604500123158513,
"learning_rate": 0.00019160513438646617,
"loss": 0.5697,
"step": 449
},
{
"epoch": 0.27,
"grad_norm": 0.2501436029131694,
"learning_rate": 0.00019156739283092205,
"loss": 0.6015,
"step": 450
},
{
"epoch": 0.27,
"grad_norm": 0.21928141490521824,
"learning_rate": 0.00019152957036063265,
"loss": 0.5111,
"step": 451
},
{
"epoch": 0.27,
"grad_norm": 0.257908225365161,
"learning_rate": 0.00019149166700902032,
"loss": 0.5132,
"step": 452
},
{
"epoch": 0.27,
"grad_norm": 0.2713678867101362,
"learning_rate": 0.0001914536828095789,
"loss": 0.5995,
"step": 453
},
{
"epoch": 0.27,
"grad_norm": 0.2398794022246256,
"learning_rate": 0.0001914156177958736,
"loss": 0.4993,
"step": 454
},
{
"epoch": 0.27,
"grad_norm": 0.2373981477389832,
"learning_rate": 0.0001913774720015411,
"loss": 0.5064,
"step": 455
},
{
"epoch": 0.27,
"grad_norm": 0.2188011093608266,
"learning_rate": 0.00019133924546028942,
"loss": 0.5606,
"step": 456
},
{
"epoch": 0.27,
"grad_norm": 0.24077263566935142,
"learning_rate": 0.00019130093820589791,
"loss": 0.5606,
"step": 457
},
{
"epoch": 0.27,
"grad_norm": 0.23519919814487683,
"learning_rate": 0.00019126255027221735,
"loss": 0.5307,
"step": 458
},
{
"epoch": 0.27,
"grad_norm": 0.21480730775028578,
"learning_rate": 0.00019122408169316976,
"loss": 0.526,
"step": 459
},
{
"epoch": 0.27,
"grad_norm": 0.2161668548042441,
"learning_rate": 0.00019118553250274832,
"loss": 0.5657,
"step": 460
},
{
"epoch": 0.28,
"grad_norm": 0.22318400428439122,
"learning_rate": 0.00019114690273501765,
"loss": 0.513,
"step": 461
},
{
"epoch": 0.28,
"grad_norm": 0.22252447744680176,
"learning_rate": 0.00019110819242411337,
"loss": 0.5247,
"step": 462
},
{
"epoch": 0.28,
"grad_norm": 0.21358818358042153,
"learning_rate": 0.00019106940160424244,
"loss": 0.556,
"step": 463
},
{
"epoch": 0.28,
"grad_norm": 0.2121229259271081,
"learning_rate": 0.0001910305303096828,
"loss": 0.5138,
"step": 464
},
{
"epoch": 0.28,
"grad_norm": 0.22636146624511622,
"learning_rate": 0.0001909915785747836,
"loss": 0.5111,
"step": 465
},
{
"epoch": 0.28,
"grad_norm": 0.20571954917028099,
"learning_rate": 0.00019095254643396512,
"loss": 0.5125,
"step": 466
},
{
"epoch": 0.28,
"grad_norm": 0.21968966730793454,
"learning_rate": 0.0001909134339217186,
"loss": 0.5358,
"step": 467
},
{
"epoch": 0.28,
"grad_norm": 0.21910723327372644,
"learning_rate": 0.00019087424107260627,
"loss": 0.5382,
"step": 468
},
{
"epoch": 0.28,
"grad_norm": 0.2153841373499183,
"learning_rate": 0.00019083496792126153,
"loss": 0.5375,
"step": 469
},
{
"epoch": 0.28,
"grad_norm": 0.23479205084160673,
"learning_rate": 0.00019079561450238854,
"loss": 0.5984,
"step": 470
},
{
"epoch": 0.28,
"grad_norm": 0.21595571362737268,
"learning_rate": 0.00019075618085076247,
"loss": 0.5417,
"step": 471
},
{
"epoch": 0.28,
"grad_norm": 0.24550770571804625,
"learning_rate": 0.00019071666700122946,
"loss": 0.5306,
"step": 472
},
{
"epoch": 0.28,
"grad_norm": 0.21802243564456578,
"learning_rate": 0.00019067707298870638,
"loss": 0.5157,
"step": 473
},
{
"epoch": 0.28,
"grad_norm": 0.2068796190094572,
"learning_rate": 0.00019063739884818103,
"loss": 0.5254,
"step": 474
},
{
"epoch": 0.28,
"grad_norm": 0.24034732867281272,
"learning_rate": 0.000190597644614712,
"loss": 0.6204,
"step": 475
},
{
"epoch": 0.28,
"grad_norm": 0.2260836607650634,
"learning_rate": 0.00019055781032342864,
"loss": 0.5492,
"step": 476
},
{
"epoch": 0.28,
"grad_norm": 0.2476351525598878,
"learning_rate": 0.00019051789600953102,
"loss": 0.5157,
"step": 477
},
{
"epoch": 0.29,
"grad_norm": 0.2280151093681579,
"learning_rate": 0.00019047790170829003,
"loss": 0.4984,
"step": 478
},
{
"epoch": 0.29,
"grad_norm": 0.2217333524292061,
"learning_rate": 0.00019043782745504711,
"loss": 0.5149,
"step": 479
},
{
"epoch": 0.29,
"grad_norm": 0.2356369467654302,
"learning_rate": 0.00019039767328521442,
"loss": 0.5724,
"step": 480
},
{
"epoch": 0.29,
"grad_norm": 0.21541809863677616,
"learning_rate": 0.0001903574392342747,
"loss": 0.5138,
"step": 481
},
{
"epoch": 0.29,
"grad_norm": 0.21722431891543054,
"learning_rate": 0.00019031712533778137,
"loss": 0.5536,
"step": 482
},
{
"epoch": 0.29,
"grad_norm": 0.2370708268417489,
"learning_rate": 0.00019027673163135827,
"loss": 0.5038,
"step": 483
},
{
"epoch": 0.29,
"grad_norm": 0.22809310323516838,
"learning_rate": 0.00019023625815069989,
"loss": 0.5713,
"step": 484
},
{
"epoch": 0.29,
"grad_norm": 0.22374988575329294,
"learning_rate": 0.00019019570493157114,
"loss": 0.5549,
"step": 485
},
{
"epoch": 0.29,
"grad_norm": 0.20510711707245072,
"learning_rate": 0.0001901550720098074,
"loss": 0.46,
"step": 486
},
{
"epoch": 0.29,
"grad_norm": 0.2621551195786783,
"learning_rate": 0.00019011435942131448,
"loss": 0.5546,
"step": 487
},
{
"epoch": 0.29,
"grad_norm": 0.20503054358781417,
"learning_rate": 0.00019007356720206865,
"loss": 0.5547,
"step": 488
},
{
"epoch": 0.29,
"grad_norm": 0.23586140447856616,
"learning_rate": 0.00019003269538811647,
"loss": 0.6075,
"step": 489
},
{
"epoch": 0.29,
"grad_norm": 0.2828040872125889,
"learning_rate": 0.00018999174401557488,
"loss": 0.602,
"step": 490
},
{
"epoch": 0.29,
"grad_norm": 0.2023429982220119,
"learning_rate": 0.00018995071312063105,
"loss": 0.4975,
"step": 491
},
{
"epoch": 0.29,
"grad_norm": 0.2054777673202953,
"learning_rate": 0.00018990960273954254,
"loss": 0.5295,
"step": 492
},
{
"epoch": 0.29,
"grad_norm": 0.1982185225446849,
"learning_rate": 0.00018986841290863704,
"loss": 0.5461,
"step": 493
},
{
"epoch": 0.29,
"grad_norm": 0.23248022218099268,
"learning_rate": 0.0001898271436643125,
"loss": 0.5924,
"step": 494
},
{
"epoch": 0.3,
"grad_norm": 0.2235279893303581,
"learning_rate": 0.00018978579504303706,
"loss": 0.5598,
"step": 495
},
{
"epoch": 0.3,
"grad_norm": 0.21675084465821123,
"learning_rate": 0.000189744367081349,
"loss": 0.5012,
"step": 496
},
{
"epoch": 0.3,
"grad_norm": 0.2041881848681654,
"learning_rate": 0.00018970285981585662,
"loss": 0.526,
"step": 497
},
{
"epoch": 0.3,
"grad_norm": 0.23258761727278376,
"learning_rate": 0.00018966127328323842,
"loss": 0.553,
"step": 498
},
{
"epoch": 0.3,
"grad_norm": 0.23066266735191,
"learning_rate": 0.00018961960752024288,
"loss": 0.5506,
"step": 499
},
{
"epoch": 0.3,
"grad_norm": 0.20634958879584178,
"learning_rate": 0.0001895778625636885,
"loss": 0.5006,
"step": 500
},
{
"epoch": 0.3,
"grad_norm": 0.21082421656186934,
"learning_rate": 0.00018953603845046378,
"loss": 0.5279,
"step": 501
},
{
"epoch": 0.3,
"grad_norm": 0.2057560041730304,
"learning_rate": 0.00018949413521752713,
"loss": 0.5598,
"step": 502
},
{
"epoch": 0.3,
"grad_norm": 0.2096114347066206,
"learning_rate": 0.00018945215290190693,
"loss": 0.5113,
"step": 503
},
{
"epoch": 0.3,
"grad_norm": 0.23218477255443984,
"learning_rate": 0.00018941009154070136,
"loss": 0.5169,
"step": 504
},
{
"epoch": 0.3,
"grad_norm": 0.20857717653678057,
"learning_rate": 0.00018936795117107855,
"loss": 0.5149,
"step": 505
},
{
"epoch": 0.3,
"grad_norm": 0.24006448825761761,
"learning_rate": 0.0001893257318302764,
"loss": 0.5228,
"step": 506
},
{
"epoch": 0.3,
"grad_norm": 0.2146671098435255,
"learning_rate": 0.00018928343355560258,
"loss": 0.5257,
"step": 507
},
{
"epoch": 0.3,
"grad_norm": 0.20608859556559073,
"learning_rate": 0.00018924105638443452,
"loss": 0.527,
"step": 508
},
{
"epoch": 0.3,
"grad_norm": 0.2336814919363686,
"learning_rate": 0.0001891986003542194,
"loss": 0.5461,
"step": 509
},
{
"epoch": 0.3,
"grad_norm": 0.2409130946928026,
"learning_rate": 0.00018915606550247397,
"loss": 0.5493,
"step": 510
},
{
"epoch": 0.3,
"grad_norm": 0.21371348825911873,
"learning_rate": 0.0001891134518667848,
"loss": 0.572,
"step": 511
},
{
"epoch": 0.31,
"grad_norm": 0.2014364828041311,
"learning_rate": 0.000189070759484808,
"loss": 0.5109,
"step": 512
},
{
"epoch": 0.31,
"grad_norm": 0.2290945612613713,
"learning_rate": 0.0001890279883942692,
"loss": 0.5493,
"step": 513
},
{
"epoch": 0.31,
"grad_norm": 0.22127732127756986,
"learning_rate": 0.0001889851386329637,
"loss": 0.5387,
"step": 514
},
{
"epoch": 0.31,
"grad_norm": 0.20564079598559082,
"learning_rate": 0.00018894221023875622,
"loss": 0.5192,
"step": 515
},
{
"epoch": 0.31,
"grad_norm": 0.213993086214796,
"learning_rate": 0.00018889920324958106,
"loss": 0.5044,
"step": 516
},
{
"epoch": 0.31,
"grad_norm": 0.21506249939577854,
"learning_rate": 0.00018885611770344185,
"loss": 0.4969,
"step": 517
},
{
"epoch": 0.31,
"grad_norm": 0.22792164808811663,
"learning_rate": 0.00018881295363841174,
"loss": 0.5564,
"step": 518
},
{
"epoch": 0.31,
"grad_norm": 0.1978731923118128,
"learning_rate": 0.00018876971109263324,
"loss": 0.4898,
"step": 519
},
{
"epoch": 0.31,
"grad_norm": 0.22394451521984352,
"learning_rate": 0.00018872639010431822,
"loss": 0.5586,
"step": 520
},
{
"epoch": 0.31,
"grad_norm": 0.20009625678598073,
"learning_rate": 0.0001886829907117478,
"loss": 0.5399,
"step": 521
},
{
"epoch": 0.31,
"grad_norm": 0.20448355507434923,
"learning_rate": 0.00018863951295327244,
"loss": 0.5263,
"step": 522
},
{
"epoch": 0.31,
"grad_norm": 0.1967777231547204,
"learning_rate": 0.00018859595686731187,
"loss": 0.4904,
"step": 523
},
{
"epoch": 0.31,
"grad_norm": 0.2052388343929957,
"learning_rate": 0.00018855232249235498,
"loss": 0.4951,
"step": 524
},
{
"epoch": 0.31,
"grad_norm": 0.1970956590240829,
"learning_rate": 0.00018850860986695985,
"loss": 0.5112,
"step": 525
},
{
"epoch": 0.31,
"grad_norm": 0.2102143499682878,
"learning_rate": 0.00018846481902975377,
"loss": 0.5234,
"step": 526
},
{
"epoch": 0.31,
"grad_norm": 0.23384214794287286,
"learning_rate": 0.00018842095001943306,
"loss": 0.5387,
"step": 527
},
{
"epoch": 0.32,
"grad_norm": 0.20133953340775343,
"learning_rate": 0.00018837700287476316,
"loss": 0.4995,
"step": 528
},
{
"epoch": 0.32,
"grad_norm": 0.2238467486071384,
"learning_rate": 0.00018833297763457858,
"loss": 0.5709,
"step": 529
},
{
"epoch": 0.32,
"grad_norm": 0.26170161234282546,
"learning_rate": 0.00018828887433778278,
"loss": 0.6314,
"step": 530
},
{
"epoch": 0.32,
"grad_norm": 0.2317819906199683,
"learning_rate": 0.00018824469302334822,
"loss": 0.5333,
"step": 531
},
{
"epoch": 0.32,
"grad_norm": 0.21538390925414544,
"learning_rate": 0.0001882004337303163,
"loss": 0.5603,
"step": 532
},
{
"epoch": 0.32,
"grad_norm": 0.23053571801246284,
"learning_rate": 0.0001881560964977974,
"loss": 0.593,
"step": 533
},
{
"epoch": 0.32,
"grad_norm": 0.21173642276584706,
"learning_rate": 0.0001881116813649706,
"loss": 0.5539,
"step": 534
},
{
"epoch": 0.32,
"grad_norm": 0.24587290888576793,
"learning_rate": 0.00018806718837108402,
"loss": 0.5408,
"step": 535
},
{
"epoch": 0.32,
"grad_norm": 0.22324082101473863,
"learning_rate": 0.00018802261755545443,
"loss": 0.5857,
"step": 536
},
{
"epoch": 0.32,
"grad_norm": 0.21827653101692504,
"learning_rate": 0.0001879779689574674,
"loss": 0.5451,
"step": 537
},
{
"epoch": 0.32,
"grad_norm": 0.2146222856243753,
"learning_rate": 0.00018793324261657737,
"loss": 0.5007,
"step": 538
},
{
"epoch": 0.32,
"grad_norm": 0.20994383183759666,
"learning_rate": 0.00018788843857230726,
"loss": 0.5039,
"step": 539
},
{
"epoch": 0.32,
"grad_norm": 0.23384168426304514,
"learning_rate": 0.00018784355686424876,
"loss": 0.5329,
"step": 540
},
{
"epoch": 0.32,
"grad_norm": 0.20284382518697272,
"learning_rate": 0.00018779859753206225,
"loss": 0.5383,
"step": 541
},
{
"epoch": 0.32,
"grad_norm": 0.22307014132513725,
"learning_rate": 0.00018775356061547662,
"loss": 0.5766,
"step": 542
},
{
"epoch": 0.32,
"grad_norm": 0.21675879523474215,
"learning_rate": 0.00018770844615428932,
"loss": 0.4994,
"step": 543
},
{
"epoch": 0.32,
"grad_norm": 0.2200785983728407,
"learning_rate": 0.00018766325418836637,
"loss": 0.5615,
"step": 544
},
{
"epoch": 0.33,
"grad_norm": 0.20895654400479502,
"learning_rate": 0.00018761798475764224,
"loss": 0.4993,
"step": 545
},
{
"epoch": 0.33,
"grad_norm": 0.22152937631276676,
"learning_rate": 0.00018757263790211988,
"loss": 0.5275,
"step": 546
},
{
"epoch": 0.33,
"grad_norm": 0.209333487906431,
"learning_rate": 0.0001875272136618706,
"loss": 0.4911,
"step": 547
},
{
"epoch": 0.33,
"grad_norm": 0.2123519912763275,
"learning_rate": 0.00018748171207703417,
"loss": 0.5662,
"step": 548
},
{
"epoch": 0.33,
"grad_norm": 0.2147346642469028,
"learning_rate": 0.00018743613318781868,
"loss": 0.5651,
"step": 549
},
{
"epoch": 0.33,
"grad_norm": 0.2017789732342509,
"learning_rate": 0.00018739047703450048,
"loss": 0.5573,
"step": 550
},
{
"epoch": 0.33,
"grad_norm": 0.2084087089737107,
"learning_rate": 0.00018734474365742428,
"loss": 0.562,
"step": 551
},
{
"epoch": 0.33,
"grad_norm": 0.22130968599178,
"learning_rate": 0.00018729893309700295,
"loss": 0.5729,
"step": 552
},
{
"epoch": 0.33,
"grad_norm": 0.22736172090948445,
"learning_rate": 0.0001872530453937176,
"loss": 0.5548,
"step": 553
},
{
"epoch": 0.33,
"grad_norm": 0.21738577850339916,
"learning_rate": 0.0001872070805881176,
"loss": 0.5191,
"step": 554
},
{
"epoch": 0.33,
"grad_norm": 0.20994273135857797,
"learning_rate": 0.00018716103872082026,
"loss": 0.5153,
"step": 555
},
{
"epoch": 0.33,
"grad_norm": 0.25944295362906805,
"learning_rate": 0.00018711491983251113,
"loss": 0.5471,
"step": 556
},
{
"epoch": 0.33,
"grad_norm": 0.2138519097360962,
"learning_rate": 0.00018706872396394376,
"loss": 0.4875,
"step": 557
},
{
"epoch": 0.33,
"grad_norm": 0.23586915663527888,
"learning_rate": 0.00018702245115593974,
"loss": 0.5224,
"step": 558
},
{
"epoch": 0.33,
"grad_norm": 0.20477148046499385,
"learning_rate": 0.0001869761014493887,
"loss": 0.5466,
"step": 559
},
{
"epoch": 0.33,
"grad_norm": 0.21783175505387284,
"learning_rate": 0.00018692967488524812,
"loss": 0.5557,
"step": 560
},
{
"epoch": 0.33,
"grad_norm": 0.20442177589984145,
"learning_rate": 0.0001868831715045435,
"loss": 0.507,
"step": 561
},
{
"epoch": 0.34,
"grad_norm": 0.21291324212369495,
"learning_rate": 0.00018683659134836813,
"loss": 0.5779,
"step": 562
},
{
"epoch": 0.34,
"grad_norm": 0.22670486875141618,
"learning_rate": 0.00018678993445788323,
"loss": 0.5831,
"step": 563
},
{
"epoch": 0.34,
"grad_norm": 0.2431493116309222,
"learning_rate": 0.00018674320087431768,
"loss": 0.5389,
"step": 564
},
{
"epoch": 0.34,
"grad_norm": 0.22102091260855142,
"learning_rate": 0.00018669639063896836,
"loss": 0.5569,
"step": 565
},
{
"epoch": 0.34,
"grad_norm": 0.20001951850669827,
"learning_rate": 0.0001866495037931997,
"loss": 0.486,
"step": 566
},
{
"epoch": 0.34,
"grad_norm": 0.22781103196427857,
"learning_rate": 0.00018660254037844388,
"loss": 0.4973,
"step": 567
},
{
"epoch": 0.34,
"grad_norm": 0.21129685691062433,
"learning_rate": 0.00018655550043620073,
"loss": 0.5459,
"step": 568
},
{
"epoch": 0.34,
"grad_norm": 0.20363805081315986,
"learning_rate": 0.0001865083840080378,
"loss": 0.4997,
"step": 569
},
{
"epoch": 0.34,
"grad_norm": 0.22269838654252982,
"learning_rate": 0.00018646119113559006,
"loss": 0.5406,
"step": 570
},
{
"epoch": 0.34,
"grad_norm": 0.20307002281681275,
"learning_rate": 0.00018641392186056016,
"loss": 0.4861,
"step": 571
},
{
"epoch": 0.34,
"grad_norm": 0.20146261628709675,
"learning_rate": 0.0001863665762247182,
"loss": 0.561,
"step": 572
},
{
"epoch": 0.34,
"grad_norm": 0.21049257054009352,
"learning_rate": 0.00018631915426990184,
"loss": 0.5257,
"step": 573
},
{
"epoch": 0.34,
"grad_norm": 0.2245482792823418,
"learning_rate": 0.00018627165603801605,
"loss": 0.5441,
"step": 574
},
{
"epoch": 0.34,
"grad_norm": 0.2106578436256788,
"learning_rate": 0.0001862240815710333,
"loss": 0.5125,
"step": 575
},
{
"epoch": 0.34,
"grad_norm": 0.2091435884054145,
"learning_rate": 0.0001861764309109934,
"loss": 0.523,
"step": 576
},
{
"epoch": 0.34,
"grad_norm": 0.21256854318600532,
"learning_rate": 0.00018612870410000354,
"loss": 0.4851,
"step": 577
},
{
"epoch": 0.34,
"grad_norm": 0.24387962798982954,
"learning_rate": 0.00018608090118023808,
"loss": 0.5423,
"step": 578
},
{
"epoch": 0.35,
"grad_norm": 0.2357478920855788,
"learning_rate": 0.00018603302219393874,
"loss": 0.5386,
"step": 579
},
{
"epoch": 0.35,
"grad_norm": 0.21267780857117077,
"learning_rate": 0.0001859850671834144,
"loss": 0.5545,
"step": 580
},
{
"epoch": 0.35,
"grad_norm": 0.25049614581715324,
"learning_rate": 0.0001859370361910412,
"loss": 0.5241,
"step": 581
},
{
"epoch": 0.35,
"grad_norm": 0.1937807494598699,
"learning_rate": 0.00018588892925926228,
"loss": 0.5533,
"step": 582
},
{
"epoch": 0.35,
"grad_norm": 0.21209972240968475,
"learning_rate": 0.00018584074643058807,
"loss": 0.538,
"step": 583
},
{
"epoch": 0.35,
"grad_norm": 0.22281277082523665,
"learning_rate": 0.00018579248774759586,
"loss": 0.5456,
"step": 584
},
{
"epoch": 0.35,
"grad_norm": 0.22156542955128883,
"learning_rate": 0.00018574415325293018,
"loss": 0.5622,
"step": 585
},
{
"epoch": 0.35,
"grad_norm": 0.20068342929250654,
"learning_rate": 0.00018569574298930237,
"loss": 0.5372,
"step": 586
},
{
"epoch": 0.35,
"grad_norm": 0.21693418845369525,
"learning_rate": 0.00018564725699949083,
"loss": 0.4874,
"step": 587
},
{
"epoch": 0.35,
"grad_norm": 0.2060622909003744,
"learning_rate": 0.0001855986953263409,
"loss": 0.5331,
"step": 588
},
{
"epoch": 0.35,
"grad_norm": 0.20007419545283933,
"learning_rate": 0.00018555005801276463,
"loss": 0.5131,
"step": 589
},
{
"epoch": 0.35,
"grad_norm": 0.21905328017125653,
"learning_rate": 0.00018550134510174115,
"loss": 0.5572,
"step": 590
},
{
"epoch": 0.35,
"grad_norm": 0.21213287568506015,
"learning_rate": 0.0001854525566363162,
"loss": 0.5359,
"step": 591
},
{
"epoch": 0.35,
"grad_norm": 0.20066093050756748,
"learning_rate": 0.00018540369265960242,
"loss": 0.5334,
"step": 592
},
{
"epoch": 0.35,
"grad_norm": 0.2068811720002483,
"learning_rate": 0.00018535475321477906,
"loss": 0.5558,
"step": 593
},
{
"epoch": 0.35,
"grad_norm": 0.2025287668887073,
"learning_rate": 0.00018530573834509215,
"loss": 0.5098,
"step": 594
},
{
"epoch": 0.36,
"grad_norm": 0.20807380346718593,
"learning_rate": 0.0001852566480938543,
"loss": 0.5211,
"step": 595
},
{
"epoch": 0.36,
"grad_norm": 0.2049943719782544,
"learning_rate": 0.00018520748250444474,
"loss": 0.5379,
"step": 596
},
{
"epoch": 0.36,
"grad_norm": 0.8558508208219735,
"learning_rate": 0.00018515824162030934,
"loss": 0.5403,
"step": 597
},
{
"epoch": 0.36,
"grad_norm": 0.25414317775682305,
"learning_rate": 0.00018510892548496047,
"loss": 0.5804,
"step": 598
},
{
"epoch": 0.36,
"grad_norm": 0.20806597400748386,
"learning_rate": 0.00018505953414197696,
"loss": 0.5419,
"step": 599
},
{
"epoch": 0.36,
"grad_norm": 0.1950528976937739,
"learning_rate": 0.00018501006763500414,
"loss": 0.4956,
"step": 600
},
{
"epoch": 0.36,
"grad_norm": 0.20652545713558523,
"learning_rate": 0.00018496052600775376,
"loss": 0.4942,
"step": 601
},
{
"epoch": 0.36,
"grad_norm": 0.20955886781649663,
"learning_rate": 0.0001849109093040039,
"loss": 0.5177,
"step": 602
},
{
"epoch": 0.36,
"grad_norm": 0.21093362015684414,
"learning_rate": 0.00018486121756759906,
"loss": 0.5672,
"step": 603
},
{
"epoch": 0.36,
"grad_norm": 0.22033088091533184,
"learning_rate": 0.00018481145084245002,
"loss": 0.5691,
"step": 604
},
{
"epoch": 0.36,
"grad_norm": 0.20322111965044637,
"learning_rate": 0.00018476160917253373,
"loss": 0.5425,
"step": 605
},
{
"epoch": 0.36,
"grad_norm": 0.2028788101278272,
"learning_rate": 0.0001847116926018935,
"loss": 0.5176,
"step": 606
},
{
"epoch": 0.36,
"grad_norm": 0.19551140156538951,
"learning_rate": 0.0001846617011746388,
"loss": 0.5115,
"step": 607
},
{
"epoch": 0.36,
"grad_norm": 0.21944694996534547,
"learning_rate": 0.00018461163493494517,
"loss": 0.5496,
"step": 608
},
{
"epoch": 0.36,
"grad_norm": 0.21506814147924705,
"learning_rate": 0.0001845614939270543,
"loss": 0.5823,
"step": 609
},
{
"epoch": 0.36,
"grad_norm": 0.2220938137588105,
"learning_rate": 0.00018451127819527402,
"loss": 0.5731,
"step": 610
},
{
"epoch": 0.36,
"grad_norm": 0.21590208362786933,
"learning_rate": 0.00018446098778397807,
"loss": 0.6063,
"step": 611
},
{
"epoch": 0.37,
"grad_norm": 0.20084594317065918,
"learning_rate": 0.00018441062273760628,
"loss": 0.5286,
"step": 612
},
{
"epoch": 0.37,
"grad_norm": 0.21847304705653886,
"learning_rate": 0.00018436018310066435,
"loss": 0.5721,
"step": 613
},
{
"epoch": 0.37,
"grad_norm": 0.2467936487351411,
"learning_rate": 0.000184309668917724,
"loss": 0.571,
"step": 614
},
{
"epoch": 0.37,
"grad_norm": 0.21666156526926003,
"learning_rate": 0.0001842590802334227,
"loss": 0.5244,
"step": 615
},
{
"epoch": 0.37,
"grad_norm": 0.21336859433357677,
"learning_rate": 0.00018420841709246383,
"loss": 0.5724,
"step": 616
},
{
"epoch": 0.37,
"grad_norm": 0.1933070755110986,
"learning_rate": 0.0001841576795396166,
"loss": 0.5347,
"step": 617
},
{
"epoch": 0.37,
"grad_norm": 0.2332186369470874,
"learning_rate": 0.00018410686761971586,
"loss": 0.5474,
"step": 618
},
{
"epoch": 0.37,
"grad_norm": 0.1996293438855639,
"learning_rate": 0.00018405598137766224,
"loss": 0.5421,
"step": 619
},
{
"epoch": 0.37,
"grad_norm": 0.2012759807756364,
"learning_rate": 0.00018400502085842208,
"loss": 0.519,
"step": 620
},
{
"epoch": 0.37,
"grad_norm": 0.24355300568180752,
"learning_rate": 0.00018395398610702733,
"loss": 0.597,
"step": 621
},
{
"epoch": 0.37,
"grad_norm": 0.2136711983483761,
"learning_rate": 0.00018390287716857546,
"loss": 0.5398,
"step": 622
},
{
"epoch": 0.37,
"grad_norm": 0.22275088525970024,
"learning_rate": 0.00018385169408822964,
"loss": 0.5597,
"step": 623
},
{
"epoch": 0.37,
"grad_norm": 0.20011931485707388,
"learning_rate": 0.0001838004369112184,
"loss": 0.4901,
"step": 624
},
{
"epoch": 0.37,
"grad_norm": 0.19544716159187206,
"learning_rate": 0.00018374910568283594,
"loss": 0.4726,
"step": 625
},
{
"epoch": 0.37,
"grad_norm": 0.2176067620544374,
"learning_rate": 0.00018369770044844168,
"loss": 0.5369,
"step": 626
},
{
"epoch": 0.37,
"grad_norm": 0.2005629047810257,
"learning_rate": 0.00018364622125346055,
"loss": 0.4914,
"step": 627
},
{
"epoch": 0.37,
"grad_norm": 0.21497281608823432,
"learning_rate": 0.0001835946681433829,
"loss": 0.5559,
"step": 628
},
{
"epoch": 0.38,
"grad_norm": 0.20354723273049724,
"learning_rate": 0.00018354304116376425,
"loss": 0.5083,
"step": 629
},
{
"epoch": 0.38,
"grad_norm": 0.23536026550959782,
"learning_rate": 0.0001834913403602255,
"loss": 0.5449,
"step": 630
},
{
"epoch": 0.38,
"grad_norm": 0.20887211237530257,
"learning_rate": 0.00018343956577845276,
"loss": 0.5131,
"step": 631
},
{
"epoch": 0.38,
"grad_norm": 0.21728763678777088,
"learning_rate": 0.00018338771746419726,
"loss": 0.5484,
"step": 632
},
{
"epoch": 0.38,
"grad_norm": 0.21910570476522437,
"learning_rate": 0.00018333579546327556,
"loss": 0.5452,
"step": 633
},
{
"epoch": 0.38,
"grad_norm": 0.21247350127543838,
"learning_rate": 0.00018328379982156915,
"loss": 0.5232,
"step": 634
},
{
"epoch": 0.38,
"grad_norm": 0.21706686115456897,
"learning_rate": 0.00018323173058502472,
"loss": 0.5353,
"step": 635
},
{
"epoch": 0.38,
"grad_norm": 0.19529494853666482,
"learning_rate": 0.00018317958779965387,
"loss": 0.4611,
"step": 636
},
{
"epoch": 0.38,
"grad_norm": 0.2194890381897013,
"learning_rate": 0.00018312737151153334,
"loss": 0.4884,
"step": 637
},
{
"epoch": 0.38,
"grad_norm": 0.24336065627870296,
"learning_rate": 0.00018307508176680472,
"loss": 0.5708,
"step": 638
},
{
"epoch": 0.38,
"grad_norm": 0.22638828434923797,
"learning_rate": 0.00018302271861167456,
"loss": 0.5795,
"step": 639
},
{
"epoch": 0.38,
"grad_norm": 0.20501380703607638,
"learning_rate": 0.0001829702820924142,
"loss": 0.5645,
"step": 640
},
{
"epoch": 0.38,
"grad_norm": 0.22705979847255006,
"learning_rate": 0.00018291777225535994,
"loss": 0.4974,
"step": 641
},
{
"epoch": 0.38,
"grad_norm": 0.22629645320684777,
"learning_rate": 0.00018286518914691272,
"loss": 0.5587,
"step": 642
},
{
"epoch": 0.38,
"grad_norm": 0.21772563640763765,
"learning_rate": 0.00018281253281353838,
"loss": 0.5219,
"step": 643
},
{
"epoch": 0.38,
"grad_norm": 0.20447194133414195,
"learning_rate": 0.00018275980330176737,
"loss": 0.5425,
"step": 644
},
{
"epoch": 0.38,
"grad_norm": 0.24126870503035064,
"learning_rate": 0.00018270700065819477,
"loss": 0.5119,
"step": 645
},
{
"epoch": 0.39,
"grad_norm": 0.23269297218381896,
"learning_rate": 0.00018265412492948042,
"loss": 0.5507,
"step": 646
},
{
"epoch": 0.39,
"grad_norm": 0.23416570398912578,
"learning_rate": 0.0001826011761623486,
"loss": 0.5947,
"step": 647
},
{
"epoch": 0.39,
"grad_norm": 0.2186560086983282,
"learning_rate": 0.0001825481544035882,
"loss": 0.5204,
"step": 648
},
{
"epoch": 0.39,
"grad_norm": 0.20624707271501935,
"learning_rate": 0.00018249505970005262,
"loss": 0.4785,
"step": 649
},
{
"epoch": 0.39,
"grad_norm": 0.23418189558532218,
"learning_rate": 0.00018244189209865974,
"loss": 0.4976,
"step": 650
},
{
"epoch": 0.39,
"grad_norm": 0.21372290734059424,
"learning_rate": 0.00018238865164639173,
"loss": 0.5237,
"step": 651
},
{
"epoch": 0.39,
"grad_norm": 0.1986689651795865,
"learning_rate": 0.0001823353383902953,
"loss": 0.5354,
"step": 652
},
{
"epoch": 0.39,
"grad_norm": 0.21154599437074698,
"learning_rate": 0.0001822819523774814,
"loss": 0.5292,
"step": 653
},
{
"epoch": 0.39,
"grad_norm": 0.21348268586605149,
"learning_rate": 0.00018222849365512523,
"loss": 0.5249,
"step": 654
},
{
"epoch": 0.39,
"grad_norm": 0.22296243039072478,
"learning_rate": 0.0001821749622704664,
"loss": 0.5458,
"step": 655
},
{
"epoch": 0.39,
"grad_norm": 0.22596567506529938,
"learning_rate": 0.00018212135827080857,
"loss": 0.5085,
"step": 656
},
{
"epoch": 0.39,
"grad_norm": 0.19012132806019622,
"learning_rate": 0.00018206768170351962,
"loss": 0.4977,
"step": 657
},
{
"epoch": 0.39,
"grad_norm": 0.2125366600531234,
"learning_rate": 0.0001820139326160316,
"loss": 0.5051,
"step": 658
},
{
"epoch": 0.39,
"grad_norm": 0.23677835317412968,
"learning_rate": 0.00018196011105584058,
"loss": 0.575,
"step": 659
},
{
"epoch": 0.39,
"grad_norm": 0.2262210065848097,
"learning_rate": 0.00018190621707050671,
"loss": 0.5744,
"step": 660
},
{
"epoch": 0.39,
"grad_norm": 0.21618545867420894,
"learning_rate": 0.0001818522507076541,
"loss": 0.5715,
"step": 661
},
{
"epoch": 0.39,
"grad_norm": 0.2050215711297079,
"learning_rate": 0.00018179821201497092,
"loss": 0.5201,
"step": 662
},
{
"epoch": 0.4,
"grad_norm": 0.20218467055707082,
"learning_rate": 0.0001817441010402091,
"loss": 0.5058,
"step": 663
},
{
"epoch": 0.4,
"grad_norm": 0.20940987275867923,
"learning_rate": 0.00018168991783118452,
"loss": 0.5095,
"step": 664
},
{
"epoch": 0.4,
"grad_norm": 0.21341822518403558,
"learning_rate": 0.00018163566243577697,
"loss": 0.5599,
"step": 665
},
{
"epoch": 0.4,
"grad_norm": 0.20028205017927186,
"learning_rate": 0.0001815813349019299,
"loss": 0.5318,
"step": 666
},
{
"epoch": 0.4,
"grad_norm": 0.20184912350066175,
"learning_rate": 0.00018152693527765057,
"loss": 0.5643,
"step": 667
},
{
"epoch": 0.4,
"grad_norm": 0.20882160405967118,
"learning_rate": 0.0001814724636110099,
"loss": 0.542,
"step": 668
},
{
"epoch": 0.4,
"grad_norm": 0.20252144356881077,
"learning_rate": 0.00018141791995014255,
"loss": 0.4496,
"step": 669
},
{
"epoch": 0.4,
"grad_norm": 0.1956328371434174,
"learning_rate": 0.00018136330434324674,
"loss": 0.56,
"step": 670
},
{
"epoch": 0.4,
"grad_norm": 0.20691128111503362,
"learning_rate": 0.00018130861683858426,
"loss": 0.5726,
"step": 671
},
{
"epoch": 0.4,
"grad_norm": 0.2258004454621585,
"learning_rate": 0.00018125385748448048,
"loss": 0.583,
"step": 672
},
{
"epoch": 0.4,
"grad_norm": 0.22330059205477634,
"learning_rate": 0.00018119902632932416,
"loss": 0.5288,
"step": 673
},
{
"epoch": 0.4,
"grad_norm": 0.20473079466150892,
"learning_rate": 0.0001811441234215677,
"loss": 0.5085,
"step": 674
},
{
"epoch": 0.4,
"grad_norm": 0.19439333859223318,
"learning_rate": 0.0001810891488097267,
"loss": 0.5147,
"step": 675
},
{
"epoch": 0.4,
"grad_norm": 0.2037181989313857,
"learning_rate": 0.00018103410254238021,
"loss": 0.5228,
"step": 676
},
{
"epoch": 0.4,
"grad_norm": 0.21580635559566858,
"learning_rate": 0.0001809789846681706,
"loss": 0.5034,
"step": 677
},
{
"epoch": 0.4,
"grad_norm": 0.21490060304667385,
"learning_rate": 0.00018092379523580357,
"loss": 0.5347,
"step": 678
},
{
"epoch": 0.41,
"grad_norm": 0.20927738857723482,
"learning_rate": 0.00018086853429404793,
"loss": 0.5039,
"step": 679
},
{
"epoch": 0.41,
"grad_norm": 0.21391199422702836,
"learning_rate": 0.00018081320189173577,
"loss": 0.5404,
"step": 680
},
{
"epoch": 0.41,
"grad_norm": 0.22355130583819918,
"learning_rate": 0.0001807577980777623,
"loss": 0.5147,
"step": 681
},
{
"epoch": 0.41,
"grad_norm": 0.21899190720848985,
"learning_rate": 0.00018070232290108584,
"loss": 0.5195,
"step": 682
},
{
"epoch": 0.41,
"grad_norm": 0.20636096645560792,
"learning_rate": 0.00018064677641072775,
"loss": 0.5158,
"step": 683
},
{
"epoch": 0.41,
"grad_norm": 0.20462410706105155,
"learning_rate": 0.00018059115865577249,
"loss": 0.5194,
"step": 684
},
{
"epoch": 0.41,
"grad_norm": 0.21978634315593423,
"learning_rate": 0.00018053546968536735,
"loss": 0.4986,
"step": 685
},
{
"epoch": 0.41,
"grad_norm": 0.2203882917140438,
"learning_rate": 0.00018047970954872264,
"loss": 0.5855,
"step": 686
},
{
"epoch": 0.41,
"grad_norm": 0.20144829000454462,
"learning_rate": 0.0001804238782951116,
"loss": 0.5212,
"step": 687
},
{
"epoch": 0.41,
"grad_norm": 0.21142991796239274,
"learning_rate": 0.00018036797597387023,
"loss": 0.495,
"step": 688
},
{
"epoch": 0.41,
"grad_norm": 0.21275432668758548,
"learning_rate": 0.00018031200263439736,
"loss": 0.5694,
"step": 689
},
{
"epoch": 0.41,
"grad_norm": 0.2035189446424034,
"learning_rate": 0.00018025595832615459,
"loss": 0.55,
"step": 690
},
{
"epoch": 0.41,
"grad_norm": 0.20030837247360464,
"learning_rate": 0.00018019984309866619,
"loss": 0.4748,
"step": 691
},
{
"epoch": 0.41,
"grad_norm": 0.20366715425572,
"learning_rate": 0.00018014365700151912,
"loss": 0.5792,
"step": 692
},
{
"epoch": 0.41,
"grad_norm": 0.2082468197583491,
"learning_rate": 0.000180087400084363,
"loss": 0.4973,
"step": 693
},
{
"epoch": 0.41,
"grad_norm": 0.21820027454676755,
"learning_rate": 0.00018003107239691004,
"loss": 0.5512,
"step": 694
},
{
"epoch": 0.41,
"grad_norm": 0.2085678250499903,
"learning_rate": 0.00017997467398893488,
"loss": 0.5148,
"step": 695
},
{
"epoch": 0.42,
"grad_norm": 0.20422653056781329,
"learning_rate": 0.00017991820491027472,
"loss": 0.4968,
"step": 696
},
{
"epoch": 0.42,
"grad_norm": 0.1875899162050169,
"learning_rate": 0.0001798616652108293,
"loss": 0.5061,
"step": 697
},
{
"epoch": 0.42,
"grad_norm": 0.20869663705218836,
"learning_rate": 0.00017980505494056062,
"loss": 0.5182,
"step": 698
},
{
"epoch": 0.42,
"grad_norm": 0.19250179476147736,
"learning_rate": 0.00017974837414949307,
"loss": 0.5184,
"step": 699
},
{
"epoch": 0.42,
"grad_norm": 0.21732108838463451,
"learning_rate": 0.00017969162288771347,
"loss": 0.5524,
"step": 700
},
{
"epoch": 0.42,
"grad_norm": 0.20200315361578813,
"learning_rate": 0.0001796348012053707,
"loss": 0.5386,
"step": 701
},
{
"epoch": 0.42,
"grad_norm": 0.20242537832049035,
"learning_rate": 0.00017957790915267615,
"loss": 0.5656,
"step": 702
},
{
"epoch": 0.42,
"grad_norm": 0.1889172192023988,
"learning_rate": 0.0001795209467799031,
"loss": 0.5115,
"step": 703
},
{
"epoch": 0.42,
"grad_norm": 0.19623435201373893,
"learning_rate": 0.0001794639141373872,
"loss": 0.497,
"step": 704
},
{
"epoch": 0.42,
"grad_norm": 0.22372809637554478,
"learning_rate": 0.00017940681127552604,
"loss": 0.5579,
"step": 705
},
{
"epoch": 0.42,
"grad_norm": 0.1968536923376666,
"learning_rate": 0.0001793496382447794,
"loss": 0.4891,
"step": 706
},
{
"epoch": 0.42,
"grad_norm": 0.1990723573146364,
"learning_rate": 0.00017929239509566894,
"loss": 0.5921,
"step": 707
},
{
"epoch": 0.42,
"grad_norm": 0.20388703819339077,
"learning_rate": 0.00017923508187877834,
"loss": 0.5414,
"step": 708
},
{
"epoch": 0.42,
"grad_norm": 0.23657852979478725,
"learning_rate": 0.00017917769864475314,
"loss": 0.5672,
"step": 709
},
{
"epoch": 0.42,
"grad_norm": 0.22888252332289927,
"learning_rate": 0.00017912024544430088,
"loss": 0.5459,
"step": 710
},
{
"epoch": 0.42,
"grad_norm": 0.19383907969249117,
"learning_rate": 0.0001790627223281908,
"loss": 0.5509,
"step": 711
},
{
"epoch": 0.42,
"grad_norm": 0.2154263629956836,
"learning_rate": 0.00017900512934725397,
"loss": 0.5629,
"step": 712
},
{
"epoch": 0.43,
"grad_norm": 0.19802419635693494,
"learning_rate": 0.0001789474665523832,
"loss": 0.5128,
"step": 713
},
{
"epoch": 0.43,
"grad_norm": 0.19783321602266912,
"learning_rate": 0.00017888973399453296,
"loss": 0.5064,
"step": 714
},
{
"epoch": 0.43,
"grad_norm": 0.19864882371614528,
"learning_rate": 0.00017883193172471944,
"loss": 0.5458,
"step": 715
},
{
"epoch": 0.43,
"grad_norm": 0.23609512585527,
"learning_rate": 0.00017877405979402038,
"loss": 0.5069,
"step": 716
},
{
"epoch": 0.43,
"grad_norm": 0.19894144678524353,
"learning_rate": 0.00017871611825357502,
"loss": 0.5812,
"step": 717
},
{
"epoch": 0.43,
"grad_norm": 0.19598819977852033,
"learning_rate": 0.00017865810715458427,
"loss": 0.5223,
"step": 718
},
{
"epoch": 0.43,
"grad_norm": 0.23274847505011953,
"learning_rate": 0.00017860002654831032,
"loss": 0.5703,
"step": 719
},
{
"epoch": 0.43,
"grad_norm": 0.19794477486450376,
"learning_rate": 0.00017854187648607694,
"loss": 0.5538,
"step": 720
},
{
"epoch": 0.43,
"grad_norm": 0.2091737019131215,
"learning_rate": 0.00017848365701926913,
"loss": 0.4962,
"step": 721
},
{
"epoch": 0.43,
"grad_norm": 0.21890749511490995,
"learning_rate": 0.00017842536819933337,
"loss": 0.5074,
"step": 722
},
{
"epoch": 0.43,
"grad_norm": 0.22746821737803938,
"learning_rate": 0.0001783670100777773,
"loss": 0.5849,
"step": 723
},
{
"epoch": 0.43,
"grad_norm": 0.20967916540656184,
"learning_rate": 0.0001783085827061699,
"loss": 0.5246,
"step": 724
},
{
"epoch": 0.43,
"grad_norm": 0.19798059353181535,
"learning_rate": 0.00017825008613614127,
"loss": 0.4667,
"step": 725
},
{
"epoch": 0.43,
"grad_norm": 0.1992664047298655,
"learning_rate": 0.00017819152041938265,
"loss": 0.5247,
"step": 726
},
{
"epoch": 0.43,
"grad_norm": 0.22025628624147217,
"learning_rate": 0.00017813288560764647,
"loss": 0.5291,
"step": 727
},
{
"epoch": 0.43,
"grad_norm": 0.20405038516624363,
"learning_rate": 0.00017807418175274612,
"loss": 0.5235,
"step": 728
},
{
"epoch": 0.43,
"grad_norm": 0.20626127985692586,
"learning_rate": 0.00017801540890655609,
"loss": 0.5103,
"step": 729
},
{
"epoch": 0.44,
"grad_norm": 0.2187527308725265,
"learning_rate": 0.00017795656712101172,
"loss": 0.5515,
"step": 730
},
{
"epoch": 0.44,
"grad_norm": 0.20386714530070776,
"learning_rate": 0.00017789765644810935,
"loss": 0.5109,
"step": 731
},
{
"epoch": 0.44,
"grad_norm": 0.1990293686392052,
"learning_rate": 0.00017783867693990624,
"loss": 0.5208,
"step": 732
},
{
"epoch": 0.44,
"grad_norm": 0.19601721442767256,
"learning_rate": 0.0001777796286485204,
"loss": 0.5318,
"step": 733
},
{
"epoch": 0.44,
"grad_norm": 0.20542580410660244,
"learning_rate": 0.0001777205116261306,
"loss": 0.5198,
"step": 734
},
{
"epoch": 0.44,
"grad_norm": 0.20998518101289002,
"learning_rate": 0.0001776613259249764,
"loss": 0.5384,
"step": 735
},
{
"epoch": 0.44,
"grad_norm": 0.20134476803418952,
"learning_rate": 0.00017760207159735805,
"loss": 0.5448,
"step": 736
},
{
"epoch": 0.44,
"grad_norm": 0.22396912180134018,
"learning_rate": 0.00017754274869563637,
"loss": 0.59,
"step": 737
},
{
"epoch": 0.44,
"grad_norm": 0.2044555533666512,
"learning_rate": 0.00017748335727223294,
"loss": 0.5152,
"step": 738
},
{
"epoch": 0.44,
"grad_norm": 0.2106748606361736,
"learning_rate": 0.00017742389737962966,
"loss": 0.5233,
"step": 739
},
{
"epoch": 0.44,
"grad_norm": 0.19348093577281505,
"learning_rate": 0.0001773643690703691,
"loss": 0.5181,
"step": 740
},
{
"epoch": 0.44,
"grad_norm": 0.20393569458294794,
"learning_rate": 0.00017730477239705428,
"loss": 0.5671,
"step": 741
},
{
"epoch": 0.44,
"grad_norm": 0.19728761757057783,
"learning_rate": 0.00017724510741234858,
"loss": 0.4919,
"step": 742
},
{
"epoch": 0.44,
"grad_norm": 0.2025575313201386,
"learning_rate": 0.0001771853741689757,
"loss": 0.5452,
"step": 743
},
{
"epoch": 0.44,
"grad_norm": 0.19153867099886435,
"learning_rate": 0.0001771255727197198,
"loss": 0.4951,
"step": 744
},
{
"epoch": 0.44,
"grad_norm": 0.2220125331576081,
"learning_rate": 0.00017706570311742516,
"loss": 0.5521,
"step": 745
},
{
"epoch": 0.45,
"grad_norm": 0.22704666961693065,
"learning_rate": 0.0001770057654149964,
"loss": 0.5184,
"step": 746
},
{
"epoch": 0.45,
"grad_norm": 0.20871880228168335,
"learning_rate": 0.00017694575966539823,
"loss": 0.5205,
"step": 747
},
{
"epoch": 0.45,
"grad_norm": 0.2105924919088961,
"learning_rate": 0.00017688568592165552,
"loss": 0.5448,
"step": 748
},
{
"epoch": 0.45,
"grad_norm": 0.19780662201378688,
"learning_rate": 0.00017682554423685329,
"loss": 0.6037,
"step": 749
},
{
"epoch": 0.45,
"grad_norm": 0.23105576261963792,
"learning_rate": 0.0001767653346641365,
"loss": 0.7225,
"step": 750
},
{
"epoch": 0.45,
"grad_norm": 0.21997563912032173,
"learning_rate": 0.00017670505725671013,
"loss": 0.552,
"step": 751
},
{
"epoch": 0.45,
"grad_norm": 0.2033859052649398,
"learning_rate": 0.00017664471206783915,
"loss": 0.5315,
"step": 752
},
{
"epoch": 0.45,
"grad_norm": 0.19979214467824102,
"learning_rate": 0.00017658429915084835,
"loss": 0.5697,
"step": 753
},
{
"epoch": 0.45,
"grad_norm": 0.20567412732571028,
"learning_rate": 0.00017652381855912247,
"loss": 0.5051,
"step": 754
},
{
"epoch": 0.45,
"grad_norm": 0.20563597140752976,
"learning_rate": 0.0001764632703461059,
"loss": 0.5141,
"step": 755
},
{
"epoch": 0.45,
"grad_norm": 0.1979658869623221,
"learning_rate": 0.00017640265456530293,
"loss": 0.5257,
"step": 756
},
{
"epoch": 0.45,
"grad_norm": 0.2241077787834463,
"learning_rate": 0.0001763419712702775,
"loss": 0.5203,
"step": 757
},
{
"epoch": 0.45,
"grad_norm": 0.2197932846972142,
"learning_rate": 0.00017628122051465322,
"loss": 0.5847,
"step": 758
},
{
"epoch": 0.45,
"grad_norm": 0.1990944255813207,
"learning_rate": 0.00017622040235211326,
"loss": 0.4962,
"step": 759
},
{
"epoch": 0.45,
"grad_norm": 0.22179309744704687,
"learning_rate": 0.00017615951683640045,
"loss": 0.5635,
"step": 760
},
{
"epoch": 0.45,
"grad_norm": 0.20505896786344424,
"learning_rate": 0.00017609856402131703,
"loss": 0.4968,
"step": 761
},
{
"epoch": 0.45,
"grad_norm": 0.21771157401053975,
"learning_rate": 0.00017603754396072483,
"loss": 0.4858,
"step": 762
},
{
"epoch": 0.46,
"grad_norm": 0.23357401076131715,
"learning_rate": 0.000175976456708545,
"loss": 0.5766,
"step": 763
},
{
"epoch": 0.46,
"grad_norm": 0.21488993425737504,
"learning_rate": 0.0001759153023187581,
"loss": 0.5419,
"step": 764
},
{
"epoch": 0.46,
"grad_norm": 0.2035555999534868,
"learning_rate": 0.00017585408084540405,
"loss": 0.5272,
"step": 765
},
{
"epoch": 0.46,
"grad_norm": 0.20066829451010718,
"learning_rate": 0.00017579279234258198,
"loss": 0.5013,
"step": 766
},
{
"epoch": 0.46,
"grad_norm": 0.2052255359730049,
"learning_rate": 0.00017573143686445034,
"loss": 0.5383,
"step": 767
},
{
"epoch": 0.46,
"grad_norm": 0.19180058672325329,
"learning_rate": 0.00017567001446522665,
"loss": 0.5108,
"step": 768
},
{
"epoch": 0.46,
"grad_norm": 0.22862029943228582,
"learning_rate": 0.0001756085251991877,
"loss": 0.531,
"step": 769
},
{
"epoch": 0.46,
"grad_norm": 0.2180888066741993,
"learning_rate": 0.00017554696912066924,
"loss": 0.5938,
"step": 770
},
{
"epoch": 0.46,
"grad_norm": 0.19823263656993223,
"learning_rate": 0.00017548534628406616,
"loss": 0.5158,
"step": 771
},
{
"epoch": 0.46,
"grad_norm": 0.18700255356016454,
"learning_rate": 0.00017542365674383227,
"loss": 0.517,
"step": 772
},
{
"epoch": 0.46,
"grad_norm": 0.22948411236460914,
"learning_rate": 0.00017536190055448037,
"loss": 0.5464,
"step": 773
},
{
"epoch": 0.46,
"grad_norm": 0.21370070213829387,
"learning_rate": 0.00017530007777058213,
"loss": 0.5158,
"step": 774
},
{
"epoch": 0.46,
"grad_norm": 0.19174674116457566,
"learning_rate": 0.0001752381884467681,
"loss": 0.5035,
"step": 775
},
{
"epoch": 0.46,
"grad_norm": 0.19069115110218368,
"learning_rate": 0.00017517623263772758,
"loss": 0.5341,
"step": 776
},
{
"epoch": 0.46,
"grad_norm": 0.2401612943495333,
"learning_rate": 0.00017511421039820863,
"loss": 0.578,
"step": 777
},
{
"epoch": 0.46,
"grad_norm": 0.20371659209716964,
"learning_rate": 0.00017505212178301805,
"loss": 0.5103,
"step": 778
},
{
"epoch": 0.46,
"grad_norm": 0.2029847143168681,
"learning_rate": 0.00017498996684702132,
"loss": 0.537,
"step": 779
},
{
"epoch": 0.47,
"grad_norm": 0.1904915669257304,
"learning_rate": 0.00017492774564514235,
"loss": 0.5129,
"step": 780
},
{
"epoch": 0.47,
"grad_norm": 0.20640027525482552,
"learning_rate": 0.00017486545823236385,
"loss": 0.5585,
"step": 781
},
{
"epoch": 0.47,
"grad_norm": 0.23582084058854208,
"learning_rate": 0.00017480310466372686,
"loss": 0.5648,
"step": 782
},
{
"epoch": 0.47,
"grad_norm": 0.2219618762642625,
|
"learning_rate": 0.00017474068499433098, |
|
"loss": 0.5365, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2021980496149104, |
|
"learning_rate": 0.00017467819927933416, |
|
"loss": 0.5232, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.22350007413119136, |
|
"learning_rate": 0.00017461564757395272, |
|
"loss": 0.571, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.1982267515659923, |
|
"learning_rate": 0.00017455302993346134, |
|
"loss": 0.5228, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.1981641437638338, |
|
"learning_rate": 0.00017449034641319288, |
|
"loss": 0.5233, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.21019781303279997, |
|
"learning_rate": 0.00017442759706853855, |
|
"loss": 0.5207, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2060312831458839, |
|
"learning_rate": 0.00017436478195494756, |
|
"loss": 0.5262, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.21829001169718656, |
|
"learning_rate": 0.00017430190112792737, |
|
"loss": 0.563, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.18511782073951058, |
|
"learning_rate": 0.00017423895464304342, |
|
"loss": 0.5017, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.1883852134929889, |
|
"learning_rate": 0.00017417594255591927, |
|
"loss": 0.4598, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.18093236424530848, |
|
"learning_rate": 0.00017411286492223632, |
|
"loss": 0.4834, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.18428120434597678, |
|
"learning_rate": 0.000174049721797734, |
|
"loss": 0.5032, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.20829275110131446, |
|
"learning_rate": 0.00017398651323820958, |
|
"loss": 0.5844, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.20484798677763622, |
|
"learning_rate": 0.00017392323929951812, |
|
"loss": 0.5674, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.24390628538267795, |
|
"learning_rate": 0.0001738599000375725, |
|
"loss": 0.5415, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7159631217821198, |
|
"learning_rate": 0.00017379649550834327, |
|
"loss": 0.5248, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2153929799459398, |
|
"learning_rate": 0.00017373302576785874, |
|
"loss": 0.5362, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.18434502268826083, |
|
"learning_rate": 0.00017366949087220472, |
|
"loss": 0.5179, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.1993458339623336, |
|
"learning_rate": 0.0001736058908775247, |
|
"loss": 0.5378, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.20482117487035287, |
|
"learning_rate": 0.0001735422258400197, |
|
"loss": 0.5066, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.22481951556352617, |
|
"learning_rate": 0.0001734784958159481, |
|
"loss": 0.5504, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.20893222575857784, |
|
"learning_rate": 0.00017341470086162586, |
|
"loss": 0.5558, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.21011978723049574, |
|
"learning_rate": 0.0001733508410334262, |
|
"loss": 0.5164, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.19493427746334713, |
|
"learning_rate": 0.0001732869163877797, |
|
"loss": 0.4928, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.21183476672026114, |
|
"learning_rate": 0.00017322292698117425, |
|
"loss": 0.539, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.19833380077404217, |
|
"learning_rate": 0.00017315887287015492, |
|
"loss": 0.5271, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.1914374518219283, |
|
"learning_rate": 0.000173094754111324, |
|
"loss": 0.5408, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2086814492670768, |
|
"learning_rate": 0.00017303057076134085, |
|
"loss": 0.5289, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.20957903788826676, |
|
"learning_rate": 0.000172966322876922, |
|
"loss": 0.4998, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.20998255172298386, |
|
"learning_rate": 0.00017290201051484085, |
|
"loss": 0.5481, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.2071975609134585, |
|
"learning_rate": 0.00017283763373192798, |
|
"loss": 0.5183, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.21738328519054306, |
|
"learning_rate": 0.00017277319258507073, |
|
"loss": 0.539, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.20518040499899, |
|
"learning_rate": 0.0001727086871312134, |
|
"loss": 0.5109, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.19341379491491822, |
|
"learning_rate": 0.00017264411742735707, |
|
"loss": 0.4882, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.23128359760674316, |
|
"learning_rate": 0.00017257948353055963, |
|
"loss": 0.547, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.1960131633047162, |
|
"learning_rate": 0.0001725147854979357, |
|
"loss": 0.5467, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.21053602560644855, |
|
"learning_rate": 0.00017245002338665656, |
|
"loss": 0.5644, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.19602457133539752, |
|
"learning_rate": 0.00017238519725395007, |
|
"loss": 0.5121, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.1923459024283483, |
|
"learning_rate": 0.00017232030715710076, |
|
"loss": 0.5335, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.19919783133579333, |
|
"learning_rate": 0.00017225535315344955, |
|
"loss": 0.5076, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.23727428892467575, |
|
"learning_rate": 0.00017219033530039397, |
|
"loss": 0.5396, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.18048505778792392, |
|
"learning_rate": 0.00017212525365538792, |
|
"loss": 0.467, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.20071702267002645, |
|
"learning_rate": 0.00017206010827594163, |
|
"loss": 0.5217, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.20315216612025339, |
|
"learning_rate": 0.0001719948992196217, |
|
"loss": 0.4975, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.2142259292765235, |
|
"learning_rate": 0.00017192962654405096, |
|
"loss": 0.5148, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.19450012283752555, |
|
"learning_rate": 0.00017186429030690848, |
|
"loss": 0.5297, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.19853162923467543, |
|
"learning_rate": 0.00017179889056592954, |
|
"loss": 0.547, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.24873174470750906, |
|
"learning_rate": 0.00017173342737890544, |
|
"loss": 0.563, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.18593730182623175, |
|
"learning_rate": 0.00017166790080368357, |
|
"loss": 0.4647, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.19387710879340586, |
|
"learning_rate": 0.00017160231089816748, |
|
"loss": 0.5313, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.20818447206363588, |
|
"learning_rate": 0.00017153665772031643, |
|
"loss": 0.5333, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.17584822143732362, |
|
"learning_rate": 0.0001714709413281458, |
|
"loss": 0.4467, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.19622166504995672, |
|
"learning_rate": 0.00017140516177972676, |
|
"loss": 0.5129, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.18822988249157332, |
|
"learning_rate": 0.00017133931913318625, |
|
"loss": 0.5186, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.2021164051164271, |
|
"learning_rate": 0.00017127341344670696, |
|
"loss": 0.551, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.19354327476685654, |
|
"learning_rate": 0.00017120744477852745, |
|
"loss": 0.5001, |
|
"step": 838 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 3352, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 838, |
|
"total_flos": 829560862015488.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|